In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Read the tweets CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=r"./Tweets.csv")
data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [17]:
# Stratified 80/20 split of tweet text vs. sentiment label.
# A fixed seed makes the split (and everything derived from it: vocabulary,
# encoded sequences, metrics) reproducible under Restart & Run All.
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["sentiment"],
    test_size=0.2,
    stratify=data["sentiment"],  # keep class proportions equal in both splits
    random_state=SEED,
)

len(X_train), len(X_test)

(21984, 5497)

In [18]:
# Shared preprocessing resources used by `preprocess` below:
# a WordNet lemmatizer and the English stop-word list as a set
# (set membership is what the stop-word filter relies on).
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Tokenise a tweet into lowercased, lemmatized tokens without stop words.

    Parameters
    ----------
    text : object
        Raw tweet text. Anything that is not a non-blank ``str`` (e.g. NaN,
        ``None``, whitespace only) is treated as an empty tweet.

    Returns
    -------
    list of str
        Always a list, so every row of the resulting Series has the same
        type. Blank / non-string input yields ``[]`` (previously ``""``,
        which mixed ``str`` and ``list`` values in one column).
    """
    if not isinstance(text, str) or not text.strip():
        return []
    # Lowercase once, drop stop words, then lemmatize what is left.
    lowered = (token.lower() for token in word_tokenize(text))
    return [lemmatizer.lemmatize(token) for token in lowered if token not in stop_words]

# Replace missing tweets with "" and tokenise both splits with `preprocess`.
# Note: this rebinds X_train / X_test from raw text to preprocessed output.
X_train, X_test = (
    split.fillna("").apply(preprocess) for split in (X_train, X_test)
)
X_train[:5]

14556    [mean, backround, .., setting, >, design, .., ...
18218                                        [_argie, yes]
7360     [roscoe, -, smooth, sailin, `, one, tell, song...
23083                                         [feel, good]
10886    [done, mother, `, day, ?, ', *, people, day, b...
Name: text, dtype: object

In [None]:
# Flatten every training tweet into one token stream (training split only).
all_tokens = [token for tokens in X_train for token in tokens]

# Assign ids by descending frequency, starting at 2 so that ids 0 and 1
# stay free for the padding and unknown-token entries added afterwards.
vocab = {}
for idx, (token, _) in enumerate(Counter(all_tokens).most_common(), start=2):
    vocab[token] = idx
vocab.update({"<PAD>": 0, "<UNK>": 1})
vocab_size = len(vocab)

def encode(text):
    """Convert an iterable of tokens into a list of vocabulary ids.

    Tokens missing from the module-level ``vocab`` are mapped to the id
    stored under ``"<UNK>"``.
    """
    ids = []
    for token in text:
        ids.append(vocab.get(token, vocab["<UNK>"]))
    return ids

# Turn each tokenised tweet into its list of vocabulary ids.
X_train_seq = list(map(encode, X_train))
X_test_seq = list(map(encode, X_test))

X_test_seq[:5]

['mean',
 'backround',
 '..',
 'setting',
 '>',
 'design',
 '..',
 'scroll',
 'bottem',
 'click',
 'change',
 'backround',
 'image',
 '&',
 'ya',
 'go',
 '.',
 '_argie',
 'yes',
 'roscoe',
 '-',
 'smooth',
 'sailin',
 '`',
 'one',
 'tell',
 'song',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'rap',
 'friend',
 'u',
 'failed',
 'feel',
 'good',
 'done',
 'mother',
 '`',
 'day',
 '?',
 "'",
 '*',
 'people',
 'day',
 'behind',
 'u',
 '*',
 "'what",
 '?',
 "'",
 'took',
 'mom',
 'starbucks',
 ',',
 'enjoyed',
 'nvr',
 'wanted',
 'may',
 'come',
 'end',
 'monday',
 'morning',
 '`',
 'exist',
 '`',
 'hot',
 'tea',
 '.',
 'great',
 'day',
 'massage',
 '!',
 'book',
 'appointment',
 'today',
 '617-262-2220',
 'close',
 'entering',
 'pogue',
 '`',
 'book',
 'today',
 "'",
 'world',
 'according',
 'twitter',
 "'",
 ',',
 'yet',
 'seem',
 'far.',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 '_x',
 'three',
 'first',
 '2',
 'week',
 'june',
 'im',
 'soooo',
 'intrigued',
 '!',

In [None]:
# Encode the string sentiment labels as integer class ids; the encoder is
# fitted on the training labels only.
# Note: this cell overwrites y_train / y_test, so re-run the split cell
# before re-running this one.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


def _pad_sequences(sequences, max_len, pad_idx=0):
    """Right-pad (or truncate) each id list to ``max_len``.

    Parameters
    ----------
    sequences : list of list of int
        Encoded tweets of varying length (may contain empty lists).
    max_len : int
        Width of the returned tensor.
    pad_idx : int
        Id written into unused positions.

    Returns
    -------
    torch.Tensor
        ``torch.long`` tensor of shape ``(len(sequences), max_len)``.
    """
    padded = torch.full((len(sequences), max_len), pad_idx, dtype=torch.long)
    for row, seq in enumerate(sequences):
        seq = seq[:max_len]
        if seq:  # empty tweets stay all-padding
            padded[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return padded


# X_train / X_test are pandas Series of token lists at this point and have no
# `.toarray()` (that belongs to sparse matrices such as a TF-IDF output), so
# the tensors are built from the id sequences computed earlier instead.
# The width comes from the training split only; longer test tweets are
# truncated so both tensors share one shape.
max_len = max(max((len(seq) for seq in X_train_seq), default=1), 1)

# Integer ids are stored as torch.long.
# NOTE(review): assumes the downstream model begins with an embedding lookup
# over `vocab_size` ids — confirm; a dense bag-of-words model would need a
# fitted TfidfVectorizer here instead.
X_train_vectors = _pad_sequences(X_train_seq, max_len, vocab["<PAD>"])
X_test_vectors = _pad_sequences(X_test_seq, max_len, vocab["<PAD>"])

y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_vectors, y_train)
test_dataset = TensorDataset(X_test_vectors, y_test)