In [43]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset

data = pd.read_csv(r"./Tweets.csv")
data.head(5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [44]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["sentiment"], test_size=0.2, stratify=data["sentiment"])

len(X_train), len(X_test)

(21984, 5497)

In [45]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
    return tokens

X_train = X_train.fillna("").apply(preprocess)
X_test = X_test.fillna("").apply(preprocess)
X_train[:5]

24851    [good, work.i, `, managed, turn, studio, ..., ...
5959     [happy, mother, `, day, ~, single, dad, play, ...
10190    [suck, ..., p, ?, ?, ?, ï¿½n, ?, eleg, c, ?, j...
334                                   [miss, daddy, mommy]
23211    [hey, twit, !, watched, poor, holly, britain, ...
Name: text, dtype: object

In [46]:
all_tokens = [token for tokens in X_train for token in tokens]

vocab = {token: idx+2 for idx, (token, _) in enumerate(Counter(all_tokens).most_common())} # BOW
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab_size = len(vocab)

def encode(text):
    return [vocab.get(token, vocab["<UNK>"]) for token in text]

X_train = [encode(text) for text in X_train]
X_test= [encode(text) for text in X_test]

In [47]:
block_size = 128

def pad_sequences(seqs, pad_value=0, block_size=128):
    padded = []
    attn_masks = []

    for s in seqs:
        s = torch.tensor(s)

        if len(s) > block_size:
            s = s[:block_size]

        pad_len = block_size - len(s)
        s_padded = F.pad(s, (0, pad_len), value=pad_value)
        padded.append(s_padded)

        mask = torch.cat([torch.ones(len(s)), torch.zeros(pad_len)])
        attn_masks.append(mask)

    padded_seqs = torch.stack(padded)
    attn_masks = torch.stack(attn_masks).int()

    padded_seqs = padded_seqs.long()
    attn_masks = attn_masks.long()

    return {
        "seqs": padded_seqs,
        "attn_masks": attn_masks
    }

X_train = pad_sequences(X_train)
X_test = pad_sequences(X_test)

X_train["seqs"][0], X_train["attn_masks"][0]

(tensor([  12, 8229,    4, 1747,  398, 1176,    8, 3071, 5525,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [48]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train["seqs"], X_train["attn_masks"], y_train)
test_dataset = TensorDataset(X_test["seqs"], X_test["attn_masks"], y_test)

In [49]:
# Hyperparams
batch_size = 16 # how many independent sequences will we process in parallel?
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
eval_iters = 100
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.2

class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask):
        B, T, C = x.shape

        k = self.key(x)
        q = self.query(x)
        
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(attn_mask.unsqueeze(1) == 0, 1e-9)
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd) # Projection back into residual pathway
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, attn_mask):
        out = torch.cat([h(x, attn_mask) for h in self.heads], dim=-1) # from (B, T, head_size) to (B, T, C)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), # Projection back into residual pathway
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd) # Unit gaussian distribution across layers at init
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x, attn_mask):
        # x + are residual connections, helps training
        x = x + self.sa(self.ln1(x), attn_mask) # Different from original paper, prenorm
        x = x + self.ffwd(self.ln2(x))
        return x

class Transformer(nn.Module):

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # Final layer norm
        self.classifier = nn.Linear(n_embd, len(le.classes_)) # Into 3 classes

    def forward(self, idx, attn_mask, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        for block in self.blocks:
            x = block(x, attn_mask)
        x = self.ln_f(x) # (B,T,C)
        
        cls_token = x.mean(dim=1)             # (B, C)
        logits = self.classifier(cls_token)   # (B, 3)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def predict(self, seq):
        logits, _ = self(seq)
        probs = F.softmax(logits, dim=-1) # (B, C)
        classification = torch.multinomial(probs, num_samples=1) # (B, 1)
        return classification

In [50]:
model = Transformer()
model = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e3, 'K parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
train_iter = iter(train_dataloader)

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()

    train_iter = iter(train_dataloader)
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        try:
            X_seq, X_attn_mask, Y = next(train_iter)
        except StopIteration:
            train_iter = iter(train_dataloader)
            X_seq, X_attn_mask, Y = next(train_iter)

        X_seq, X_attn_mask, Y = X_seq.to(device), X_attn_mask.to(device), Y.to(device)
        logits, loss = model(X_seq, X_attn_mask, Y)
        losses[k] = loss.item()
    out["train"] = losses.mean()

    test_iter = iter(test_dataloader)
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        try:
            X_seq, X_attn_mask, Y = next(test_iter)
        except StopIteration:
            test_iter = iter(test_dataloader)
            X_seq, X_attn_mask, Y = next(test_iter)

        X_seq, X_attn_mask, Y = X_seq.to(device), X_attn_mask.to(device), Y.to(device)
        logits, loss = model(X_seq, X_attn_mask, Y)
        losses[k] = loss.item()
    out["test"] = losses.mean()

    model.train()
    return out

def test(model, dataloader):
    size = len(dataloader.dataset)
    
    model.eval()
    correct = 0

    with torch.no_grad():
        for X_seq, X_attn_mask, Y in dataloader:
            X_seq, X_attn_mask, Y = X_seq.to(device), X_attn_mask.to(device), Y.to(device)
            logits, _ = model(X_seq, X_attn_mask)
            correct += (logits.argmax(1) == Y).type(torch.float).sum().item()

    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%")

1666.499 K parameters


In [51]:
for i in range(max_iters):
    if i % eval_interval == 0 or i == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {i}: train loss {losses['train']:.4f}, test loss {losses['test']:.4f}")

    # get next batch using persistent iterator
    try:
        X_seq, X_attn_mask, Y = next(train_iter)
    except StopIteration:
        train_iter = iter(train_dataloader)
        X_seq, X_attn_mask, Y = next(train_iter)

    X_seq, X_attn_mask, Y = X_seq.to(device), X_attn_mask.to(device), Y.to(device)

    logits, loss = model(X_seq, X_attn_mask, Y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

test(model, test_dataloader)

step 0: train loss 1.1396, test loss 1.1343
step 500: train loss 0.9829, test loss 1.0054
step 1000: train loss 0.8584, test loss 0.9271
step 1500: train loss 0.8426, test loss 0.8219
step 2000: train loss 0.7940, test loss 0.8895
step 2500: train loss 0.7448, test loss 0.7857
step 3000: train loss 0.7375, test loss 0.8767
step 3500: train loss 0.6436, test loss 0.8225
step 4000: train loss 0.6021, test loss 0.7658
step 4500: train loss 0.5713, test loss 0.8270
step 4999: train loss 0.5748, test loss 0.7566
Test Error: 
 Accuracy: 68.4%


In [52]:
text = input("text: ")
text = pad_sequences([encode(text)])

logits, _ = model(text["seqs"].to(device), text["attn_masks"].to(device))
cls = logits.argmax(1).item()
f"Sentiment: {le.inverse_transform([cls])[0]}"

'Sentiment: neutral'