In [7]:
import os
import re
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [8]:
DATA_PATH = "training.1600000.processed.noemoticon.csv"  # зміни на свій шлях

# Для швидкого експерименту можна взяти підмножину даних:
N_SAMPLES = 200_000   # або менше/більше залежно від ресурсів

# Датасет без хедера, 6 колонок, роздільник - кома
df = pd.read_csv(
    DATA_PATH,
    encoding="latin-1",
    header=None,
    names=["sentiment", "id", "date", "query", "user", "text"]
)

if N_SAMPLES is not None and N_SAMPLES < len(df):
    df = df.sample(N_SAMPLES, random_state=SEED).reset_index(drop=True)
    
df["label"] = df["sentiment"].apply(lambda x: 1 if x == 4 else 0)
df = df[["text", "label"]]

df["label"].value_counts(), df.head()

print(df.head())
print("Shape:", df.shape)


                                                text  label
0             @chrishasboobs AHHH I HOPE YOUR OK!!!       0
1  @misstoriblack cool , i have no tweet apps  fo...      0
2  @TiannaChaos i know  just family drama. its la...      0
3  School email won't open  and I have geography ...      0
4                             upper airways problem       0
Shape: (200000, 2)


In [9]:
URL_RE = re.compile(r"http[s]?://\S+")
MENTION_RE = re.compile(r"@\w+")
NONALPHANUM_RE = re.compile(r"[^a-z0-9]+")
MULTISPACE_RE = re.compile(r"\s+")

def clean_text(text: str) -> str:
    text = text.lower()
    text = URL_RE.sub(" URL ", text)
    text = MENTION_RE.sub(" USER ", text)
    text = NONALPHANUM_RE.sub(" ", text)
    text = MULTISPACE_RE.sub(" ", text).strip()
    return text

df["clean"] = df["text"].astype(str).apply(clean_text)
df[["text", "clean"]].head()


Unnamed: 0,text,clean
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh i hope your ok
1,"@misstoriblack cool , i have no tweet apps fo...",cool i have no tweet apps for my razr 2
2,@TiannaChaos i know just family drama. its la...,i know just family drama its lame hey next tim...
3,School email won't open and I have geography ...,school email won t open and i have geography s...
4,upper airways problem,upper airways problem


In [12]:
from collections import Counter

MAX_VOCAB_SIZE = 50000
MIN_FREQ = 2

counter = Counter()
for text in df["clean"]:
    tokens = text.split()
    counter.update(tokens)

most_common = [w for w, c in counter.items() if c >= MIN_FREQ]
most_common = most_common[:MAX_VOCAB_SIZE]

# зарезервуємо індекси
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"

itos = [PAD_TOKEN, UNK_TOKEN] + most_common
stoi = {w: i for i, w in enumerate(itos)}

vocab_size = len(itos)
vocab_size
MAX_LEN = 40  # можна змінити

def encode_text(text: str, max_len: int = MAX_LEN):
    tokens = text.split()
    ids = [stoi.get(tok, stoi[UNK_TOKEN]) for tok in tokens]
    if len(ids) < max_len:
        ids = ids + [stoi[PAD_TOKEN]] * (max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids

df["input_ids"] = df["clean"].apply(lambda t: encode_text(t, MAX_LEN))

df[["clean", "input_ids", "label"]].head()



Unnamed: 0,clean,input_ids,label
0,ahhh i hope your ok,"[2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,cool i have no tweet apps for my razr 2,"[7, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, ...",0
2,i know just family drama its lame hey next tim...,"[3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26...",0
3,school email won t open and i have geography s...,"[39, 40, 41, 42, 43, 44, 3, 8, 45, 46, 47, 48,...",0
4,upper airways problem,"[52, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [13]:
X = np.stack(df["input_ids"].values)
y = df["label"].values

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=SEED, stratify=y
)

X_train.shape, X_val.shape
class TweetDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

BATCH_SIZE = 256

train_ds = TweetDataset(X_train, y_train)
val_ds = TweetDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [14]:
def accuracy_from_logits(logits, y_true):
    preds = torch.argmax(logits, dim=1)
    correct = (preds == y_true).sum().item()
    total = y_true.size(0)
    return correct / total

def train_one_epoch(model, optimizer, criterion, dataloader, device):
    model.train()
    total_loss = 0.0
    total_acc = 0.0
    total_count = 0

    for X_batch, y_batch in tqdm(dataloader, leave=False):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        batch_size = y_batch.size(0)
        total_loss += loss.item() * batch_size
        total_acc += accuracy_from_logits(logits, y_batch) * batch_size
        total_count += batch_size

    return total_loss / total_count, total_acc / total_count

def evaluate(model, criterion, dataloader, device):
    model.eval()
    total_loss = 0.0
    total_acc = 0.0
    total_count = 0

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)

            batch_size = y_batch.size(0)
            total_loss += loss.item() * batch_size
            total_acc += accuracy_from_logits(logits, y_batch) * batch_size
            total_count += batch_size

    return total_loss / total_count, total_acc / total_count


In [None]:
class SentimentLSTM(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim=100,
        hidden_dim=128,
        num_layers=1,
        num_classes=2,
        bidirectional=True,
        dropout=0.3,
        padding_idx=0,
    ):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size, embed_dim, padding_idx=padding_idx
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim

        out_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(out_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len)
        emb = self.embedding(x)  # (batch, seq_len, embed_dim)
        output, (h_n, c_n) = self.lstm(emb)
        # беремо останній hidden state (з урахуванням бідірекшну)
        if self.bidirectional:
            # конкатенація останніх forward і backward
            h_last = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h_last = h_n[-1]
        h_last = self.dropout(h_last)
        logits = self.fc(h_last)
        return logits

model_a = SentimentLSTM(
    vocab_size=vocab_size,
    embed_dim=100,     
    hidden_dim=128,
    num_layers=1,
    bidirectional=True,
    padding_idx=stoi[PAD_TOKEN]
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer_a = torch.optim.Adam(model_a.parameters(), lr=1e-3)

model_a


SentimentLSTM(
  (embedding): Embedding(33235, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [None]:
N_EPOCHS = 5  

history_a = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

for epoch in range(1, N_EPOCHS + 1):
    print(f"Epoch {epoch}/{N_EPOCHS} [Random Embedding + LSTM]")
    train_loss, train_acc = train_one_epoch(model_a, optimizer_a, criterion, train_loader, device)
    val_loss, val_acc = evaluate(model_a, criterion, val_loader, device)

    history_a["train_loss"].append(train_loss)
    history_a["train_acc"].append(train_acc)
    history_a["val_loss"].append(val_loss)
    history_a["val_acc"].append(val_acc)

    print(f"  Train: loss={train_loss:.4f}, acc={train_acc:.4f}")
    print(f"  Val  : loss={val_loss:.4f}, acc={val_acc:.4f}")


Epoch 1/5 [Random Embedding + LSTM]


                                                 

  Train: loss=0.5386, acc=0.7277
  Val  : loss=0.4640, acc=0.7791
Epoch 2/5 [Random Embedding + LSTM]


                                                 

  Train: loss=0.4377, acc=0.7976
  Val  : loss=0.4358, acc=0.7963
Epoch 3/5 [Random Embedding + LSTM]


                                                 

  Train: loss=0.3943, acc=0.8217
  Val  : loss=0.4293, acc=0.8024
Epoch 4/5 [Random Embedding + LSTM]


                                                 

  Train: loss=0.3631, acc=0.8387
  Val  : loss=0.4270, acc=0.8040
Epoch 5/5 [Random Embedding + LSTM]


                                                 

  Train: loss=0.3292, acc=0.8579
  Val  : loss=0.4349, acc=0.8054


In [None]:
GLOVE_PATH = "glove.6B.100d.txt"  
EMBED_DIM = 100                    

embeddings_index = {}

with open(GLOVE_PATH, encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading GloVe"):
        values = line.rstrip().split(" ")
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype="float32")
        except ValueError:
            continue
        embeddings_index[word] = vector

len(embeddings_index)


Loading GloVe: 400000it [00:07, 53913.88it/s]


400000

In [18]:
embedding_matrix = np.random.normal(
    scale=0.6, size=(vocab_size, EMBED_DIM)
).astype("float32")

# зробимо паддінг-рядок нульовим
embedding_matrix[stoi[PAD_TOKEN]] = np.zeros(EMBED_DIM, dtype="float32")

oov_count = 0
for word, idx in stoi.items():
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]
    else:
        oov_count += 1

print("OOV words:", oov_count, "out of", vocab_size)


OOV words: 6151 out of 33235


In [None]:
model_b = SentimentLSTM(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=128,
    num_layers=1,
    bidirectional=True,
    padding_idx=stoi[PAD_TOKEN]
).to(device)

# ініціалізація embedding шару GloVe вагами
with torch.no_grad():
    model_b.embedding.weight.data.copy_(
        torch.from_numpy(embedding_matrix)
    )


FREEZE_EMBEDDINGS_FIRST = True

if FREEZE_EMBEDDINGS_FIRST:
    for param in model_b.embedding.parameters():
        param.requires_grad = False

criterion = nn.CrossEntropyLoss()
optimizer_b = torch.optim.Adam(filter(lambda p: p.requires_grad, model_b.parameters()), lr=1e-3)

model_b


SentimentLSTM(
  (embedding): Embedding(33235, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [None]:
N_EPOCHS_B = 5  

history_b = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

for epoch in range(1, N_EPOCHS_B + 1):
    print(f"Epoch {epoch}/{N_EPOCHS_B} [GloVe Embedding + LSTM]")
    train_loss, train_acc = train_one_epoch(model_b, optimizer_b, criterion, train_loader, device)
    val_loss, val_acc = evaluate(model_b, criterion, val_loader, device)

    history_b["train_loss"].append(train_loss)
    history_b["train_acc"].append(train_acc)
    history_b["val_loss"].append(val_loss)
    history_b["val_acc"].append(val_acc)

    print(f"  Train: loss={train_loss:.4f}, acc={train_acc:.4f}")
    print(f"  Val  : loss={val_loss:.4f}, acc={val_acc:.4f}")




Epoch 1/5 [GloVe Embedding + LSTM]


                                                 

  Train: loss=0.5475, acc=0.7182
  Val  : loss=0.4963, acc=0.7589
Epoch 2/5 [GloVe Embedding + LSTM]


                                                 

  Train: loss=0.4865, acc=0.7647
  Val  : loss=0.4655, acc=0.7759
Epoch 3/5 [GloVe Embedding + LSTM]


                                                 

  Train: loss=0.4621, acc=0.7818
  Val  : loss=0.4530, acc=0.7825
Epoch 4/5 [GloVe Embedding + LSTM]


                                                 

  Train: loss=0.4447, acc=0.7914
  Val  : loss=0.4414, acc=0.7912
Epoch 5/5 [GloVe Embedding + LSTM]


                                                 

  Train: loss=0.4312, acc=0.7998
  Val  : loss=0.4484, acc=0.7892
