In [22]:
import json
import re
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tqdm
from torch.nn.utils.rnn import pad_sequence
import jieba


In [23]:
class HumorDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        self.data = []
        self.tokenizer = tokenizer
        self.vocab = {}
        self.vocab_size = 0
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                item = json.loads(line)
                sentence = item['sentence']
                tokens = tokenizer(sentence)
                label = item['label'][0]
                self.data.append((tokens, label))
                for token in tokens:
                    if token not in self.vocab:
                        self.vocab[token] = len(self.vocab)
        self.vocab_size = len(self.vocab)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        tokens, label = self.data[idx]
        token_ids = [self.vocab[token] for token in tokens]
        return token_ids, label
def generate_offsets(batch):
    offsets = [0]
    for tokens in batch:
        offsets.append(offsets[-1] + len(tokens))
    return offsets[:-1]
def collate_fn(batch):
    tokens, labels = zip(*batch)
    token_ids = [torch.tensor(ids) for ids in tokens]
    token_ids = pad_sequence(token_ids, batch_first=True, padding_value=0)  # Padding with 0
    token_ids = token_ids.view(-1)
    offsets = [0] + [len(ids) for ids in tokens]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    labels = torch.tensor(labels)
    return token_ids, offsets, labels



In [24]:
class HumorClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim1, hidden_dim2, output_dim):
        super(HumorClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
            nn.Linear(hidden_dim2, output_dim)
        )

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


In [None]:
# use jieba as the tokenizer

train_dataset2 = HumorDataset('/Users/earendelh/Documents/Sophomore_Second/NLP/Ass1/train.jsonl', jieba._lcut_for_search)
test_dataset2 = HumorDataset('/Users/earendelh/Documents/Sophomore_Second/NLP/Ass1/test.jsonl', jieba._lcut_for_search)
train_loader2 = DataLoader(train_dataset2, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader2 = DataLoader(test_dataset2, batch_size=32, shuffle=False, collate_fn=collate_fn)

vocab_size = train_dataset2.vocab_size
embed_dim = 64
hidden_dim1 = 128
hidden_dim2 = 64
output_dim = 2

model2 = HumorClassifier(vocab_size, embed_dim, hidden_dim1, hidden_dim2, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model2.parameters(), lr=0.02)

num_epochs = 20
for epoch in tqdm.tqdm(range(num_epochs)):
    model2.train()
    for token_ids, offsets, labels in train_loader2:
        optimizer.zero_grad()
        outputs = model2(token_ids, offsets)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

model2.eval()
all_labels2 = []
all_preds2 = []
with torch.no_grad():
    for token_ids, offsets, labels in test_loader2:
        outputs = model2(token_ids, offsets)
        _, predicted = torch.max(outputs.data, 1)
        all_labels2.extend(labels.tolist())
        all_preds2.extend(predicted.tolist())

accuracy_jieba = accuracy_score(all_labels2, all_preds2)
precision_jieba = precision_score(all_labels2, all_preds2, average='weighted')
recall_jieba = recall_score(all_labels2, all_preds2, average='weighted')
f1_jieba = f1_score(all_labels2, all_preds2, average='weighted')

print(f'Jieba Accuracy: {accuracy_jieba:.4f}, Precision: {precision_jieba:.4f}, Recall: {recall_jieba:.4f}, F1: {f1_jieba:.4f}')

  5%|▌         | 1/20 [00:00<00:17,  1.11it/s]

Epoch [1/20], Loss: 0.7217


 10%|█         | 2/20 [00:01<00:12,  1.46it/s]

Epoch [2/20], Loss: 0.3473


 15%|█▌        | 3/20 [00:01<00:10,  1.63it/s]

Epoch [3/20], Loss: 0.6889


 20%|██        | 4/20 [00:02<00:09,  1.71it/s]

Epoch [4/20], Loss: 0.7077


 25%|██▌       | 5/20 [00:03<00:08,  1.76it/s]

Epoch [5/20], Loss: 0.3562


 30%|███       | 6/20 [00:03<00:07,  1.80it/s]

Epoch [6/20], Loss: 0.5179


 35%|███▌      | 7/20 [00:04<00:07,  1.83it/s]

Epoch [7/20], Loss: 0.5290


 40%|████      | 8/20 [00:04<00:06,  1.80it/s]

Epoch [8/20], Loss: 0.3229


 45%|████▌     | 9/20 [00:05<00:06,  1.81it/s]

Epoch [9/20], Loss: 0.3307


 50%|█████     | 10/20 [00:05<00:06,  1.65it/s]

Epoch [10/20], Loss: 0.6724


 55%|█████▌    | 11/20 [00:06<00:05,  1.66it/s]

Epoch [11/20], Loss: 0.4987


 60%|██████    | 12/20 [00:07<00:05,  1.58it/s]

Epoch [12/20], Loss: 0.3323


 65%|██████▌   | 13/20 [00:08<00:04,  1.49it/s]

Epoch [13/20], Loss: 0.4882


 70%|███████   | 14/20 [00:08<00:03,  1.57it/s]

Epoch [14/20], Loss: 0.5452


 75%|███████▌  | 15/20 [00:09<00:03,  1.64it/s]

Epoch [15/20], Loss: 0.6811


 80%|████████  | 16/20 [00:09<00:02,  1.71it/s]

Epoch [16/20], Loss: 0.3560


 85%|████████▌ | 17/20 [00:10<00:01,  1.72it/s]

Epoch [17/20], Loss: 0.4956


 90%|█████████ | 18/20 [00:10<00:01,  1.70it/s]

Epoch [18/20], Loss: 0.5161


 95%|█████████▌| 19/20 [00:11<00:00,  1.70it/s]

Epoch [19/20], Loss: 0.9057


100%|██████████| 20/20 [00:11<00:00,  1.68it/s]

Epoch [20/20], Loss: 0.4932
Jieba Accuracy: 0.7389, Precision: 0.5459, Recall: 0.7389, F1: 0.6279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
