In [1]:
import os
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


import nltk
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from collections import Counter


In [2]:
script_dir = os.getcwd()
path_df = os.path.join(script_dir, '../dataset/train_processed.csv')
df = pd.read_csv(path_df)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31014 entries, 0 to 31013
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    31010 non-null  object
 1   label   31014 non-null  object
dtypes: object(2)
memory usage: 484.7+ KB


In [4]:
df = df.dropna(subset=['text'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31010 entries, 0 to 31013
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    31010 non-null  object
 1   label   31010 non-null  object
dtypes: object(2)
memory usage: 726.8+ KB


## Data pre-processing

In [6]:
# Create vocab
nltk.data.path.append('../nltk_data')

all_tokens = [word for text in df.text for word in word_tokenize(text)]
vocab = {word : idx + 2 for idx, (word, _) in enumerate(Counter(all_tokens).most_common())}

vocab['<PAD>'] = 0 # khi padding
vocab['<UNK>'] = 1 # tu khong co trong vocab

In [7]:
# Encode text
def encode_text(text):
    return[vocab.get(word, vocab['<UNK>']) for word in word_tokenize(text)]

df['encoded'] = df.text.apply(encode_text)

In [8]:
# Encode label
label_map ={
    'positive' : 2,
    'neutral' : 1,
    'negative' : 0
}
df['label'] = df.label.map(label_map)

In [9]:
df.head(20)

Unnamed: 0,text,label,encoded
0,i would have respond if i be go,1,"[3, 67, 12, 1230, 65, 3, 2, 17]"
1,soo sad i will miss you here in san diego,0,"[148, 108, 3, 30, 55, 7, 84, 14, 1282, 1873]"
2,my bos be bully me,0,"[10, 1231, 2, 4108, 22]"
3,what interview leave me alone,0,"[47, 883, 128, 22, 450]"
4,son of why could not they put them on the rele...,0,"[662, 18, 111, 100, 9, 64, 255, 114, 21, 5, 90..."
5,some shameless plug for the best ranger forum ...,1,"[68, 9155, 1972, 15, 5, 150, 3564, 1613, 21, 1..."
6,numam feeding for the baby be fun when he be a...,2,"[357, 9156, 15, 5, 243, 2, 103, 77, 80, 2, 32,..."
7,soo high,1,"[148, 458]"
8,both of you,1,"[378, 18, 7]"
9,journey wow you just become cool hehe be that ...,2,"[1874, 241, 7, 26, 599, 167, 474, 2, 19, 765]"


## Create Dataset & DataLoader

In [10]:
class Text_Dataset(Dataset):
    def __init__(self, df):
        # Luu list cac cau encoded
        self.texts = df.encoded.tolist()
        # Luu list cac nhan tuong ung
        self.labels = df.label.tolist()

    def __len__(self):
        # Tra ve tong so sample trong dataset, can cho DataLoader
        return len(self.texts)
    
    def __getitem__(self, idx):
        # Tra ve 1 sample
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx])

In [11]:
def collate_fn(batch):
    # batch la 1 list gom cac sample tu Dataset
    # Tach batch thanh 2 tupe: texts & labels
    texts, labels = zip(*batch)

    # pad_sequence tu dong them <PAD> vao cac cau ngan hon trong batch = len(cau dai nhat)
    # batch_first = True -> output có shape [batch_size, seq_len]
    # padding_value = vocab['<PAD>'] -> index cua token<PAD> = 0
    texts_padded = pad_sequence(texts,batch_first=True, padding_value=vocab['<PAD>'])

    # chuyen labels sang tensor [batch_size]
    return texts_padded, torch.tensor(labels)

In [12]:
# Splitting Train  & val Data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


# Create Dataloader
train_loader = DataLoader(Text_Dataset(train_df), 
                          batch_size=32, # Moi batch 32 cau
                          shuffle=True, # Xoa tron du lieu sau moi epoch
                          collate_fn=collate_fn) # Ham padding

val_loader = DataLoader(Text_Dataset(val_df), 
                        batch_size= 32, 
                        shuffle=True,
                        collate_fn=collate_fn)


In [13]:
Text_Dataset(train_df).__getitem__(0)

(tensor([  3,  13,   9, 227,  99, 285,  26,  34, 162, 174,  15,  32,   5, 631,
           3,  59,  59, 670,   8, 602]),
 tensor(2))

In [14]:
for batch_texts, batch_labels in train_loader:
    print(batch_texts.shape)
    print(batch_labels.shape)
    print('\n')

torch.Size([32, 24])
torch.Size([32])


torch.Size([32, 25])
torch.Size([32])


torch.Size([32, 28])
torch.Size([32])


torch.Size([32, 27])
torch.Size([32])


torch.Size([32, 28])
torch.Size([32])


torch.Size([32, 26])
torch.Size([32])


torch.Size([32, 28])
torch.Size([32])


torch.Size([32, 20])
torch.Size([32])


torch.Size([32, 30])
torch.Size([32])


torch.Size([32, 29])
torch.Size([32])


torch.Size([32, 28])
torch.Size([32])


torch.Size([32, 27])
torch.Size([32])


torch.Size([32, 27])
torch.Size([32])


torch.Size([32, 27])
torch.Size([32])


torch.Size([32, 31])
torch.Size([32])


torch.Size([32, 31])
torch.Size([32])


torch.Size([32, 30])
torch.Size([32])


torch.Size([32, 29])
torch.Size([32])


torch.Size([32, 26])
torch.Size([32])


torch.Size([32, 27])
torch.Size([32])


torch.Size([32, 25])
torch.Size([32])


torch.Size([32, 31])
torch.Size([32])


torch.Size([32, 30])
torch.Size([32])


torch.Size([32, 29])
torch.Size([32])


torch.Size([32, 28])
torch.Size([32])




## Build model CNN

In [15]:
class CNN_Text(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(CNN_Text,self).__init__()

        # Embedding layer 
        self.embedding = nn.Embedding( num_embeddings= vocab_size, 
                                      embedding_dim=embed_dim, 
                                      padding_idx=vocab['<PAD>'])  # padding_idx = index của <PAD>, embeddings cho PAD se khong duoc hoc
        
        # Convolution layer voi cac kernel size khac nhau (3, 4, 5)
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=100, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=embed_dim, out_channels=100, kernel_size=4)
        self.conv3 = nn.Conv1d(in_channels=embed_dim, out_channels=100, kernel_size=5)

        # Dropout de tranh overfiting
        self.dropout = nn.Dropout(0.5)

        # Fully connected layer: 3 conv * 100 filters
        self.fc = nn.Linear(3 * 100, num_classes) 

    def forward(self,x):
        # x : [batch_size, seq_len]

        x = self.embedding(x) # [batch_size, seq_len, embed_dim]
        x = x.permute(0, 2, 1) # [batch_size, embed_dim, seq_len]

        # Apply convolution + ReLU
        x1 = F.relu(self.conv1(x)) # [batch_size, 100, seq_len-3+1]
        x2 = F.relu(self.conv2(x)) # [batch_size, 100, seq_len-4+1]
        x3 = F.relu(self.conv3(x)) # [batch_size, 100, seq_len-5+1]

        # Max pooling tren toan bo seq_len
        x1 = F.max_pool1d(x1, kernel_size= x1.shape[2]).squeeze(2) # [batch_size, 100]
        x2 = F.max_pool1d(x2, kernel_size= x2.shape[2]).squeeze(2) # [batch_size, 100]
        x3 = F.max_pool1d(x3, kernel_size= x3.shape[2]).squeeze(2) # [batch_size, 100]

        # Concatenate cac feature map
        x = torch.cat((x1, x2, x3), dim=1) # [batch_size, 300]
        x = self.dropout(x)

        # Full connected
        return self.fc(x)
        

## Trainning model

In [16]:
# select GPU if not select CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Khoi tao model
model = CNN_Text(vocab_size=len(vocab), # kich thuoc tu vung
                 embed_dim=300, # so chieu cua vector embedding cho moi tu
                 num_classes=3).to(device=device) # so label

# Khai bao optimizer & loss function
optimizer = optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss() 

for epoch in range(5):
    model.train() # dat model ow che do train
    total_loss = 0 # luu tong loss trong epoch

    # lap qua tung batch du lieu tu DataLoader
    for texts, labels in train_loader:

        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad() # reset gradient cu

        outputs = model(texts) # Forward: tinh dau ra du doan cua model
        loss = criterion(outputs, labels) # so sanh du doan voi label that -> tinh loss

        loss.backward() # lan truyen nguoc
        optimizer.step() # cap nhat trong so dua tren gradient

        total_loss += loss.item() # cong don loss

    print(f'Epoch {epoch + 1}, Loss : {total_loss / len(train_loader): .4f}')


Epoch 1, Loss :  0.9381
Epoch 2, Loss :  0.7596
Epoch 3, Loss :  0.6468
Epoch 4, Loss :  0.5406
Epoch 5, Loss :  0.4372


## Predict & Evaluation

In [17]:
model.eval() # dat model o che do danh gia 

preds, trues = [], [] # tao 2 list de luu ket qua va label that

# Tat tinh toan gradient
with torch.no_grad():
    for texts, labels in val_loader:
        texts, labels = texts.to(device), labels.to(device)

        # Du doan dau ra cua model
        outputs = model(texts)

        # Lay class co xac suat cao nhat tu moi dong cua output, argmax theo chieu 1
        preds.extend(outputs.argmax(1).cpu().numpy())
        trues.extend(labels.cpu().numpy())

print(accuracy_score(trues,preds))
print(classification_report(trues,preds, target_names=['negative', 'neutral','positive']))
        

0.6660754595291841
              precision    recall  f1-score   support

    negative       0.73      0.54      0.62      1774
     neutral       0.60      0.73      0.66      2510
    positive       0.73      0.70      0.72      1918

    accuracy                           0.67      6202
   macro avg       0.69      0.66      0.66      6202
weighted avg       0.68      0.67      0.67      6202

