Install

In [45]:
%pip install transformers
%pip install torch
%pip install underthesea


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Import

In [46]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from underthesea import word_tokenize
from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from PIL import Image
import requests

Constants

In [47]:
# --- Hyper-parameters shared by the cells below -----------------------------
# EPOCHS sizes the LR schedule, MAX_LEN is the tokenizer padding/truncation
# length, BATCH_SIZE feeds every DataLoader and LR is the image-CNN step size.
EPOCHS, MAX_LEN, BATCH_SIZE = 2, 256, 4
LR = 2e-4

# Use the GPU whenever PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Disabled stop-word loading, kept for reference:
# with open('drive/MyDrive/Colab_Notebooks/Foody_data/Stopwords.txt', 'r', encoding="utf-8") as f:
#     stop_set = set(m.strip() for m in f.readlines())
#     stopwords = list(frozenset(stop_set))

# Release cached CUDA blocks left over from earlier runs of this kernel.
torch.cuda.empty_cache()
print(device)
# df = pd.read_csv('/kaggle/input/int3405-sentiment-analysis-problem/full_train.csv')

cuda


Dataset and Models

In [48]:
class MyDataset(torch.utils.data.Dataset):
    """Review rows as model inputs: tokenised comment, first linked image, optional label.

    Each item is a dict with the keys ``comment``, ``input_ids``,
    ``attention_masks`` and ``images``; ``targets`` is added when the frame
    has a ``Rating`` column.  Every tensor is moved to the module-level
    ``device`` before it is returned.
    """

    # (width, height) every downloaded image is resized to.  Basic_CNN_Module's
    # first linear layer takes 3136 = 64 * 7 * 7 inputs, i.e. 28x28 images.
    IMAGE_SIZE = (28, 28)
    # Seconds to wait on the image host before raising instead of hanging.
    IMAGE_TIMEOUT = 10

    def __init__(self, df):
        """Keep a reference to ``df`` and load the PhoBERT tokenizer."""
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
#         self.stopwords = stopwords

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def preprocess(self, s):
        """Lower-case ``s``, keep only alphanumerics and spaces, then word-segment it.

        NOTE(review): newlines and tabs are dropped rather than replaced by a
        space, so words separated only by a line break get glued together.
        Left as is because changing it alters what the saved model was fed.
        """
        s = str(s)
        s = s.lower()
        s = ''.join(e for e in s if e.isalnum() or e == ' ')
        return word_tokenize(s, format='text')
        # return ' '.join(e for e in s.split(' ') if e not in self.stopwords)

    @staticmethod
    def _first_image_url(raw_urls):
        """Return the first URL from a cell shaped like ``"['u1', 'u2']"``.

        The two leading and two trailing characters (``['`` and ``']``) are
        stripped and the remainder is split on ``', '``.
        """
        return raw_urls[2:-2].split("', '")[0]

    def get_image(self, url):
        """Download ``url`` and return it as a 3-channel float tensor on ``device``.

        Raises:
            requests.HTTPError: when the server answers with an error status.
            requests.Timeout: when the host does not answer in IMAGE_TIMEOUT seconds.
        """
        with requests.get(url, stream=True, timeout=self.IMAGE_TIMEOUT) as response:
            response.raise_for_status()
            # Force RGB: grayscale, palette or RGBA files would otherwise give
            # 1- or 4-channel tensors that cannot be batched or fed to Conv2d(3, ...).
            image = Image.open(response.raw).convert('RGB').resize(self.IMAGE_SIZE)
        return ToTensor()(image).to(device)

    def __getitem__(self, index):
        """Build the model-input dict for the row at positional ``index``."""
        row = self.df.iloc[index]
        comment = self.preprocess(row['Comment'])

        encoding = self.tokenizer.encode_plus(
            comment,
            truncation=True,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        sample = {
            'comment': comment,
            'input_ids': torch.flatten(encoding['input_ids']).to(device=device),
            'attention_masks': torch.flatten(encoding['attention_mask']).to(device=device),
        }
        # Decide by column name, not by column count, whether labels exist.
        if 'Rating' in self.df.columns:
            sample['targets'] = torch.tensor(row['Rating'], dtype=torch.long).to(device)
        sample['images'] = self.get_image(self._first_image_url(row['image_urls']))
        return sample

class MyModel(torch.nn.Module):
    """PhoBERT encoder followed by dropout and a two-way linear head."""

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, mean=0.0, std=0.02)
        # NOTE(review): with no std given the bias is drawn from N(0, 1);
        # confirm that is intended rather than zeros or std=0.02.
        nn.init.normal_(self.fc.bias, mean=0)

    def forward(self, input):
        """Return raw class logits for a batch dict.

        ``input`` must provide ``input_ids`` and ``attention_masks``.  The
        encoder is called with ``return_dict=False`` and the second element
        of its 2-tuple is what gets classified.
        """
        _token_states, pooled = self.bert(
            input_ids=input['input_ids'],
            attention_mask=input['attention_masks'],
            return_dict=False,
        )
        return self.fc(self.drop(pooled))

class Basic_CNN_Module(nn.Module):
    """Two conv/pool stages plus a two-layer classifier emitting 2 logits.

    With 3x3 kernels, stride 1 and padding 1 the convolutions keep the
    spatial size, and each 2x2 max-pool halves it, so a 28x28 input ends up
    as 64 maps of 7x7 = 3136 features, which is what ``fc1`` expects.
    """

    def __init__(self) -> None:
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation and state-dict ordering stay unchanged.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(in_features=3136, out_features=128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=2)

    def forward(self, input):
        """Return logits for the image batch stored under ``input['images']``."""
        images = input['images']

        # Stage 1: convolution -> 2x2 max-pool -> ReLU.
        features = self.relu1(F.max_pool2d(self.conv1(images), 2))

        # Stage 2: same pattern, followed by dropout.
        features = self.relu2(F.max_pool2d(self.conv2(features), 2))
        features = self.dropout1(features)

        # Classifier head on the flattened feature maps.
        hidden = self.relu3(self.fc1(torch.flatten(features, 1)))
        return self.fc2(self.dropout2(hidden))

Functions

In [49]:
def train(model, img_model, loss_func, img_loss_func, optimizer, img_optimizer, data_loader, lr_scheduler, epoch):
    """Run one training epoch for the text model and the image model.

    Both models are optimised independently on their own loss; accuracy is
    measured on the ensemble ``argmax(2 * text_logits + image_logits)``.

    Args:
        model: text classifier called with the batch dict.
        img_model: image classifier called with the same batch dict.
        loss_func / img_loss_func: criteria for the text / image logits.
        optimizer / img_optimizer: optimisers for ``model`` / ``img_model``.
        data_loader: iterable of batch dicts containing ``targets``; must
            expose ``.dataset`` so the sample count can be read.
        lr_scheduler: stepped once per batch, after ``optimizer.step()``.
        epoch: only used in the log line.

    Returns:
        ``(accuracy, mean_loss)`` as Python floats; both are ``nan`` when the
        loader yields no data (previously this case raised).
    """
    print(f'Epoch {epoch}: ')
    model.train()
    img_model.train()
    losses = []
    correct = 0

    for batch in data_loader:
        targets = batch['targets']
        optimizer.zero_grad()
        img_optimizer.zero_grad()
        outputs = model(batch)
        img_outputs = img_model(batch)

        loss = loss_func(outputs, targets)
        img_loss = img_loss_func(img_outputs, targets)
        # Ensemble vote: the text logits count twice as much as the image logits.
        _, pred = torch.max(2*outputs + img_outputs, dim=1)

        # Accumulate a plain int so the counter has one type on every path.
        correct += int(torch.sum(pred == targets).item())
        losses.append(loss.item() + img_loss.item())

        loss.backward()
        img_loss.backward()
        img_optimizer.step()
        # Gradient clipping is applied to the text model only.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()

    n_samples = len(data_loader.dataset)
    accuracy = correct / n_samples if n_samples else float('nan')
    mean_loss = float(np.mean(losses)) if losses else float('nan')
    print(f'Train Accuracy: {accuracy} Loss: {mean_loss}')
    return accuracy, mean_loss

def test(model, img_model, data_loader, loss_func, img_loss_func):
    model.eval()
    img_model.eval()
    losses = []
    correct = 0

    with torch.no_grad():
        for input in data_loader:
            outputs = model(input)
            img_outputs = img_model(input) #img

            # pred = torch.round((2*torch.max(outputs, dim=1)[1] + torch.max(img_outputs, dim=1)[1])/3)
            _, pred = torch.max(2*outputs + img_outputs, dim=1)

            loss = loss_func(outputs, input['targets'])
            img_loss = img_loss_func(img_outputs, input['targets']) #img
            correct += torch.sum(pred == input['targets'])
            losses.append(loss.item() + img_loss.item())
    
        print(f'Test Accuracy: {correct.double()/len(data_loader.dataset)} Loss: {np.mean(losses)}')

def get_dataloader(csv_path='Data/full_train.csv', train_size=0.7, random_state=None):
    """Load the labelled CSV and return ``(train_dataloader, test_dataloader)``.

    Rows containing any missing value are dropped, the frame is shuffled and
    then split; both loaders use the module-level ``BATCH_SIZE`` and shuffle
    their data.

    Args:
        csv_path: labelled CSV to read.
        train_size: fraction (or count) of rows given to the training split,
            forwarded to ``train_test_split``.
        random_state: seed forwarded to both the shuffle and the split.  The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int for a reproducible split.
    """
    df = pd.read_csv(csv_path)
    df = df.dropna()
    # shuffle the DataFrame rows
    df = df.sample(frac=1, random_state=random_state)
    train_df, test_df = train_test_split(df, train_size=train_size, random_state=random_state)
    train_dataloader = DataLoader(MyDataset(train_df), batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = DataLoader(MyDataset(test_df), batch_size=BATCH_SIZE, shuffle=True)
    return train_dataloader, test_dataloader

def predict(model: torch.nn.Module, img_model: torch.nn.Module,
            test_path='Data/test.csv', output_path='Results/submission.csv'):
    """Predict a rating for every row of the test CSV and write a submission file.

    Predictions are ``argmax(2 * text_logits + image_logits)``.  The output
    CSV holds the ``RevId`` and ``Rating`` columns.

    Args:
        model: text classifier called with the batch dict.
        img_model: image classifier called with the same batch dict.
        test_path: CSV with the rows to score.
        output_path: where the submission CSV is written; its directory is
            created when missing.

    Side effects:
        Switches both models to eval mode and leaves them there.
    """
    import os

    # Inference must not run with dropout active; do not rely on whatever
    # mode the caller left the models in.
    model.eval()
    img_model.eval()

    df = pd.read_csv(test_path)
    dataloader = DataLoader(MyDataset(df), batch_size=BATCH_SIZE, shuffle=False)
    n = df.shape[0]

    # Create the output directory up front so a missing folder fails fast
    # instead of after the whole inference pass.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    chunks = []
    done = 0
    with torch.no_grad():
        for batch in dataloader:
#             _, pred = torch.max(model(input), 1)
            # pred = torch.round((2*torch.max(model(input), dim=1)[1] + torch.max(img_model(input), dim=1)[1])/3)
            _, pred = torch.max(2*model(batch) + img_model(batch), dim=1)
            chunks.append(pred)
            done += pred.shape[0]
            if (done%(64*5) == 0 or done == n):
                print(f'Running: {100 * done / n}%')

    # Join once at the end instead of re-allocating the result every batch.
    predicted = torch.concat(chunks, dim=0) if chunks else torch.empty(0, dtype=torch.long)
    assert predicted.shape[0] == n
    # Ratings stay floats (e.g. 1.0) as the earlier float accumulator produced;
    # drop .float() if the consumer wants integer labels.
    df['Rating'] = predicted.float().tolist()
    submission = pd.concat([df['RevId'], df['Rating']], axis=1)
    submission.to_csv(output_path, index=False)

Setup

In [50]:
# Restore the previously saved text model and image CNN onto the active device.
# NOTE(review): these files hold whole pickled modules, so loading them runs
# pickle code (only load trusted files) and needs MyModel / Basic_CNN_Module
# to be defined first.  Newer PyTorch releases may default torch.load to
# weights_only=True, which rejects full modules; confirm against the installed version.
model = torch.load('Model/model.pt', map_location=torch.device(device))
cnn = torch.load('Model/cnn.pt', map_location=torch.device(device))
# To train from scratch instead, build fresh models:
# model = MyModel().to(device)
# cnn = Basic_CNN_Module().to(device)

# One cross-entropy criterion per model.
loss_func = torch.nn.CrossEntropyLoss()
img_loss_func = torch.nn.CrossEntropyLoss()

# The text model is fine-tuned with a smaller step (2e-5) than the CNN (LR).
optimizer = Adam(model.parameters(), lr=2e-5)
img_optimizer = Adam(cnn.parameters(), lr=LR)

# NOTE(review): no other cell in this notebook reads `tokenizer`;
# MyDataset loads its own tokenizer.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
train_dataloader, test_dataloader = get_dataloader()

# Linear decay of the text optimiser's learning rate over every training step
# (batches per epoch * EPOCHS), with no warm-up.
lr_scheduler = get_linear_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=0, 
                num_training_steps=len(train_dataloader)*EPOCHS
            )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Train and save the models

In [51]:
# for epoch in range(EPOCHS):
#     train(model=model, img_model=cnn, loss_func=loss_func, img_loss_func=img_loss_func, optimizer=optimizer, img_optimizer=img_optimizer, data_loader=train_dataloader, epoch=epoch, lr_scheduler=lr_scheduler)
#     test(model=model, img_model=cnn, data_loader=test_dataloader, loss_func=loss_func, img_loss_func=img_loss_func)
# torch.save(model, '/kaggle/working/model.pt')
# torch.save(cnn, '/kaggle/working/cnn.pt')

Run predict

In [52]:
# Score Data/test.csv with the loaded text model and CNN, then write Results/submission.csv.
predict(model, cnn)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running: 6.270821085635901%
Running: 12.541642171271802%
Running: 18.8124632569077%
Running: 25.083284342543603%
Running: 31.3541054281795%
Running: 37.6249265138154%
Running: 43.895747599451305%
Running: 50.166568685087206%
Running: 56.4373897707231%
Running: 62.708210856359%
Running: 68.9790319419949%
Running: 75.2498530276308%
Running: 81.52067411326671%
Running: 87.79149519890261%
Running: 94.06231628453851%
Running: 100.0%


In [53]:
# Scratch check of the ensemble arithmetic used in train/test/predict:
# 2*a + b is element-wise, so each row keeps its own weighted logits.
a = torch.tensor([[1, 1], [2, 2]])
b = torch.tensor([[9, 1], [2, 9]])
print(2*a+b)

tensor([[11,  3],
        [ 6, 13]])
