# TASK
## Deadline: 31 martie ora 23:59.

Formular pentru trimiterea temei: https://forms.gle/Bznaciv2MTy4kVL47

Folosind intreg datasetul de mai sus (IMDb reviews) implementati urmatoarele cerinte:
1. Impartiti setul de date in 80% train, 10% validare si 10% test
2. Tokenizati textele si determinati vocabularul (in acest task vom lucra cu reprezentari la nivel de cuvant, NU la nivel de caracter); intrucat vocabularul poate fi foarte mare, incercati sa aplicati una dintre tehnicile mentionate in laborator (10K-20K de cuvinte ar fi o dimensiunea rezonabila a vocabularului)
3. Transformati textele in vectori de aceeasi dimensiune folosind indexul vocabularului (alegeti o dimensiune maxima de circa 500-1000 de tokens)
4. Implementati urmatoarea arhitectura:
    * un Embedding layer pentru vocabularul determinat, ce contine vectori de dimensiune 100
    * un layer dropout cu probabilitate 0.4
    * un layer convolutional 1D cu 100 canale de input si 128 de canale de output, dimensiunea kernelului de 3 si padding 1; asupra rezultatului aplicati un layer de [BatchNormalization](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html) cu 128 features; aplicati apoi functia de activare ReLU, iar in cele din urma un strat de max-pooling 1D cu kernel size 2.
    * un layer convolutional 1D cu 128 canale de input si 128 de canale de output, dimensiunea kernelului de 5 si padding 2; asupra rezultatului aplicati un layer de BatchNormalization cu 128 features; aplicati apoi functia de activare ReLU, iar in cele din urma un strat de max-pooling 1D cu kernel size 2.
    * un layer convolutional 1D cu 128 canale de input si 128 de canale de output, dimensiunea kernelului de 5 si padding 2; asupra rezultatului aplicati un layer de BatchNormalization cu 128 features; aplicati apoi functia de activare ReLU, iar in cele din urma un strat de max-pooling 1D cu kernel size 2.
    * asupra rezultatului ultimului layer, aplicati average-pooling 1D obtinand pentru fiecare canal media tuturor valorilor din vectorul sau corespunzator
    * un layer feed-forward (linear) cu dimensiunea inputului 128, si 2 noduri pentru output (pentru clasificare in 0/1)
5. Antrenati arhitectura folosind cross-entropy ca functie de loss si un optimizer la alegere. La finalul fiecarei epoci evaluati modelul pe datele de validare si salvati weighturile celui mai bun model astfel determinat
6. Evaluati cel mai bun model obtinut pe datele de test.


In [2]:
import string
from sys import platform, path
if platform == "linux" or platform == "linux2":
    path.append('/home/dariusbuhai/python/lib/python3.9/site-packages')
from urllib.request import urlretrieve
urlretrieve('https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv', 'IMDB_Dataset.csv')

('IMDB_Dataset.csv', <http.client.HTTPMessage at 0x7f6973ed9190>)

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

data = pd.read_csv('IMDB_Dataset.csv')
train_df, test_df = train_test_split(data, test_size=0.1, random_state=1)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=1)

In [6]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/dariusbuhai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def transform_to_tokens(data):
    reviews = []
    for review in data:
        review_tokenized = word_tokenize(review.lower())
        reviews.append(review_tokenized)
    return reviews

train_reviews = transform_to_tokens(train_df.review)
test_reviews = transform_to_tokens(test_df.review)
val_reviews = transform_to_tokens(val_df.review)

In [8]:
import spacy
import string
nlp = spacy.load("en_core_web_md")

def no_punctuation_or_stopwords(data):
    all_words = []
    for word in data:
        if word not in nlp.Defaults.stop_words and word != ' ' and word not in string.punctuation:
            all_words.append(word)
    return all_words


In [9]:
import operator
from collections import Counter

def word_freq(data, min_aparitions):

    all_words = [words.lower() for sentences in data for words in sentences]
    sorted_vocab = sorted(dict(Counter(all_words)).items(), key=operator.itemgetter(1))
    final_vocab = [k for k,v in sorted_vocab if v > min_aparitions]

    return final_vocab

In [36]:
def create_vocab(reviews):
    vocab = word_freq(reviews, min_aparitions = 18)
    vocab = no_punctuation_or_stopwords(vocab)
    return vocab

In [37]:
vocab = create_vocab(train_reviews)

In [38]:
print(len(vocab))

18409


In [39]:
import torch
import numpy as np

def vectorize_sentences(data, char_indices, one_hot = False):
    vectorized = []
    for sentences in data:

        # transformam fiecare review in reprezentarea lui sub forma de indici ale caracterelor continute
        sentences_of_indices = [char_indices[w] if w in char_indices.keys() else char_indices['UNK'] for w in sentences]

        # pentru fiecare indice putem face reprezentarea one-hot corespunzatoare
        # sau putem sa nu facem asta si sa adaugam un embedding layer in model care face această transformare
        if one_hot:
            sentences_of_indices = np.eye(len(char_indices))[sentences_of_indices]

        vectorized.append(sentences_of_indices)

    return vectorized

def pad(samples, max_length):

    return torch.tensor([
        sample[:max_length] + [1] * max(0, max_length - len(sample))
        for sample in samples
    ])

In [59]:
def create_vectorize(vocab, reviews):
    word_indices = dict((c, i + 2) for i, c in enumerate(vocab))
    indices_word = dict((i + 2, c) for i, c in enumerate(vocab))
    indices_word[0] = 'UNK'
    word_indices['UNK'] = 0
    indices_word[1] = 'PAD'
    word_indices['PAD'] = 1

    reviews_vectorized = vectorize_sentences(reviews, word_indices)
    reviews_vectorized = pad(reviews_vectorized, max_length = 800)
    return reviews_vectorized

In [60]:
train_reviews_vectorized = create_vectorize(vocab, train_reviews)
test_reviews_vectorized = create_vectorize(vocab, test_reviews)
val_reviews_vectorized = create_vectorize(vocab, val_reviews)

In [61]:
print(train_reviews_vectorized.shape)

torch.Size([40500, 800])


In [62]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, samples, labels):
        self.samples = samples
        self.labels = labels

    def __getitem__(self, k):
        """Returneaza al k-lea exemplu din dataset"""
        return self.samples[k], self.labels[k]

    def __len__(self):
        """Returneaza dimensiunea datasetului"""
        return len(self.samples)

    * un Embedding layer pentru vocabularul determinat, ce contine vectori de dimensiune 100
    * un layer dropout cu probabilitate 0.4
    * un layer convolutional 1D cu 100 canale de input si 128 de canale de output, dimensiunea kernelului de 3 si padding 1; asupra rezultatului aplicati un layer de [BatchNormalization](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html) cu 128 features; aplicati apoi functia de activare ReLU, iar in cele din urma un strat de max-pooling 1D cu kernel size 2.
    * un layer convolutional 1D cu 128 canale de input si 128 de canale de output, dimensiunea kernelului de 5 si padding 2; asupra rezultatului aplicati un layer de BatchNormalization cu 128 features; aplicati apoi functia de activare ReLU, iar in cele din urma un strat de max-pooling 1D cu kernel size 2.
    * un layer convolutional 1D cu 128 canale de input si 128 de canale de output, dimensiunea kernelului de 5 si padding 2; asupra rezultatului aplicati un layer de BatchNormalization cu 128 features; aplicati apoi functia de activare ReLU, iar in cele din urma un strat de max-pooling 1D cu kernel size 2.
    * asupra rezultatului ultimului layer, aplicati average-pooling 1D obtinand pentru fiecare canal media tuturor valorilor din vectorul sau corespunzator
    * un layer feed-forward (linear) cu dimensiunea inputului 128, si 2 noduri pentru output (pentru clasificare in 0/1)

In [67]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = torch.nn.Embedding(train_reviews_vectorized.shape[0], 100, padding_idx=1)
        conv1 = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=100, out_channels=128, kernel_size=3, padding=1),
            torch.nn.BatchNorm1d(num_features=128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(kernel_size=2),
        )
        conv2 = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, padding=2),
            torch.nn.BatchNorm1d(num_features=128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(kernel_size=2),
        )
        conv3 = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, padding=2),
            torch.nn.BatchNorm1d(num_features=128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(kernel_size=2),
        )
        global_average = torch.nn.AvgPool1d(kernel_size=100, stride=128)
        self.convolutions = torch.nn.Sequential(
            torch.nn.Dropout(p=0.4),
            conv1,
            conv2,
            conv3,
            global_average
        )

        # Flattening layer
        flatten = torch.nn.Flatten()

        # Linear layer cu 128 input features și 2 outputs fără funcție de activare
        linear = torch.nn.Linear(in_features=128, out_features=2)

        self.classifier = torch.nn.Sequential(flatten, linear)

    def forward(self, input):
        embeddings = self.embedding(input)
        embeddings = embeddings.permute(0, 2, 1)
        output = self.convolutions(embeddings)
        output = self.classifier(output)
        return output

In [68]:
from torch.utils.data import DataLoader

model = Model()

# Adam optimizer cu lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Cross Entropy loss
loss_fn = torch.nn.CrossEntropyLoss()

# Create dataset and dataloaders
train_ds = Dataset(train_reviews_vectorized, train_df['sentiment'].tolist())
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

val_ds = Dataset(val_reviews_vectorized, val_df['sentiment'].tolist())
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True)

test_ds = Dataset(test_reviews_vectorized, test_df['sentiment'].tolist())
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

In [73]:
def train_model(epochs):
    best_val_acc = 0
    for epoch_n in range(epochs):
        print(f"Epoch #{epoch_n + 1}")
        model.train()
        for batch in train_dl:
            model.zero_grad()

            inputs, targets = batch
            inputs = inputs.float().long()
            targets = targets

            output = model(inputs)
            loss = loss_fn(output, targets)

            loss.backward()
            optimizer.step()

        # validare
        model.eval()
        all_predictions = torch.tensor([])
        all_targets = torch.tensor([])
        for batch in val_dl:
            inputs, targets = batch
            inputs = inputs.float().long()
            targets = targets

            with torch.no_grad():
                output = model(inputs)

            predictions = output.argmax(1)
            all_targets = torch.cat([all_targets, targets.detach().cpu()])
            all_predictions = torch.cat([all_predictions, predictions.detach().cpu()])

        val_acc = (all_predictions == all_targets).float().mean().numpy()
        print(val_acc)

        if val_acc > best_val_acc:
            torch.save(model.state_dict(), "./data/best_model.pt")
            best_val_acc = val_acc

    print("Best validation accuracy", best_val_acc)

In [None]:
train_model(100)

Epoch #1
0.86377776
Epoch #2


In [None]:
from tensorflow import metrics


def eval_model(dataloader):
    model.eval()
    true_labels, predicted_labels = [], []
    with torch.no_grad():
        for (vectors, labels) in dataloader:
            output = model(vectors)
            true_labels.extend(labels.tolist())
            predicted_labels.extend(output.max(1)[1].tolist())
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    print("Evaluated Accuracy:", accuracy)

In [None]:
model.load_state_dict(torch.load("./data/best_model.pt"))
print("Evaluate test data:")
eval_model(test_dl)