In [None]:
import torch
from datasets import load_dataset
import nltk
from collections import Counter
import itertools
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import random
from gensim.models import Word2Vec
import gensim.downloader as api
import time
import fasttext.util

In [None]:
# NLTK resources fetched up front; quiet=True hides the download log.
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)

# Column of the dataset used as the classification label.
class_name = "scenario"

# "fr-FR" configuration of the AmazonScience/massive dataset.
ds = load_dataset("AmazonScience/massive", "fr-FR")

# Utterances (X_*) and labels (Y_*) for each of the three splits.
X_train, Y_train = ds["train"]["utt"], ds["train"][class_name]
X_valid, Y_valid = ds["validation"]["utt"], ds["validation"][class_name]
X_test, Y_test = ds["test"]["utt"], ds["test"][class_name]

In [None]:
# Pretrained 300-dimensional word2vec vectors fetched through gensim's downloader;
# used by the "word2vec_pretrained*" methods of createData.
# NOTE(review): the model id suggests English news text while the dataset above is the
# fr-FR configuration — confirm that the coverage of French tokens is acceptable.
w2v_pretrained = api.load("word2vec-google-news-300")

In [None]:
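# Run this cell ONLY if you skipped the previous cell: keeping both pretrained models in memory at once is too expensive.
# Uncomment the two lines below to define `ft`, which the "word2vec_pretrained_ft" and "generation" methods of createData require.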

#fasttext.util.download_model('fr', if_exists='ignore')  
#ft = fasttext.load_model('cc.fr.300.bin')

In [None]:
# Names of the 18 scenario classes.
# NOTE(review): presumably listed in the order of the dataset's integer "scenario" ids — confirm against the dataset card.
_SCENARIOS = ['social', 'transport', 'calendar', 'play', 'news', 'datetime', 'recommendation', 'email',
              'iot', 'general', 'audio', 'lists', 'qa', 'cooking', 'takeaway', 'music', 'alarm', 'weather']

In [None]:
def get_index(traindata, validdata):
    """Encode sentences as lists of integer vocabulary indices.

    The vocabulary is built from ``traindata`` only: tokens are ranked by
    decreasing frequency and numbered from 1, so index 0 never denotes a
    training token.

    Args:
        traindata: iterable of raw sentences (str) used to build the vocabulary.
        validdata: iterable of raw sentences (str) encoded with that vocabulary.

    Returns:
        ``(train_indices, valid_indices)``: two lists holding one list of int
        per sentence. Tokens of ``validdata`` that are absent from the training
        vocabulary are encoded as 0.
    """
    # Unknown tokens get index 0. The previous choice, len(vocabulary), is the
    # index of the least frequent training token (indices run 1..len), so
    # unknown words were silently read as that real word. Index 0 is otherwise
    # unused here and still fits a one-hot encoding with len(vocabulary) + 1 classes.
    unk_index = 0

    train_tokens = [nltk.word_tokenize(sentence) for sentence in traindata]
    valid_tokens = [nltk.word_tokenize(sentence) for sentence in validdata]

    # most_common() orders by decreasing count and keeps first-seen order for ties.
    counts = Counter(itertools.chain.from_iterable(train_tokens))
    word_to_index = {word: rank for rank, (word, _) in enumerate(counts.most_common(), start=1)}

    train_indices = [[word_to_index[word] for word in sentence] for sentence in train_tokens]
    valid_indices = [[word_to_index.get(word, unk_index) for word in sentence] for sentence in valid_tokens]
    return train_indices, valid_indices

def get_w2v(traindata, validdata, vector_size=100, window=5, w2v_encoder=None, fasttext=False):
    """Turn every sentence into a ``(n_tokens, dim)`` float32 tensor of word vectors.

    Args:
        traindata: iterable of raw training sentences (str).
        validdata: iterable of raw validation sentences (str).
        vector_size: embedding size used when a Word2Vec model is trained here.
        window: context window used when a Word2Vec model is trained here.
        w2v_encoder: optional pretrained encoder supporting ``word in encoder``
            and ``encoder[word]``. When None, a gensim Word2Vec model is trained
            on the tokenized ``traindata`` and its keyed vectors are used.
        fasttext: set to True when ``w2v_encoder`` exposes its dimension through
            ``get_dimension()`` instead of a ``vector_size`` attribute.

    Returns:
        ``(train_tensors, valid_tensors)``: two lists with one tensor per
        sentence. Tokens unknown to the encoder become all-zero rows; a sentence
        without tokens yields an empty ``(0, dim)`` tensor.
    """
    train_tokens = [nltk.word_tokenize(sentence) for sentence in traindata]
    valid_tokens = [nltk.word_tokenize(sentence) for sentence in validdata]

    if w2v_encoder is None:
        w2v_model = Word2Vec(sentences=train_tokens, vector_size=vector_size, window=window, min_count=1, workers=4)
        w2v_encoder = w2v_model.wv

    dim = w2v_encoder.get_dimension() if fasttext else w2v_encoder.vector_size
    oov_vector = np.zeros(dim, dtype=np.float32)

    def embed(tokens):
        rows = [w2v_encoder[token] if token in w2v_encoder else oov_vector for token in tokens]
        if not rows:
            # Keep the feature axis so downstream code can still read size()[1].
            return torch.zeros((0, dim), dtype=torch.float32)
        # Stack in numpy first: torch.tensor() on a list of arrays converts element by element.
        return torch.from_numpy(np.asarray(rows, dtype=np.float32))

    train_tensors = [embed(tokens) for tokens in train_tokens]
    valid_tensors = [embed(tokens) for tokens in valid_tokens]
    return train_tensors, valid_tensors


def get_w2v_test(traindata, validdata, vector_size=100, window=5, w2v_encoder=None):
    """Build next-token prediction pairs of word-vector sequences.

    Each sentence with at least two tokens is embedded once and split into an
    input sequence (all tokens but the last) and a target sequence (all tokens
    but the first). Shorter sentences are dropped.

    Args:
        traindata: iterable of raw training sentences (str).
        validdata: iterable of raw validation sentences (str).
        vector_size: embedding size used when a Word2Vec model is trained here.
        window: context window used when a Word2Vec model is trained here.
        w2v_encoder: optional pretrained encoder supporting ``word in encoder``
            and ``encoder[word]``. When None, a gensim Word2Vec model is trained
            on the tokenized ``traindata``.

    Returns:
        ``(train_inputs, train_targets, valid_inputs, valid_targets)``: four
        lists of float32 tensors. Inputs and targets of one sentence are views
        of the same embedded sequence, shifted by one token.
    """
    #X_train_tokenized = [[_SCENARIOS[y], "<BOS>"] + nltk.word_tokenize(x) for x,y in traindata]
    #X_valid_tokenized = [[_SCENARIOS[y], "<BOS>"] + nltk.word_tokenize(x) for x,y in validdata]
    train_tokens = [nltk.tokenize.wordpunct_tokenize(sentence) for sentence in traindata]
    valid_tokens = [nltk.tokenize.wordpunct_tokenize(sentence) for sentence in validdata]

    if w2v_encoder is None:
        w2v_model = Word2Vec(sentences=train_tokens, vector_size=vector_size, window=window, min_count=1, workers=4)
        w2v_encoder = w2v_model.wv

    # The encoder may expose its size as get_dimension() (as the fastText path in
    # get_w2v does) or as vector_size (gensim keyed vectors). The original code
    # always called get_dimension(), which broke the zero-vector fallback for the
    # model trained just above.
    if hasattr(w2v_encoder, "get_dimension"):
        dim = w2v_encoder.get_dimension()
    else:
        dim = w2v_encoder.vector_size
    oov_vector = np.zeros(dim, dtype=np.float32)

    def embed(tokens):
        rows = [w2v_encoder[token] if token in w2v_encoder else oov_vector for token in tokens]
        return torch.from_numpy(np.asarray(rows, dtype=np.float32))

    def shifted_pairs(tokenized):
        inputs, targets = [], []
        for tokens in tokenized:
            if len(tokens) > 1:  # need at least one (input, target) step
                sequence = embed(tokens)
                inputs.append(sequence[:-1])
                targets.append(sequence[1:])
        return inputs, targets

    train_tensors, train_targets_tensors = shifted_pairs(train_tokens)
    valid_tensors, valid_target_tensors = shifted_pairs(valid_tokens)
    return train_tensors, train_targets_tensors, valid_tensors, valid_target_tensors

def get_vocab_size():
    """Return the number of distinct tokens found in the training utterances.

    Reads the module-level ``X_train`` and tokenizes it with ``nltk.word_tokenize``.
    """
    distinct_tokens = set()
    for utterance in X_train:
        distinct_tokens.update(nltk.word_tokenize(utterance))
    return len(distinct_tokens)


# Computed once at definition time; collate_pack_onehot reads this global.
vocab_size = get_vocab_size()

def collate_pack_onehot(batch):
    """Collate ``(index_tensor, label)`` samples into a packed one-hot batch.

    Every index tensor is expanded to a float one-hot matrix with
    ``vocab_size + 1`` classes (``vocab_size`` is the module-level global),
    then the variable-length matrices are packed without requiring the batch
    to be sorted by length.

    Returns:
        ``(packed_sequence, labels)`` where ``labels`` is a tensor built from
        the second element of every sample.
    """
    n_classes = vocab_size + 1
    encoded = []
    labels = []
    for sample in batch:
        encoded.append(nn.functional.one_hot(sample[0], num_classes=n_classes).float())
        labels.append(sample[1])
    packed = nn.utils.rnn.pack_sequence(encoded, enforce_sorted=False)
    return packed, torch.tensor(labels)

def collate_pack(batch):
    """Collate ``(sequence_tensor, label)`` samples into a packed batch.

    The variable-length sequences are packed as they are (no sorting by length
    required) and the labels are gathered into a single tensor.

    Returns:
        ``(packed_sequence, labels)``.
    """
    sequences = []
    labels = []
    for sample in batch:
        sequences.append(sample[0])
        labels.append(sample[1])
    packed = nn.utils.rnn.pack_sequence(sequences, enforce_sorted=False)
    return packed, torch.tensor(labels)

def _build_loaders(train_samples, valid_samples, batch_size, collate_fn=None):
    """Return a shuffled training DataLoader and an ordered validation DataLoader."""
    trainloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
    validloader = DataLoader(valid_samples, batch_size=batch_size, collate_fn=collate_fn)
    return trainloader, validloader


def _sequence_loaders(train_tensors, valid_tensors, batch_size, collate_fn):
    """Pair per-sentence tensors with their labels and batch them with ``collate_fn``."""
    train_pairs = list(zip(train_tensors, torch.tensor(Y_train)))
    valid_pairs = list(zip(valid_tensors, torch.tensor(Y_valid)))
    return _build_loaders(train_pairs, valid_pairs, batch_size, collate_fn)


def _mean_pooled_loaders(train_tensors, valid_tensors, dim, batch_size):
    """Average each sentence's word vectors into one float64 vector of size ``dim``."""
    def pool(sentence):
        if sentence.numel() == 0:
            # Same dtype as the means so every pooled vector can be stacked together.
            return torch.zeros(dim, dtype=torch.float64)
        return torch.mean(sentence, dim=0, dtype=torch.float64)

    train_data = TensorDataset(torch.stack([pool(x) for x in train_tensors]), torch.tensor(Y_train))
    valid_data = TensorDataset(torch.stack([pool(x) for x in valid_tensors]), torch.tensor(Y_valid))
    return _build_loaders(train_data, valid_data, batch_size)


def _require_fasttext():
    """Return the module-level fastText model ``ft`` or fail with an actionable message."""
    try:
        return ft
    except NameError:
        raise NameError("createData: the fastText model `ft` is not defined; "
                        "load it (see the commented-out fastText cell) before using this method") from None


def createData(method, batch_size=25, w2v_vector_size=100, w2v_window=5):
    """Prepare the train/validation data for one input representation.

    Reads the module-level ``X_train``, ``Y_train``, ``X_valid`` and ``Y_valid``.

    Args:
        method: one of
            "index"                     lists of vocabulary indices (no loaders);
            "padded_index"              index arrays zero-padded / truncated to the
                                        longest training sentence (no loaders);
            "onehot"                    loaders of packed one-hot sequences;
            "tf-idf"                    loaders of dense TF-IDF vectors;
            "word2vec"                  loaders of packed word-vector sequences
                                        (Word2Vec trained on the training set);
            "word2vec_mean"             loaders of mean-pooled word vectors;
            "word2vec_pretrained"       as "word2vec" with ``w2v_pretrained``;
            "word2vec_pretrained_mean"  as "word2vec_mean" with ``w2v_pretrained``;
            "word2vec_pretrained_ft"    as "word2vec" with the fastText model ``ft``;
            "generation"                lists of (input, target) sequence pairs
                                        built with ``ft`` (no loaders).
        batch_size: batch size of the returned DataLoaders (ignored by the
            methods that do not return loaders).
        w2v_vector_size: embedding size for the Word2Vec model trained here.
        w2v_window: context window for the Word2Vec model trained here.

    Returns:
        A ``(train, valid)`` pair whose type depends on ``method`` (see above).

    Raises:
        ValueError: if ``method`` is not recognized.
        NameError: if a fastText-based method is requested while ``ft`` is undefined.
    """
    if method == "index":
        return get_index(X_train, X_valid)

    if method == "padded_index":
        embedded_train, embedded_valid = get_index(X_train, X_valid)
        max_sentence_length = max(len(x) for x in embedded_train)

        def pad(x):
            return np.pad(x, (0, max_sentence_length - len(x)), 'constant', constant_values=(0, 0))

        X_train_padded = [pad(x) for x in embedded_train]
        # Validation sentences longer than any training sentence are truncated first.
        X_valid_padded = [pad(x[:max_sentence_length]) for x in embedded_valid]
        return X_train_padded, X_valid_padded

    if method == "onehot":
        embedded_train, embedded_valid = get_index(X_train, X_valid)
        train_tensors = [torch.tensor(x) for x in embedded_train]
        valid_tensors = [torch.tensor(x) for x in embedded_valid]
        return _sequence_loaders(train_tensors, valid_tensors, batch_size, collate_pack_onehot)

    if method == "tf-idf":
        vectorizer = TfidfVectorizer()
        X_train_tfidf = torch.Tensor(vectorizer.fit_transform(X_train).toarray())
        X_valid_tfidf = torch.Tensor(vectorizer.transform(X_valid).toarray())
        train_data_tfidf = TensorDataset(X_train_tfidf, torch.tensor(Y_train))
        valid_data_tfidf = TensorDataset(X_valid_tfidf, torch.tensor(Y_valid))
        return _build_loaders(train_data_tfidf, valid_data_tfidf, batch_size)

    if method == "word2vec":
        # One sample is a tensor of size (len(sentence), w2v_vector_size); batches are packed.
        train_tensors, valid_tensors = get_w2v(X_train, X_valid, w2v_vector_size, w2v_window)
        return _sequence_loaders(train_tensors, valid_tensors, batch_size, collate_pack)

    if method == "word2vec_mean":
        # One sample is a single vector of size (w2v_vector_size).
        train_tensors, valid_tensors = get_w2v(X_train, X_valid, w2v_vector_size, w2v_window)
        return _mean_pooled_loaders(train_tensors, valid_tensors, w2v_vector_size, batch_size)

    if method == "word2vec_pretrained":
        train_tensors, valid_tensors = get_w2v(X_train, X_valid, w2v_encoder=w2v_pretrained)
        return _sequence_loaders(train_tensors, valid_tensors, batch_size, collate_pack)

    if method == "word2vec_pretrained_mean":
        train_tensors, valid_tensors = get_w2v(X_train, X_valid, w2v_encoder=w2v_pretrained)
        return _mean_pooled_loaders(train_tensors, valid_tensors, w2v_pretrained.vector_size, batch_size)

    if method == "word2vec_pretrained_ft":
        train_tensors, valid_tensors = get_w2v(X_train, X_valid, w2v_encoder=_require_fasttext(), fasttext=True)
        return _sequence_loaders(train_tensors, valid_tensors, batch_size, collate_pack)

    if method == "generation":
        train_tensors, train_targets, valid_tensors, valid_targets = get_w2v_test(
            X_train, X_valid, w2v_encoder=_require_fasttext())
        train_data = list(zip(train_tensors, train_targets))
        valid_data = list(zip(valid_tensors, valid_targets))
        return train_data, valid_data

    raise ValueError("createData: method not recognized")

In [None]:
#Model definitions

class Feedforward(nn.Module):
    """Two-layer perceptron: Linear -> activation -> Dropout(0.6) -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim, activation_function):
        """Create the two linear layers.

        Args:
            input_dim: size of one input vector.
            hidden_dim: size of the hidden layer.
            output_dim: number of output scores.
            activation_function: callable applied to the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.dropout = nn.Dropout(p=0.6)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.activation_function = activation_function

    def forward(self, text):
        """Return the unnormalized output scores for ``text`` (cast to float32)."""
        activated = self.activation_function(self.fc1(text.float()))
        return self.fc2(self.dropout(activated))
    
class RNN(nn.Module):
    """Sequence classifier: (stacked, optionally bidirectional) RNN + linear head.

    The final hidden state of the LAST recurrent layer is fed to the linear
    head; for a bidirectional RNN the final states of both directions are
    concatenated. The output is a vector of log-probabilities per sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional=False):
        """
        Args:
            input_dim: size of one input token vector.
            hidden_dim: size of the recurrent hidden state (per direction).
            output_dim: number of classes.
            n_layers: number of stacked recurrent layers.
            bidirectional: whether to run the RNN in both directions.
        """
        super().__init__()
        self.bidirectional = bidirectional
        # nn.RNN applies dropout between layers only, and warns when asked for
        # dropout with a single layer.
        inter_layer_dropout = 0.5 if n_layers > 1 else 0.0
        self.rnn = nn.RNN(input_dim, hidden_dim, bidirectional=bidirectional, batch_first=True,
                          num_layers=n_layers, dropout=inter_layer_dropout)
        n_directions = 2 if bidirectional else 1
        self.h2o = nn.Linear(hidden_dim * n_directions, output_dim)
        # dim=-1 normalizes over the classes for batched and unbatched input alike.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, text):
        """Return class log-probabilities for a padded or packed batch of sequences."""
        _, hidden = self.rnn(text)
        # hidden is laid out as (n_layers * n_directions, ...), layer-major, so the
        # last entries belong to the top layer. hidden[0] (used previously) is the
        # first layer and made every additional layer irrelevant.
        if self.bidirectional:
            summary = torch.cat((hidden[-2], hidden[-1]), dim=-1)
        else:
            summary = hidden[-1]
        return self.softmax(self.h2o(summary))
    
class GenRNN(nn.Module):
    """Hand-written recurrent cell that processes one time step per call.

    ``n_layers`` and ``bidirectional`` are accepted for signature parity with
    the other models but are not used.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional=False):
        """
        Args:
            input_dim: size of one input vector.
            hidden_dim: size of the hidden state.
            output_dim: size of the output vector.
            n_layers: unused.
            bidirectional: unused.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.i2h = nn.Linear(input_dim + hidden_dim, hidden_dim)
        self.i2o = nn.Linear(input_dim + hidden_dim, output_dim)
        self.o2o = nn.Linear(hidden_dim + output_dim, output_dim)
        # Explicit feature axis: the implicit-dim form is deprecated and warns on every call.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, text, hidden):
        """Advance the cell by one step.

        Args:
            text: input vector of size ``input_dim`` (optionally with leading batch dims).
            hidden: previous hidden state of size ``hidden_dim`` (same leading dims).

        Returns:
            ``(output, hidden)``: log-softmax output over the last axis and the new hidden state.
        """
        # Concatenate along the feature axis so 1-D and batched inputs both work.
        input_combined = torch.cat((text, hidden), dim=-1)
        hidden = torch.tanh(self.i2h(input_combined))
        output = torch.tanh(self.i2o(input_combined))
        output_combined = torch.cat((hidden, output), dim=-1)
        output = self.o2o(output_combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return the all-zero initial hidden state (1-D, size ``hidden_dim``)."""
        return torch.zeros(self.hidden_dim)
    
# TODO: later, try batching and using nn.RNN, adding up the loss of each step returned by self.rnn
class GenRNN_test(nn.Module):
    """Batched generator: nn.RNN followed by a linear head applied at every time step."""

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional=False):
        """
        Args:
            input_dim: size of one input token vector.
            hidden_dim: size of the recurrent hidden state (per direction).
            output_dim: size of the per-step output.
            n_layers: number of stacked recurrent layers.
            bidirectional: whether to run the RNN in both directions.
        """
        super().__init__()
        # nn.RNN applies dropout between layers only, and warns when asked for
        # dropout with a single layer.
        inter_layer_dropout = 0.5 if n_layers > 1 else 0.0
        self.rnn = nn.RNN(input_dim, hidden_dim, bidirectional=bidirectional, batch_first=True,
                          num_layers=n_layers, dropout=inter_layer_dropout)
        # A bidirectional RNN emits 2 * hidden_dim features per step.
        n_directions = 2 if bidirectional else 1
        self.h2o = nn.Linear(hidden_dim * n_directions, output_dim)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, text):
        """Return per-step log-probabilities of shape (batch, seq_len, output_dim).

        ``text`` must be a padded 3-D tensor (batch first); a packed sequence is
        not supported because the linear head is applied to the RNN output directly.
        """
        rnn_output, _ = self.rnn(text)
        # Fixed: the original called the non-existent attribute ``self.h2``.
        out = self.h2o(rnn_output)
        return self.softmax(out)

In [None]:
def evaluate(model, dataloader, criterion=nn.CrossEntropyLoss()):
    """Evaluate a classifier's predictions.

    The model is switched to eval mode and left in it; callers that keep
    training must call ``model.train()`` afterwards.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        dataloader: sized iterable of ``(inputs, labels)`` batches.
        criterion: loss applied to ``(outputs, labels)`` for every batch.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over batches and the
        fraction of samples whose highest-scoring class equals the label.
    """
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    # No gradients are needed here; disabling them avoids building the autograd graph.
    with torch.no_grad():
        for sentences, labels in dataloader:
            # Call the module itself (not .forward) so registered hooks still run.
            outputs = model(sentences.float())
            total_loss += criterion(outputs, labels).item()

            # Count the correct predictions of this batch.
            predictions = torch.argmax(outputs, dim=1)
            correct += (predictions == labels).sum().item()
            total += len(predictions)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

In [None]:
def train(model, training_data, n_epoch, learning_rate, report_every = 1, criterion = nn.CrossEntropyLoss(), optimizer = torch.optim.SGD, 
          needsOnehot = False, eval=False, validloader=None):
    """Train a classifier and return its per-epoch loss history.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        training_data: sized iterable of ``(inputs, labels)`` batches.
        n_epoch: number of passes over ``training_data``.
        learning_rate: initial learning rate given to the optimizer.
        report_every: print a progress line every this many epochs (0 disables printing).
        criterion: loss applied to ``(outputs, labels)``.
        optimizer: optimizer CLASS, instantiated here on ``model.parameters()``.
        needsOnehot: when True, index batches are one-hot encoded with
            ``get_vocab_size() + 1`` classes before the forward pass.
        eval: when True, ``evaluate`` is run on ``validloader`` after every epoch.
        validloader: validation batches, required when ``eval`` is True.

    Returns:
        A list with one entry per epoch: the average batch loss (float), or the
        tuple ``(average batch loss, validation loss, validation accuracy)``
        when ``eval`` is True.

    Raises:
        ValueError: if ``eval`` is True and no ``validloader`` is given.
    """
    # Fail before training instead of after the first full epoch.
    if eval and validloader is None:
        raise ValueError("train: eval=True requires a validloader")

    all_losses = []
    model.train()
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
    n_classes = get_vocab_size() + 1 if needsOnehot else None

    for epoch in range(1, n_epoch + 1):
        epoch_loss = 0
        model.zero_grad()  # start every epoch from clean gradients

        for sentences, labels in training_data:
            input_tensor = sentences
            if needsOnehot:
                input_tensor = nn.functional.one_hot(sentences, num_classes=n_classes)
            # Call the module itself (not .forward) so registered hooks still run.
            output = model(input_tensor.float())
            loss = criterion(output, labels)

            loss.backward()
            #nn.utils.clip_grad_norm_(model.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()

        # The scheduler watches the summed training loss of the epoch.
        scheduler.step(epoch_loss)
        average_loss = epoch_loss / len(training_data)

        if eval:
            valid_loss, valid_acc = evaluate(model, validloader)
            model.train()  # evaluate() leaves the model in eval mode
            all_losses.append((average_loss, valid_loss, valid_acc))
        else:
            all_losses.append(average_loss)

        if report_every != 0 and epoch % report_every == 0:
            if eval:
                lr = optimizer.param_groups[0]["lr"]
                print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1][0]}  ||  validation: loss = {all_losses[-1][1]}, accuracy = {all_losses[-1][2]}, lr = {lr}")
            else:
                print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")

    return all_losses

In [None]:
# Feed-forward classifier on TF-IDF features, trained with Adam.
trainloader, validloader = createData("tf-idf", batch_size=25)
model = Feedforward(
    input_dim=len(trainloader.dataset[0][0]),
    hidden_dim=100,
    output_dim=18,  # one score per scenario class
    activation_function=nn.functional.relu,
)

ff_tfidf_results = train(
    model,
    trainloader,
    n_epoch=100,
    learning_rate=1e-4,
    optimizer=torch.optim.Adam,
    eval=True,
    validloader=validloader,
)

In [None]:
# 3-layer RNN on packed one-hot token sequences, trained with Adam.
trainloader, validloader = createData("onehot")

rnn = RNN(
    input_dim=vocab_size + 1,  # same number of classes as the one-hot encoding
    hidden_dim=500,
    output_dim=18,  # one score per scenario class
    n_layers=3,
)
rnn_onehot_results = train(
    rnn,
    trainloader,
    n_epoch=100,
    learning_rate=1e-4,
    optimizer=torch.optim.Adam,
    eval=True,
    validloader=validloader,
)

In [None]:
# Bidirectional single-layer RNN on Word2Vec sequences trained on the training set (default SGD).
trainloader, validloader = createData("word2vec", batch_size=25, w2v_vector_size=100, w2v_window=5)
rnn_w2v = RNN(
    input_dim=trainloader.dataset[0][0].size()[1],  # embedding size of the first training sentence
    hidden_dim=100,
    output_dim=18,  # one score per scenario class
    n_layers=1,
    bidirectional=True,
)
rnn_w2v_results = train(
    rnn_w2v,
    trainloader,
    n_epoch=100,
    learning_rate=0.1,
    eval=True,
    validloader=validloader,
)

In [None]:
# Feed-forward classifier on mean-pooled Word2Vec vectors (model trained on the training set), with Adam.
trainloader, validloader = createData("word2vec_mean", batch_size=25, w2v_vector_size=100, w2v_window=5)
model = Feedforward(
    input_dim=len(trainloader.dataset[0][0]),
    hidden_dim=1000,
    output_dim=18,  # one score per scenario class
    activation_function=nn.functional.relu,
)

FF_w2v_mean_results = train(
    model,
    trainloader,
    n_epoch=100,
    learning_rate=0.001,
    optimizer=torch.optim.Adam,
    eval=True,
    validloader=validloader,
)

In [None]:
# Feed-forward classifier on mean-pooled pretrained word2vec vectors, with Adam.
trainloader, validloader = createData("word2vec_pretrained_mean", batch_size=25)
model = Feedforward(
    input_dim=len(trainloader.dataset[0][0]),
    hidden_dim=1000,
    output_dim=18,  # one score per scenario class
    activation_function=nn.functional.relu,
)

FF_w2v_pretrained_mean_results = train(
    model,
    trainloader,
    n_epoch=100,
    learning_rate=0.001,
    optimizer=torch.optim.Adam,
    eval=True,
    validloader=validloader,
)

In [None]:
# Single-layer unidirectional RNN on pretrained word2vec sequences (default SGD).
trainloader, validloader = createData("word2vec_pretrained", batch_size=25)
rnn_w2v = RNN(
    input_dim=trainloader.dataset[0][0].size()[1],  # embedding size of the first training sentence
    hidden_dim=100,
    output_dim=18,  # one score per scenario class
    n_layers=1,
    bidirectional=False,
)
rnn_w2v_pretrained_results = train(
    rnn_w2v,
    trainloader,
    n_epoch=20,
    learning_rate=0.05,
    eval=True,
    validloader=validloader,
)

In [None]:
# 4-layer unidirectional RNN on fastText sequences (default SGD).
# Requires the fastText model `ft`, which is only defined by the commented-out cell near the top.
trainloader, validloader = createData("word2vec_pretrained_ft", batch_size=25)
rnn_w2v_ft = RNN(
    input_dim=trainloader.dataset[0][0].size()[1],  # embedding size of the first training sentence
    hidden_dim=100,
    output_dim=18,  # one score per scenario class
    n_layers=4,
    bidirectional=False,
)
rnn_w2v_ft_pretrained_results = train(
    rnn_w2v_ft,
    trainloader,
    n_epoch=100,
    learning_rate=0.01,
    eval=True,
    validloader=validloader,
)

# Model evaluations:

### RNN:
| Model     | hidden_size | learning_rate | layers | epochs | train_loss | valid_loss | valid_accuracy | test_loss | test_accuracy | optimizer
| --------- | ----------- | ------------- | ------ | ------ | ---------- | ---------- | -------------- | --------- | ------------- | ---------
| onehot    | 100         |  0.01         |   4    |  30    | 0.0094407  | 0.80427022 | 0.8440727988   |           |               | Adam
| w2v       | 100         |  0.1          |   4    |  30    | 0.6402964  | 1.66495737 | 0.5533694048   |           |               | SGD
| w2v_google| 100         |  0.01         |   4    |  100   | 0.4981368  | 0.84794813 | 0.7668470241   |           |               | SGD
| w2v_fasttext| 100       |  0.01         |   1    |  200   | 0.1366848  | 0.89333971 | 0.7899655681   |           |               | SGD

##### Notes:
- For the onehot model, overfitting is a problem, especially after 30 epochs, when the validation loss starts to increase.

### FeedForward:
| Model     | hidden_size | learning_rate | epochs | train_loss | valid_loss | valid_accuracy | test_loss | test_accuracy | optimizer
| --------- | ----------- | ------------- | ------ | ---------- | ---------- | -------------- | --------- | ------------- | ---------
| tf-idf    | 100         |  1e-4         |  100   | 0.0910010  | 0.40158849 | 0.8878504672   |           |               | Adam
| w2v_mean  | 1000        |  0.001        |  100   | 1.6188243  | 1.52717660 | 0.5371372356   |           |               | Adam
| w2v_mean  | 1000        |  0.001        |  100   | 0.3744488  | 1.26549924 | 0.7412690605   |           |               | Adam

In [None]:
def _sequence_loss(model, inputs, targets, criterion):
    """Feed one sequence to ``model`` step by step and return the summed per-step loss."""
    hidden = model.initHidden()
    total = 0
    for step in range(len(inputs)):
        output, hidden = model(inputs[step].float(), hidden)
        total = total + criterion(output, targets[step])
    return total


def train_gen(model, training_data, n_epoch, learning_rate, report_every = 1, criterion = nn.CrossEntropyLoss(), optimizer = torch.optim.SGD, 
          needsOnehot = False, eval=False, validloader=None):
    """Train a step-wise generator (a model with ``forward(input, hidden)`` and ``initHidden()``).

    One optimizer step is taken per sequence, on the sum of the per-step losses.

    Args:
        model: module exposing ``initHidden()`` and ``forward(input, hidden) -> (output, hidden)``.
        training_data: sized iterable of ``(inputs, targets)`` sequence pairs of equal length.
        n_epoch: number of passes over ``training_data``.
        learning_rate: initial learning rate given to the optimizer.
        report_every: print a progress line every this many epochs (0 disables printing).
        criterion: loss applied to ``(output, target)`` at every step.
            NOTE(review): the targets produced by createData("generation") are
            embedding vectors; confirm the default CrossEntropyLoss is the intended objective.
        optimizer: optimizer CLASS, instantiated here on ``model.parameters()``.
        needsOnehot: unused; kept so the signature matches ``train``.
        eval: when True, the same summed sequence loss is computed on ``validloader`` after every epoch.
        validloader: validation ``(inputs, targets)`` pairs, required when ``eval`` is True.

    Returns:
        A list with one entry per epoch: the average sequence loss (float), or
        the tuple ``(average training loss, average validation loss, nan)`` when
        ``eval`` is True. The third slot mirrors the accuracy slot of ``train``;
        no accuracy is defined for this task, so it is NaN.

    Raises:
        ValueError: if ``eval`` is True and no ``validloader`` is given.
    """
    if eval and validloader is None:
        raise ValueError("train_gen: eval=True requires a validloader")

    all_losses = []
    model.train()
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

    for epoch in range(1, n_epoch + 1):
        epoch_loss = 0
        model.zero_grad()  # start every epoch from clean gradients

        for inputs, targets in training_data:
            if len(inputs) == 0:
                continue  # nothing to predict; the summed loss would not be a tensor
            loss = _sequence_loss(model, inputs, targets, criterion)

            loss.backward()
            #nn.utils.clip_grad_norm_(model.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()

        # The scheduler watches the summed training loss of the epoch.
        scheduler.step(epoch_loss)
        average_loss = epoch_loss / len(training_data)

        if eval:
            # The classification evaluate() cannot drive a model whose forward
            # needs a hidden state, so the validation loss is computed here.
            model.eval()
            valid_total = 0.0
            with torch.no_grad():
                for inputs, targets in validloader:
                    if len(inputs) == 0:
                        continue
                    valid_total += float(_sequence_loss(model, inputs, targets, criterion))
            model.train()
            all_losses.append((average_loss, valid_total / len(validloader), float("nan")))
        else:
            all_losses.append(average_loss)

        if report_every != 0 and epoch % report_every == 0:
            if eval:
                lr = optimizer.param_groups[0]["lr"]
                print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1][0]}  ||  validation: loss = {all_losses[-1][1]}, accuracy = {all_losses[-1][2]}, lr = {lr}")
            else:
                print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")

    return all_losses

In [None]:
# Step-wise generator trained to predict the next word vector.
# Requires the fastText model `ft`, which is only defined by the commented-out cell near the top.
traindata, validdata = createData("generation", batch_size=25)
rnn_gen = GenRNN(
    input_dim=len(traindata[0][0][0]),   # size of one word vector
    hidden_dim=100,
    output_dim=len(traindata[0][0][0]),  # the model outputs a vector of the same size
    n_layers=1,
    bidirectional=False,
)

train_gen(rnn_gen, traindata, n_epoch=20, learning_rate=0.05, eval=False, validloader=validdata)

In [None]:
# NOTE(review): leftover probe that indexes `ft` with the empty string. `ft` is only
# defined by the commented-out fastText cell above; on a fresh kernel without it this
# raises NameError. Consider removing this cell.
ft[""]