### PROJECT SOLUTION

## 1. SET UP THE ENVIRONMENT

In [1]:
import json
from pathlib import Path

def flatten(list_of_list):
    """Concatenate the items of every sub-iterable into a single flat list, preserving order."""
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Directories the notebook reads the per-transcription .json / .txt files from.
path_to_training, path_to_test = Path("training"), Path("test")


In [2]:
#####
# training and test sets of transcription ids
#####
# Every meeting id is expanded into four transcription ids (suffixes 'a' to 'd').
training_set = flatten([
    [meeting + part for part in 'abcd']
    for meeting in [
        'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012',
        'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004',
        'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011',
        'TS3012',
    ]
])
# Three transcription ids are excluded from the training set.
# NOTE(review): presumably no data is available for them — confirm.
training_set = [t_id for t_id in training_set if t_id not in ('IS1002a', 'IS1005d', 'TS3012c')]

test_set = flatten([
    [meeting + part for part in 'abcd']
    for meeting in [
        'ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008',
        'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007',
    ]
])

## 2. FIRST MODEL: ENCODING UTTERANCE BY UTTERANCE

### 2.1 ADDING CONTEXT WITH CONTEXT GRAPH

In [3]:
#####
# text_baseline: utterances are embedded with SentenceTransformer, then train a classifier.
#####
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-mpnet-base-v2')

with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

X_training = []
y_training = []

for transcription_id in training_set:
    # Read the discourse graph ONCE per transcription. Each line is expected to look like
    # "<source index> <relation> <target index>". Calling readlines() inside the utterance
    # loop would exhaust the file on the first utterance and leave every later utterance
    # without its relation labels.
    relations_by_source = {}
    with open(path_to_training / f"{transcription_id}.txt", "r") as discourse_graph_file:
        for line in discourse_graph_file:
            fields = line.split()
            if len(fields) < 3:
                continue  # blank or malformed line
            index_0, comment = fields[0], fields[1]
            relations_by_source.setdefault(index_0, []).append(comment)

    with open(path_to_training / f"{transcription_id}.json", "r") as dialogue_file:
        transcription = json.load(dialogue_file)

    for utterance in transcription:
        # Indices read from the graph file are strings; compare as strings so the lookup
        # also works when the JSON stores the index as a number.
        relations = relations_by_source.get(str(utterance["index"]), [])
        # The relation labels are appended directly to the speaker tag (no separator).
        speaker = utterance["speaker"] + "".join(relations)
        X_training.append(speaker + ": " + utterance["text"])

    y_training += [float(label) for label in training_labels[transcription_id]]

# The test-set texts must be built with exactly the same scheme as the training texts.
assert len(X_training) == len(y_training), "features and labels are misaligned"


  from .autonotebook import tqdm as notebook_tqdm


### 2.2 ENCODING WITH BERT

In [None]:
# Replace the raw "speaker: text" strings with their sentence embeddings.
# NOTE: this rebinds X_training, so the cell is not idempotent — re-run the cell that
# builds the text list before executing this one a second time.
# NOTE(review): the classifier below expects 768 input features — confirm that this is the
# embedding size produced by the chosen SentenceTransformer model.
X_training = bert.encode(X_training, show_progress_bar=True)

### 2.3 TRAINING MODELS

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [7]:
# NOTE(review): this only binds a Python variable; nothing in the visible notebook reads it.
# If synchronous CUDA kernel launches were intended (for debugging), the CUDA_LAUNCH_BLOCKING
# environment variable presumably has to be set before CUDA is initialised — confirm intent.
CUDA_LAUNCH_BLOCKING=1
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature entry with the label at the same position."""

    def __init__(self, X, y):
        # Both containers are stored as given and are indexed with the same position.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X[index]
        label = self.y[index]
        return features, label

# Hold out 20% of the samples for validation. The feature and label containers are split
# together (same shuffled indices, fixed seed) and only then wrapped in Dataset objects, so
# the loaders consume real Dataset instances.
X_train, X_val, y_train, y_val = train_test_split(
    X_training, y_training, test_size=0.2, random_state=0
)
train_data = Dataset(X_train, y_train)
val_data = Dataset(X_val, y_val)

# create the dataloaders: only the training batches need reshuffling every epoch; a fixed
# validation order keeps the per-batch-averaged validation metrics reproducible
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=128, shuffle=False)

# load model (fall back to the CPU when no GPU is available instead of crashing)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = nn.Sequential(
    # 768 input features: one per dimension of the sentence embedding fed to the model
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Dropout(p=.3),

    nn.Linear(256, 32),
    nn.ReLU(),

    # single sigmoid unit: the output is a score in [0, 1] for the positive class
    nn.Linear(32, 1),
    nn.Sigmoid()
).to(device)

In [2]:
# Helper shared by the training and validation passes of train()
def _run_epoch(model, loader, criterion, device, optimizer=None):
  """
  Run one full pass over `loader`.

  Weights are updated only when an `optimizer` is given; otherwise the model is put in
  evaluation mode and gradients are disabled.

  Returns:
  - (mean loss, mean accuracy, mean F1), each averaged over the batches of the pass
  """
  training = optimizer is not None
  # Set the mode once per pass instead of once per batch
  model.train(training)

  losses, accuracies, f1_scores = [], [], []
  with torch.set_grad_enabled(training):
    for data, label in loader:
      output = model(data.to(device)).view(-1)
      # Labels collated from Python floats are float64 while the model outputs float32;
      # cast them so that losses requiring matching dtypes (e.g. BCELoss) accept them.
      target = label.to(device=device, dtype=output.dtype)
      loss = criterion(output, target)

      if training:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      # Scores of at least 0.5 are predicted as the positive class
      prediction = (output.detach().cpu() >= 0.5).long()

      losses.append(loss.item())
      accuracies.append(accuracy_score(label.cpu(), prediction))
      f1_scores.append(f1_score(label.cpu(), prediction))

  return np.mean(losses), np.mean(accuracies), np.mean(f1_scores)


# Define the train function to train the model
def train(model, optimizer, criterion, scheduler, train_loader, val_loader, n_epochs, device="cpu"):
  """
  Train the model for a specified number of epochs.

  After every epoch the model weights are saved to 'final_model/model<epoch>.pt'
  (the directory is created if it does not exist) and the epoch metrics are printed.

  Args:
  - model: The model to be trained; it must already live on `device` and output one score per sample
  - optimizer: The optimizer used for training
  - criterion: The loss function, called as criterion(flattened_output, labels)
  - scheduler: The learning rate scheduler, or None; it is stepped with the latest
    validation loss (ReduceLROnPlateau-style interface)
  - train_loader: The data loader for the training set
  - val_loader: The data loader for the validation set
  - n_epochs: The number of epochs to train for
  - device: The device to run the training on (default: "cpu")

  Returns:
  - train_loss: List of training losses for each epoch
  - val_loss: List of validation losses for each epoch
  - train_acc: List of training accuracies for each epoch
  - val_acc: List of validation accuracies for each epoch
  - train_f1: List of training F1 scores for each epoch
  - val_f1: List of validation F1 scores for each epoch

  All metrics are means of per-batch values, not values computed over the whole set.
  """
  import os

  # Create the checkpoint directory up front so torch.save cannot fail after a full epoch
  os.makedirs('final_model', exist_ok=True)

  # Metrics for each epoch
  train_loss = []
  val_loss = []
  train_acc = []
  val_acc = []
  train_f1 = []
  val_f1 = []

  for epoch in range(n_epochs):
    # Training pass
    epoch_loss, epoch_acc, epoch_f1 = _run_epoch(model, train_loader, criterion, device, optimizer)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    train_f1.append(epoch_f1)

    # Validation pass (no optimizer: evaluation mode, no gradients)
    epoch_loss, epoch_acc, epoch_f1 = _run_epoch(model, val_loader, criterion, device)
    val_loss.append(epoch_loss)
    val_acc.append(epoch_acc)
    val_f1.append(epoch_f1)

    # Check if scheduler is not None
    if scheduler is not None:
      scheduler.step(val_loss[-1])

    torch.save(model.state_dict(), 'final_model/model' + str(epoch) + '.pt')

    print(f"Epoch {epoch} : train loss {train_loss[-1]}, val loss {val_loss[-1]}, train acc {train_acc[-1]}, val acc {val_acc[-1]}, train f1 {train_f1[-1]}, val f1 {val_f1[-1]}")

  return train_loss, val_loss, train_acc, val_acc, train_f1, val_f1

In [9]:
def plotter(train_loss, val_loss, train_acc, val_acc, train_f1, val_f1):
  """
  Plot the training and validation curves of loss, accuracy and F1 side by side.

  Args:
  - train_loss, val_loss: Per-epoch losses
  - train_acc, val_acc: Per-epoch accuracies
  - train_f1, val_f1: Per-epoch F1 scores
  """
  panels = [
    ("Loss", train_loss, val_loss),
    ("Accuracy", train_acc, val_acc),
    ("F1", train_f1, val_f1),
  ]

  fig, axs = plt.subplots(1, 3, figsize=(20,5))
  for ax, (metric, train_values, val_values) in zip(axs, panels):
    ax.plot(train_values, label=f"Train {metric}")
    ax.plot(val_values, label=f"Val {metric}")
    # Title and axis labels let each panel be read on its own
    ax.set(title=metric, xlabel="Epoch", ylabel=metric)
    ax.legend()

  plt.show()

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# The model ends with a single sigmoid unit, so the matching loss is binary cross-entropy.
# CrossEntropyLoss applied to the flattened (batch,) output would treat the whole batch as
# one sample with `batch` classes, which is not the binary objective being trained here.
bce_loss = nn.BCELoss()


def criterion(output, target):
    """Binary cross-entropy between sigmoid scores and 0/1 labels.

    The labels are cast to the dtype of the scores because BCELoss requires both
    tensors to share a dtype.
    """
    return bce_loss(output, target.to(output.dtype))


# adaptative lr: divide the learning rate by 10 when the validation loss has not improved for 20 epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=.1, patience=20, verbose=True)

# Train for 40 epochs on the device the model was moved to
train_loss, val_loss, train_acc, val_acc, train_f1, val_f1 = train(model, optimizer, criterion, scheduler, train_loader, val_loader, 40, device)

plotter(train_loss, val_loss, train_acc, val_acc, train_f1, val_f1)

### 2.4 MAKING PREDICTIONS

In [None]:
# Restore the weights of the checkpoint selected for prediction.
# NOTE(review): train() writes its checkpoints to 'final_model/model<epoch>.pt'; this file is
# read from 'models/' — presumably copied and renamed by hand, confirm.
path_chosen = 'model32_mlp'
# map_location lets weights saved on a GPU be loaded on whatever device the model lives on
model.load_state_dict(torch.load('models/' + path_chosen + '.pt', map_location=device))
# Evaluation mode disables dropout so that predictions are deterministic
model.eval();

In [12]:
# Encoding test set and predicting one label per utterance

test_labels = {}

# Evaluation mode disables dropout so that predictions are deterministic
model.eval()

for transcription_id in test_set:
    # Read the discourse graph ONCE per transcription. Each line is expected to look like
    # "<source index> <relation> <target index>". Calling readlines() inside the utterance
    # loop would exhaust the file on the first utterance and leave every later utterance
    # without its relation labels.
    relations_by_source = {}
    with open(path_to_test / f"{transcription_id}.txt", "r") as discourse_graph_file:
        for line in discourse_graph_file:
            fields = line.split()
            if len(fields) < 3:
                continue  # blank or malformed line
            index_0, comment = fields[0], fields[1]
            relations_by_source.setdefault(index_0, []).append(comment)

    with open(path_to_test / f"{transcription_id}.json", "r") as dialogue_file:
        transcription = json.load(dialogue_file)

    # The texts must be built with exactly the same scheme as the training texts.
    X_test = []
    for utterance in transcription:
        # Indices read from the graph file are strings; compare as strings so the lookup
        # also works when the JSON stores the index as a number.
        relations = relations_by_source.get(str(utterance["index"]), [])
        speaker = utterance["speaker"] + "".join(relations)
        X_test.append(speaker + ": " + utterance["text"])
    X_test = bert.encode(X_test)

    # No gradients are needed for inference
    with torch.no_grad():
        y_test = model(torch.Tensor(X_test).to(device))
    # int(.5 + score) maps scores of at least 0.5 to label 1 and the others to label 0
    test_labels[transcription_id] = [int(.5 + res[0]) for res in y_test.tolist()]


In [26]:
# Write the predicted labels (one list per test transcription id) to a JSON file.
model_name = 'MLP3'
# Create the output directory first: open(..., "w") does not create missing directories
Path("model_results").mkdir(parents=True, exist_ok=True)
with open(f"model_results/{model_name}_test_labels.json", "w") as file:
    json.dump(test_labels, file, indent=4)