### PROJECT SOLUTION

## 1. SET UP THE ENVIRONMENT

In [1]:
import json
from pathlib import Path

def flatten(list_of_list):
    """Concatenate the items of every sub-iterable into one flat list.

    Items keep their original order: all items of the first sub-iterable,
    then all items of the second, and so on.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Dataset directories, resolved relative to the notebook's working directory.
data_root = Path(".")
path_to_training = data_root / "training"
path_to_test = data_root / "test"

In [2]:
#####
# Transcription ids of the training and test sets.
# Every meeting id is expanded into four ids, one per suffix in 'abcd'.
#####
training_set = flatten([
    [f"{meeting_id}{suffix}" for suffix in 'abcd']
    for meeting_id in [
        'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
        'ES2012', 'ES2013', 'ES2015', 'ES2016',
        'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007',
        'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
    ]
])
# These three ids are dropped from the training set after expansion.
for excluded_id in ('IS1002a', 'IS1005d', 'TS3012c'):
    training_set.remove(excluded_id)

test_set = flatten([
    [f"{meeting_id}{suffix}" for suffix in 'abcd']
    for meeting_id in [
        'ES2003', 'ES2004', 'ES2011', 'ES2014',
        'IS1008', 'IS1009',
        'TS3003', 'TS3004', 'TS3006', 'TS3007',
    ]
])

## 2. FIRST MODEL: ENCODING UTTERANCE BY UTTERANCE

### 2.1 ADDING CONTEXT WITH CONTEXT GRAPH

In [3]:
#####
# text_baseline: utterances are embedded with SentenceTransformer, then train a classifier.
#
# Each training input is the string "<speaker><context>: <text>", where <context>
# lists, for every discourse-graph edge leaving the utterance, the relation name
# and the text of the utterance it points to.
#####
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

X_training = []
y_training = []

# Build the TRAINING features from the training ids and the training directory
# (labels in training_labels are keyed by training transcription ids).
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as dialogue_file:
        transcription = json.load(dialogue_file)

    # Utterance text by index. Keys are stringified so they compare equal to the
    # tokens read from the graph file whatever type the JSON "index" has.
    text_by_index = {str(utterance["index"]): utterance["text"] for utterance in transcription}

    # Read the discourse graph exactly once (a file handle is exhausted after the
    # first full read): source index -> [(relation, target index), ...].
    outgoing_edges = {}
    with open(path_to_training / f"{transcription_id}.txt", "r") as discourse_graph_file:
        for line in discourse_graph_file:
            fields = line.split()  # whitespace split also strips the trailing newline
            if len(fields) < 3:
                continue  # blank or malformed line: nothing to attach
            source_index, relation, target_index = fields[:3]
            outgoing_edges.setdefault(source_index, []).append((relation, target_index))

    for utterance in transcription:
        # Build the prefix in a local string so the loaded transcription is not mutated.
        prefix = utterance["speaker"]
        for relation, target_index in outgoing_edges.get(str(utterance["index"]), []):
            prefix += f" {relation} on"
            if target_index in text_by_index:
                prefix += f" {text_by_index[target_index]}"
        X_training.append(prefix + ": " + utterance["text"])

    y_training += training_labels[transcription_id]


### 2.2 ENCODING WITH BERT

In [None]:
# Encode the utterance strings collected in X_training with the SentenceTransformer
# model `bert`. The result overwrites X_training, so this cell is not idempotent:
# re-running it without first re-running the cell that builds X_training would
# pass the already-encoded output back into bert.encode.
X_training = bert.encode(X_training, show_progress_bar=True)

### 2.3 TRAINING MODELS

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Classifiers to fit, keyed by model name.
models = dict(
    decision_tree=DecisionTreeClassifier(max_depth=9, random_state=0),
)

# Fit every registered classifier on the encoded training data; an entry
# named "svc" would be skipped.
for model in list(models):
    if model == "svc":
        continue
    classifier = models[model]
    classifier.fit(X_training, y_training)

### 2.4 MAKING PREDICTIONS

In [31]:
# Encoding the test set and predicting labels.
#
# For every test transcription: build one input string per utterance in the form
# "<speaker><context>: <text>" (context = relation name and target-utterance text
# for each discourse-graph edge leaving the utterance), encode the strings with
# `bert`, and store each model's predictions under test_labels[model][transcription_id].

test_labels = {model: {} for model in models if model != "svc"}

for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as dialogue_file:
        transcription = json.load(dialogue_file)

    # Utterance text by index. Keys are stringified so they compare equal to the
    # tokens read from the graph file whatever type the JSON "index" has.
    text_by_index = {str(utterance["index"]): utterance["text"] for utterance in transcription}

    # Read the discourse graph exactly once (a file handle is exhausted after the
    # first full read): source index -> [(relation, target index), ...].
    outgoing_edges = {}
    with open(path_to_test / f"{transcription_id}.txt", "r") as discourse_graph_file:
        for line in discourse_graph_file:
            fields = line.split()  # whitespace split also strips the trailing newline
            if len(fields) < 3:
                continue  # blank or malformed line: nothing to attach
            source_index, relation, target_index = fields[:3]
            outgoing_edges.setdefault(source_index, []).append((relation, target_index))

    X_test = []
    for utterance in transcription:
        # Build the prefix in a local string so the loaded transcription is not mutated.
        prefix = utterance["speaker"]
        for relation, target_index in outgoing_edges.get(str(utterance["index"]), []):
            prefix += f" {relation} on"
            if target_index in text_by_index:
                prefix += f" {text_by_index[target_index]}"
        X_test.append(prefix + ": " + utterance["text"])
    X_test = bert.encode(X_test)

    for model in models:
        if model != "svc":
            y_test = models[model].predict(X_test)
            test_labels[model][transcription_id] = y_test.tolist()


In [32]:
# Write each model's predictions to its own JSON file; an entry named "svc"
# would be skipped.
for model in models:
    if model == "svc":
        continue
    output_name = f"{model}_test_labels_text_baseline.json"
    with open(output_name, "w") as file:
        json.dump(test_labels[model], file, indent=4)