In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
!pip install sentence_transformers
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=31ce02520759138fe6759a7aa97d56389cc848292c862fc26c9cd965d8a62906
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

## Preprocessing

In [None]:
import json
from pathlib import Path
import networkx as nx
from collections import defaultdict


path_to_training = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training")
path_to_test = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/test")

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]


training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])


def read_discourse_graph(file_path):
    edges = []

    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split()
            source = int(parts[0])
            relation = parts[1]
            target = int(parts[2])
            edges.append((source, target, {'relation' : relation}))

    return edges

def create_graph(edges):
    graph = nx.DiGraph()
    graph.add_edges_from(edges)
    return graph


y_training = [] # list of training_labels
word_training = [] # list of all utterances
graph_training = [] # list of all the graphs
tab_train = [defaultdict(int) for _ in range(100000)] # keeps the dataframe counting the relaions handled by each utterance
n= 0

with open("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training_labels.json", "r") as file:
    training_labels = json.load(file)

for transcription_id in training_set:
    # Read the text file for the discourse graph and construct the dataframe
    graph_file_path = path_to_training / f"{transcription_id}.txt"
    edges = read_discourse_graph(graph_file_path)
    graph = create_graph(edges)
    graph_training.append(graph)
    for source , target , relation in edges:
      rel  =relation['relation']
      tab_train[n + source][rel] +=1
      tab_train[n + target][rel] +=1


    # Read the JSON file for the transcription
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        word_training.append(utterance["speaker"] + ": " + utterance["text"])

    y_training += training_labels[transcription_id]
    n += len(training_labels[transcription_id])
tab_train = tab_train[:n]

# dataframe of utterances relations counts

ensemble_cles = set()
for dictionnaire in tab_train:
    ensemble_cles.update(dictionnaire.keys())
ensemble_cles = list(ensemble_cles)

matrice = []
for dictionnaire in tab_train:
    ligne = [dictionnaire.get(cle, 0) for cle in ensemble_cles]
    matrice.append(ligne)

# Creating the dataframe
tab_train = pd.DataFrame(matrice, columns=ensemble_cles)


In [None]:
# Adding data by 2 by 2 merging
word_training_augmented = list(word_training)
y_training_augmented = list(y_training)
tab_training_augmented = tab_train.copy()


i = 0
while i < len(y_training) - 1:
    if y_training[i] == 1:
        j = i+1
        while j < len(y_training) and y_training[j] == 0:
            j+=1
        if j < len(y_training):
            new_utterance = word_training[i] + ' ' + word_training[j]
            word_training_augmented.append(new_utterance)
            y_training_augmented.append(1)
            tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)

        i = j
    i+=1

print(tab_training_augmented[-10:])


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_

       Result  Parallel  Correction  Clarification_question  Acknowledgement  \
79259       0         0           0                       0                2   
79260       0         0           0                       0                1   
79261       0         0           0                       0                1   
79262       0         0           0                       0                1   
79263       1         0           0                       0                1   
79264       0         0           0                       0                2   
79265       0         0           0                       0                3   
79266       0         0           0                       2                1   
79267       0         0           0                       0                1   
79268       0         0           0                       1                0   

       Q-Elab  Question-answer_pair  Alternation  Elaboration  Explanation  \
79259       0                     0      

  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab_train.iloc[j], ignore_index=True)
  tab_training_augmented = tab_training_augmented.append(tab_train.iloc[i] + tab

# Embedding

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device : {device}')
e5 = SentenceTransformer('intfloat/e5-large-v2').to(device)

Using device : cuda


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

handler.py:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

In [None]:
X_training = e5.encode(word_training_augmented, show_progress_bar=True, normalize_embeddings=True)
df_X_training = pd.DataFrame(X_training)

# Concatenate tab_train and X_training along the columns
X_training = pd.concat([tab_training_augmented, df_X_training], axis=1)

Batches:   0%|          | 0/2478 [00:00<?, ?it/s]

In [None]:
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from tqdm import tqdm


# Define the parameter distribution
param_dist = {
    'n_estimators': randint(2000, 3000),
    'max_depth': randint(1, 6),
    'learning_rate': uniform(0.01, 0.02),
    'gamma': uniform(0, 1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'lambda': uniform(1, 3),
    'alpha': uniform(0, 1),
    'scale_pos_weight': uniform(1, 10)
}

# Initialize the model
model = XGBClassifier(tree_method='hist', device = device)

# Perform 5-fold cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Define your custom scorer
scorer = make_scorer(f1_score)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    n_iter=20,  # Number of parameter settings that are sampled
    random_state=40,
    verbose = 1
)


with tqdm(total=20, desc="RandomizedSearchCV") as pbar:
    # Fit RandomizedSearchCV
    random_search.fit(X_training, y_training_augmented)
    pbar.update(20)  # Manually update the progress bar to its completion


# Get the best parameters
best_params = random_search.best_params_

# Print the best parameters
print(f'Best parameters: {best_params}')

RandomizedSearchCV:   0%|          | 0/20 [00:00<?, ?it/s]

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV: 100%|██████████| 20/20 [48:33<00:00, 145.67s/it]

Best parameters: {'alpha': 0.6222478315652001, 'colsample_bytree': 0.6372408359360183, 'gamma': 0.6122007109199271, 'lambda': 2.9620950021720374, 'learning_rate': 0.019192308548646068, 'max_depth': 5, 'n_estimators': 2595, 'scale_pos_weight': 2.090527525231064, 'subsample': 0.54100974496226}





In [None]:

#best_params = {'alpha': 0.32518332202674705, 'colsample_bytree': 0.864803089169032, 'gamma': 0.6375574713552131, 'lambda': 3.6616382277289796, 'learning_rate': 0.019444298503238984, 'max_depth': 5, 'n_estimators': 2942, 'scale_pos_weight': 8.13244787222995, 'subsample': 0.8803925243084487}

# Initialize the model with the best parameters
model = XGBClassifier(tree_method='hist', device = device, **best_params )

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X_training, y_training_augmented, cv=cv, scoring=scorer, n_jobs=-1)


# Print cross-validated F1-Score
print(f'Cross-validated F1-Score: {cv_scores.mean()}')


Cross-validated F1-Score: 0.7182213442118908


In [None]:
X_training

Unnamed: 0,Elaboration,Parallel,Alternation,Correction,Q-Elab,Continuation,Background,Contrast,Comment,Question-answer_pair,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0,0,0,0,0,1,0,0,0,0,...,0.015771,0.025599,0.030799,0.022237,-0.003727,-0.008371,0.031861,-0.053412,0.029330,0.033565
1,0,0,0,0,0,2,0,0,0,0,...,0.019480,0.004748,0.011612,0.027755,-0.023661,-0.021694,0.016016,-0.025288,0.036440,0.034575
2,0,0,0,0,0,1,0,0,0,0,...,0.022065,-0.002219,0.041842,0.034843,0.014424,-0.018342,0.050130,-0.022857,0.023356,0.013921
3,1,0,0,0,0,0,0,0,0,0,...,0.025940,-0.006279,0.003061,0.005558,-0.001654,-0.033632,0.045703,-0.029262,0.005266,0.008678
4,1,0,0,0,0,1,0,0,0,0,...,0.039824,0.004701,0.019440,0.008280,0.031489,0.000436,0.051897,-0.023048,0.030434,0.045774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79264,1,0,0,0,0,0,0,2,2,0,...,0.002473,-0.008396,0.040128,0.031994,0.001725,-0.015183,0.041526,-0.027731,0.040979,0.012512
79265,1,0,0,0,0,0,0,4,0,0,...,0.015440,-0.017728,0.032985,0.031826,0.044799,-0.043217,0.048364,-0.026522,0.049801,0.029518
79266,1,0,0,0,0,1,0,1,0,1,...,0.012885,0.004859,0.009011,0.010600,-0.009004,-0.000053,0.075668,-0.001160,0.045767,0.016954
79267,1,0,0,0,0,0,0,0,1,0,...,0.015229,-0.005967,0.038900,0.025195,0.014046,-0.036590,0.032905,-0.017802,0.024629,0.026444


In [None]:

# Fit the model to your training data
model.fit(X_training, y_training_augmented)

# Predict on your validation set
y_pred = model.predict(X_training)

# Evaluate the model using f1-score
score = f1_score(y_training_augmented, y_pred)

print(f'F1-Score: {score}')

In [None]:
# score on the original dataset

X_training_0 = e5.encode(word_training, show_progress_bar=True, normalize_embeddings=True)
df_X_training_0 = pd.DataFrame(X_training_0)

# Concatenate tab_train and X_training along the columns

X_training_0 = pd.concat([tab_train, df_X_training_0], axis=1)

# Predict on your validation set
y_pred_0 = model.predict(X_training_0)

# Evaluate the model using f1-score
score = f1_score(y_training, y_pred_0)

print(f'F1-Score: {score}')

Batches:   0%|          | 0/2270 [00:00<?, ?it/s]

F1-Score: 0.8858278699958154


In [None]:
tab_train.columns

Index(['Explanation', 'Comment', 'Elaboration', 'Question-answer_pair',
       'Background', 'Correction', 'Alternation', 'Q-Elab', 'Parallel',
       'Acknowledgement', 'Continuation', 'Contrast', 'Narration',
       'Conditional', 'Clarification_question', 'Result'],
      dtype='object')

In [None]:
tab_test = [defaultdict(int) for _ in range(100000)] # keeps the dataframe counting the relaions handled by each utterance
n= 0
graph_test = []
test_labels = {}

for transcription_id in test_set:
    X_test = []


    # Read the JSON file for the transcription
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        X_test.append(utterance["speaker"] + ": " + utterance["text"])
        n+=1

    # Read the text file for the discourse graph
    tab_test = [defaultdict(int) for _ in range(len(transcription))]
    graph_file_path_t = path_to_test / f"{transcription_id}.txt"
    edges = read_discourse_graph(graph_file_path_t)
    graph = create_graph(edges)
    graph_test.append(graph)
    for source , target , relation in edges:
        rel  =  relation['relation']
        tab_test[source][rel] +=1
        tab_test[target][rel] +=1

    X_test = e5.encode(X_test, show_progress_bar=True, normalize_embeddings=True)

    df_X_test = pd.DataFrame(X_test)

    matrice = []
    for dictionnaire in tab_test:
        ligne = [dictionnaire.get(cle, 0) for cle in ensemble_cles]
        matrice.append(ligne)

    # Creating the dataframe
    tab_test = pd.DataFrame(matrice, columns=ensemble_cles)

    # Concatenate tab_test and X_test along the columns
    X_testing = pd.concat([tab_test, df_X_test], axis=1)

    y_pred = model.predict(X_testing)
    test_labels[transcription_id] = y_pred.tolist()


with open("test_labels_text_submission25.json", "w") as file:
    json.dump(test_labels, file, indent=4)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/52 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/46 [00:00<?, ?it/s]

In [None]:
!pip install jsonargparse

Collecting jsonargparse
  Downloading jsonargparse-4.27.1-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.7/189.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonargparse
Successfully installed jsonargparse-4.27.1


In [None]:
"""
This script converts test_labels.json into submission.csv
python make_submission.py --json_path test_labels_naive_baseline.json
"""
import json
from pathlib import Path


def make_submission(json_path: Path = Path("test_labels_text_submission25.json")):
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    file = open("submission_with_augmentation_1.csv", "w")
    file.write("id,target_feature\n")
    for key, value in test_labels.items():
        u_id = [key + "_" + str(i) for i in range(len(value))]
        target = map(str, value)
        for row in zip(u_id, target):
            file.write(",".join(row))
            file.write("\n")
    file.close()

from jsonargparse import CLI

make_submission(Path("test_labels_text_submission25.json"))


# Essai avec graphSAGE


In [None]:
!pip install gensim karateclub


In [None]:
import networkx as nx
from gensim.models import Word2Vec
from karateclub import Graph2Vec

# Convert the NetworkX graphs to Karate Club graphs
karate_club_graphs = graph_training

# Create a Graph2Vec model
graph2vec = Graph2Vec(dimensions=128, workers=5 )

# Fit the model to the graphs
graph2vec.fit(karate_club_graphs )

# Get the embeddings for the graphs
embeddings = graph2vec.get_embedding()

# Print the embeddings
print(embeddings)


[[0.14302182 0.23317619 0.05371545 ... 0.02283959 0.08513803 0.0867016 ]
 [0.13697392 0.22593518 0.05524435 ... 0.01748195 0.10270845 0.07249808]
 [0.14688708 0.23179257 0.04766703 ... 0.00468148 0.10253199 0.09078921]
 ...
 [0.10584977 0.20510915 0.03467517 ... 0.01647012 0.05055625 0.05940546]
 [0.1328767  0.22810161 0.04147994 ... 0.01421704 0.08744928 0.07687352]
 [0.15119882 0.22695693 0.04040656 ... 0.0230514  0.08355169 0.07312427]]


In [None]:
import networkx as nx
from karateclub import GL2Vec

# Créer un objet GL2Vec
gl2vec = GL2Vec()

# Créer des embeddings pour chaque graphe dans graph_train
gl2vec.fit(graph_training)

embeddings =gl2vec.get_embedding()

# Print the embeddings
print(embeddings)


"""
# Créer un dictionnaire pour stocker les embeddings de chaque nœud
node_embeddings = {}

# Parcourir chaque graphe dans graph_train
for i, graph in enumerate(graph_train):
    # Obtenir les embeddings pour chaque nœud dans le graphe
    graph_embeddings = embeddings[i]
    # Parcourir chaque nœud dans le graphe
    for j, node in enumerate(graph.nodes()):
        # Obtenir l'embedding pour le nœud
        node_embedding = graph_embeddings[j]
        # Ajouter l'embedding au dictionnaire
        node_embeddings[node] = node_embedding

# Afficher les embeddings pour chaque nœud
for node, embedding in node_embeddings.items():
    print(f"Node {node}: {embedding}")
"""

[[ 0.21375628  0.23060526  0.15913025 ...  0.15594487  0.05980379
   0.19375704]
 [ 0.21305212  0.26441085  0.19979064 ...  0.11212713  0.10731754
   0.23530927]
 [ 0.1760897   0.25540245  0.25133795 ...  0.13790986  0.13346007
   0.26414394]
 ...
 [ 0.22077662  0.26507658  0.11908251 ...  0.14627613 -0.08107533
   0.14286284]
 [ 0.24453776  0.28491297  0.16851774 ...  0.18538831  0.07819755
   0.18487774]
 [ 0.2720212   0.21829014  0.16204007 ...  0.16181238  0.09926026
   0.15841421]]


'\n# Créer un dictionnaire pour stocker les embeddings de chaque nœud\nnode_embeddings = {}\n\n# Parcourir chaque graphe dans graph_train\nfor i, graph in enumerate(graph_train):\n    # Obtenir les embeddings pour chaque nœud dans le graphe\n    graph_embeddings = embeddings[i]\n    # Parcourir chaque nœud dans le graphe\n    for j, node in enumerate(graph.nodes()):\n        # Obtenir l\'embedding pour le nœud\n        node_embedding = graph_embeddings[j]\n        # Ajouter l\'embedding au dictionnaire\n        node_embeddings[node] = node_embedding\n\n# Afficher les embeddings pour chaque nœud\nfor node, embedding in node_embeddings.items():\n    print(f"Node {node}: {embedding}")\n'

In [None]:
pd.DataFrame(embeddings)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.213756,0.230605,0.159130,-0.226703,-0.259433,-0.088449,-0.073286,0.080588,-0.217814,-0.221701,...,0.112392,-0.203193,0.077168,0.123130,0.083104,0.241098,-0.054541,0.155945,0.059804,0.193757
1,0.213052,0.264411,0.199791,-0.142934,-0.101366,-0.158082,-0.168292,0.060441,-0.285126,-0.182693,...,0.090881,-0.161179,0.064663,0.129547,0.009256,0.304927,-0.033697,0.112127,0.107318,0.235309
2,0.176090,0.255402,0.251338,-0.102655,-0.055384,-0.178326,-0.219643,0.120345,-0.227018,-0.154460,...,0.063727,-0.105069,0.026108,0.084420,-0.024115,0.321727,0.051382,0.137910,0.133460,0.264144
3,0.255723,0.224499,0.267467,-0.157366,-0.090601,-0.175989,-0.191268,0.038997,-0.323348,-0.181523,...,0.128443,-0.175040,-0.020539,0.115214,-0.107678,0.232908,0.060072,0.165275,0.074861,0.262110
4,0.254283,0.227240,0.176377,-0.154012,-0.163636,-0.143654,-0.126307,0.079433,-0.220961,-0.225109,...,0.137663,-0.165847,0.030052,0.066917,-0.013609,0.263746,-0.003005,0.152781,0.104597,0.192510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.196309,0.240250,0.199962,-0.139599,-0.220733,-0.165627,-0.075136,0.096338,-0.160023,-0.208723,...,0.085781,-0.135961,0.010646,0.081788,0.008143,0.168918,-0.046100,0.163653,0.055291,0.218981
93,0.243595,0.226882,0.219937,-0.151090,-0.100330,-0.174409,-0.196778,0.083500,-0.262903,-0.157149,...,0.085220,-0.132897,0.023546,0.060815,-0.085026,0.294273,0.050909,0.175694,0.166827,0.243097
94,0.220777,0.265077,0.119083,-0.298973,-0.331525,-0.062964,0.030269,0.115538,-0.111926,-0.257427,...,0.160688,-0.174388,0.085477,0.114908,0.146299,0.126214,-0.104139,0.146276,-0.081075,0.142863
95,0.244538,0.284913,0.168518,-0.230000,-0.243391,-0.120731,-0.078983,0.125872,-0.211081,-0.212432,...,0.144645,-0.119183,0.054064,0.086075,0.056922,0.219524,0.003488,0.185388,0.078198,0.184878


In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


# Code précédent

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
import networkx as nx

class GraphSAGEEmbedder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GraphSAGEEmbedder, self).__init__()
        self.conv1 = SAGEConv(input_size, hidden_size, normalize=True)
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.relu(self.fc1(x))
        return x

# Assuming you have a list of graphs in graph_train
graph_embeddings_list = []

# Create the GraphSAGEEmbedder model
input_size = 1  # Adjust as needed based on the characteristics of your graphs
hidden_size = 64  # Adjust as needed
output_size = 32  # Adjust as needed
graphSAGE_embedder = GraphSAGEEmbedder(input_size, hidden_size, output_size)

# Set the model to evaluation mode (no gradient computation)
graphSAGE_embedder.eval()

# Iterate through the list of NetworkX graphs
for graph in graph_training:
    # Convert NetworkX graph to PyTorch Geometric Data object
    edge_index = torch.tensor(list(graph.edges())).transpose(0, 1)
    edge_attr = torch.tensor([graph[edge[0]][edge[1]]['relation'] for edge in graph.edges() ], dtype=torch.float32).view(-1,1)
    x = torch.ones((len(graph.nodes()), 1), dtype=torch.float32)  # Dummy node feature
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

    # Get the embeddings for the current graph
    with torch.no_grad():
        embeddings = graphSAGE_embedder(data)

    graph_embeddings_list.append(embeddings)

# Now, graph_embeddings_list contains the embeddings for each graph in graph_train, considering edge labels


ValueError: ignored

In [None]:
graph_training[0].edges()

OutEdgeView([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (5, 9), (6, 7), (7, 8), (9, 10), (9, 11), (9, 12), (12, 13), (13, 14), (13, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (26, 28), (26, 29), (29, 30), (29, 31), (30, 32), (32, 33), (33, 34), (33, 35), (35, 36), (36, 37), (37, 38), (38, 39), (38, 40), (38, 42), (40, 41), (41, 44), (42, 43), (44, 45), (45, 46), (46, 47), (46, 48), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (52, 58), (53, 54), (53, 55), (53, 56), (56, 57), (58, 59), (59, 60), (60, 61), (60, 62), (62, 63), (63, 64), (63, 66), (64, 65), (66, 67), (67, 68), (68, 69), (68, 70), (70, 71), (70, 72), (72, 73), (72, 77), (73, 74), (74, 75), (75, 76), (76, 78), (78, 79), (78, 80), (78, 85), (80, 81), (81, 82), (82, 83), (83, 84), (85, 86), (85, 88), (86, 87), (87, 89), (88, 90), (90, 91), (91, 92), (91, 93), (93, 94), (94, 95), (94, 96), (95, 97), (96, 98), (97, 99), (99, 100), (100, 1

In [None]:
import json
from pathlib import Path

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training")
path_to_test = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/test")

#####
# training and test sets of transcription ids
#####
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

#####
# naive_baseline: all utterances are predicted important (label 1)
#####
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    test_labels[transcription_id] = [1] * len(transcription)

with open("test_labels_naive_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

## Text embedding

In [None]:
y_training = []
with open("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training_labels.json", "r") as file:
    training_labels = json.load(file)
word_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        word_training.append(utterance["speaker"] + ": " + utterance["text"])

    y_training += training_labels[transcription_id]

#word_training = e5.encode(word_training, show_progress_bar=True, normalize_embeddings=True)
#word_training.shape

# RNN

In [None]:
!pip install datasets transformers



In [None]:
import time
time_start = time.time()

In [None]:
from datasets import load_dataset
import torch.utils.data as data
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict



# Split the data into train, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(word_training, y_training, test_size=0.1, stratify=y_training, random_state=1)

# Create separate datasets for train, validation, and test
train_data = Dataset.from_dict({"text": X_train, "label": y_train})
val_data = Dataset.from_dict({"text": X_val, "label": y_val})
#test_data = Dataset.from_dict({"text": X_test, "label": y_test})


# Print the sizes of the three sets
print("Train set size:", len(train_data))
print("Validation set size:", len(val_data))
#print("Test set size:", len(test_data))


Train set size: 65360
Validation set size: 7263


In [None]:
print(train_data[0])

{'text': 'ID: yeah , yeah', 'label': 0}


In [None]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small-v2')

def collote_fn(batch_samples):
  batch_text = []
  batch_label = []
  for sample in batch_samples:
    batch_text.append(sample['text'])
    batch_label.append(int(sample['label']))
  X = tokenizer(
      batch_text,
      padding=True,
      truncation=True,
      return_tensors='pt'
  )
  y = torch.tensor(batch_label)
  return X,y

train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=collote_fn)
val_dataloader = DataLoader(val_data, batch_size=128, shuffle=True, collate_fn=collote_fn)
#test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True, collate_fn=collote_fn)

In [None]:
from torch import nn

class LSTM(nn.Module):
  def __init__(self, hidden_size, output_size):
    super(LSTM, self).__init__()
    self.bert_encoder = AutoModel.from_pretrained("intfloat/e5-small-v2")
    self.hidden_size = hidden_size
    self.dropout = nn.Dropout(0.3)
    self.fc = nn.Linear(self.bert_encoder.config.hidden_size, 2)
  def forward(self, x):
    bert_outputs = self.bert_encoder(**x)
    outputs = bert_outputs[1]
    outputs = self.dropout(outputs)
    out = self.fc(outputs)
    return out

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM(256,2).to(device)
print(model)

LSTM(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affi

In [None]:
from tqdm.auto import tqdm

def train(dataloader, model, loss_fun, optimizer, epoch, total_loss):
  process_bar = tqdm(range(len(dataloader)))
  process_bar.set_description(f'loss: {0:>7f}')
  finish_batch_num = (epoch-1)*len(dataloader)

  model.train()
  for batch, (X,y) in enumerate(dataloader, start=1):
    X,y = X.to(device),y.to(device)
    pred = model(X)
    loss = loss_fun(pred,y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    process_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
    process_bar.update(1)
  return total_loss


def test(dataloader, model, mode='Test'):
    assert mode in ['Valid', 'Test']
    target_num = torch.zeros((1, 2)) # n_classes
    predict_num = torch.zeros((1, 2))
    acc_num = torch.zeros((1, 2))
    total_val_loss = 0
    avg_val_loss = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = loss_fun(outputs,y)
            total_val_loss += loss.item()
            pred = outputs.argmax(1)
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            pre_mask = torch.zeros(outputs.size()).scatter_(1, pred.cpu().view(-1, 1), 1.)
            predict_num += pre_mask.sum(0)
            tar_mask = torch.zeros(outputs.size()).scatter_(1, y.data.cpu().view(-1, 1), 1.)
            target_num += tar_mask.sum(0)
            acc_mask = pre_mask * tar_mask
            acc_num += acc_mask.sum(0)
        # calculate accuracy
        avg_val_loss = total_val_loss / len(dataloader)
        recall = torch.nan_to_num(acc_num / target_num, nan=0.0)
        precision = torch.nan_to_num(acc_num / predict_num, nan=0.0)
        F1 = torch.nan_to_num(2 * recall * precision / (recall + precision), nan=0.0)
        accuracy = 100. * acc_num.sum(1) / target_num.sum(1)

        avg_precision = torch.mean(precision)
        avg_recall = torch.mean(recall)
        avg_f1 = torch.mean(F1)

        print('{},loss{}, Acc {}, recal {}, precision {}, F1-score {}'.format(mode, avg_val_loss,accuracy.tolist(), avg_recall.tolist(), avg_precision.tolist(), avg_f1.tolist()))
    return avg_val_loss,accuracy,avg_precision,avg_recall,avg_f1

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Calculez les poids inverses de la fréquence des classe
class_weights = compute_class_weight('balanced', classes=np.unique(y_training), y=y_training)
class_weights

array([0.61201564, 2.73183118])

In [None]:
0.61201564 / (2.73183118+0.61201564)

0.183027415113471

In [None]:
learn_rate = 1e-5
num_epochs = 10
from transformers import AdamW
weights = torch.FloatTensor([0.61201564, 2.73183118]).to(device)
loss_fun = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learn_rate)

total_loss = 0
best_f1 = 0
best_loss = 15

import csv
with open('result.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Epoch','vail_loss','vail_accuracy','vail_precision', 'vail_recall', 'vail_F1-score'])

    for t in range(num_epochs):
      total_loss = train(train_dataloader, model, loss_fun, optimizer, t+1, total_loss)
      avg_val_loss,vaild_acc,vaild_pre,vaild_recall,vaild_f1 = test(val_dataloader,model,mode='Valid')
      writer.writerow([t+1,avg_val_loss,vaild_acc,vaild_pre, vaild_recall, vaild_f1])
      if vaild_f1 > best_f1:
        best_f1 = vaild_f1
        torch.save(model.state_dict(), 'best_model2.pt')
      if best_loss > avg_val_loss:
        best_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model3.pt')



  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.3159865635006051, Acc [84.70329284667969], recal 0.6955899596214294, precision 0.7471155524253845, F1-score 0.7155462503433228


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.3118456675295244, Acc [85.06127166748047], recal 0.7001164555549622, precision 0.7549918293952942, F1-score 0.7212315797805786


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.31374610620632504, Acc [84.71705627441406], recal 0.7102723717689514, precision 0.7455253005027771, F1-score 0.7251794338226318


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.31585043747174113, Acc [84.2764663696289], recal 0.7283053398132324, precision 0.736706554889679, F1-score 0.7323449850082397


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.32455036410114224, Acc [84.15255737304688], recal 0.7114890813827515, precision 0.7342847585678101, F1-score 0.721680760383606


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.3437617047314058, Acc [84.30400848388672], recal 0.7030731439590454, precision 0.7374759912490845, F1-score 0.7175654172897339


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.36477580117551905, Acc [82.74817657470703], recal 0.7536960244178772, precision 0.7190502285957336, F1-score 0.7333043217658997


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.3931727344006823, Acc [82.19744110107422], recal 0.7427346110343933, precision 0.7107038497924805, F1-score 0.7239514589309692


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.3780801246563594, Acc [82.84455108642578], recal 0.7125352621078491, precision 0.7130331993103027, F1-score 0.7127835750579834


  0%|          | 0/511 [00:00<?, ?it/s]

Valid,loss0.42173624770683155, Acc [83.56050872802734], recal 0.7043623924255371, precision 0.7234626412391663, F1-score 0.7130100131034851


In [None]:
model.load_state_dict(torch.load('best_model3.pt'))
#avg_test_loss,test_acc,test_pre,test_recall,test_f1 = test(test_dataloader,model,mode='Test')

<All keys matched successfully>

## Model Classifier  

## Test

In [None]:
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    X_test = []
    for utterance in transcription:
        X_test.append(utterance["speaker"] + ": " + utterance["text"])
        # Tokenize the text data
    X_test_encoded = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt")

    # Move the encoded text data to the device
    X_test_encoded = {key: val.to(device) for key, val in X_test_encoded.items()}

    # Use the model to get predictions
    with torch.no_grad():
        model.eval()
        outputs = model(X_test_encoded)

    # Get the predicted labels
    predicted_labels = outputs.argmax(1)

    # Convert the tensor of predicted labels to a list
    predicted_labels = predicted_labels.tolist()

    test_labels[transcription_id] = predicted_labels

with open("test_labels_text_submission3.json", "w") as file:
    json.dump(test_labels, file, indent=4)

## Evaluation

In [None]:
len(test_labels)

40

## Submission

In [None]:
!pip install jsonargparse

Collecting jsonargparse
  Downloading jsonargparse-4.27.1-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.7/189.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonargparse
Successfully installed jsonargparse-4.27.1


In [None]:
"""
This script converts test_labels.json into submission.csv
python make_submission.py --json_path test_labels_naive_baseline.json
"""
import json
from pathlib import Path


def make_submission(json_path: Path = Path("test_labels_text_submission3.json")):
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    file = open("submission_17_2.csv", "w")
    file.write("id,target_feature\n")
    for key, value in test_labels.items():
        u_id = [key + "_" + str(i) for i in range(len(value))]
        target = map(str, value)
        for row in zip(u_id, target):
            file.write(",".join(row))
            file.write("\n")
    file.close()

from jsonargparse import CLI

make_submission(Path("test_labels_text_submission3.json"))

In [None]:
best_params = {'alpha': 0.32518332202674705, 'colsample_bytree': 0.864803089169032, 'gamma': 0.6375574713552131, 'lambda': 3.6616382277289796, 'learning_rate': 0.019444298503238984, 'max_depth': 5, 'n_estimators': 2942, 'scale_pos_weight': 8.13244787222995, 'subsample': 0.8803925243084487}


In [None]:
best_params

{'alpha': 0.32518332202674705,
 'colsample_bytree': 0.864803089169032,
 'gamma': 0.6375574713552131,
 'lambda': 3.6616382277289796,
 'learning_rate': 0.019444298503238984,
 'max_depth': 5,
 'n_estimators': 2942,
 'scale_pos_weight': 8.13244787222995,
 'subsample': 0.8803925243084487}