# Data Challenge INF 554

## Preprocessing

In [None]:

import json
from pathlib import Path

def flatten(list_of_list):
    """Concatenate the items of every sub-iterable into one flat list.

    Only one level of nesting is removed and the original order is kept.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Folders holding one JSON transcription per meeting part.
path_to_training = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training")
path_to_test = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/test")

#####
# training and test sets of transcription ids
#####
# Each meeting id below is expanded into its four parts by appending the
# suffixes 'a', 'b', 'c' and 'd'.
training_set = [
    'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
    'ES2012', 'ES2013', 'ES2015', 'ES2016',
    'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006',
    'IS1007',
    'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
]
training_set = flatten([[meeting + part for part in 'abcd'] for meeting in training_set])
# These three parts are dropped from the training ids.
for _excluded_id in ('IS1002a', 'IS1005d', 'TS3012c'):
    training_set.remove(_excluded_id)

test_set = [
    'ES2003', 'ES2004', 'ES2011', 'ES2014',
    'IS1008', 'IS1009',
    'TS3003', 'TS3004', 'TS3006', 'TS3007',
]
test_set = flatten([[meeting + part for part in 'abcd'] for meeting in test_set])


## Data extraction and data augmentation

In [None]:

# Labels of every training utterance, concatenated transcription by transcription.
y_training = []
# Ground-truth labels loaded from JSON, indexed by transcription id.
with open("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training_labels.json", "r") as file:
    training_labels = json.load(file)
# One "<speaker>: <text>" string per training utterance, in the same order as y_training.
word_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    word_training.extend(
        utterance["speaker"] + ": " + utterance["text"] for utterance in transcription
    )
    y_training.extend(training_labels[transcription_id])


In [None]:

# Data augmentation: merge positive utterances two by two.
# The original samples are copied, then one extra positive sample is appended
# for each pair made of a 1-labelled utterance and the next utterance whose
# label is not 0 (0-labelled utterances in between are skipped).
# NOTE(review): word_training is a flat list over all training transcriptions,
# so a pair can join utterances from two different transcriptions - confirm
# this is intended.
word_training_augmented = list(word_training)
y_training_augmented = list(y_training)

i = 0
while i < len(y_training) - 1:
    if y_training[i] == 1:
        j = i+1
        # Advance j past the 0-labelled utterances that follow position i.
        while j < len(y_training) and y_training[j] == 0:
            j+=1
        if j < len(y_training):
            # A partner was found before the end of the data: append the merged
            # text as a new sample labelled 1.
            new_utterance = word_training[i] + ' ' + word_training[j]
            word_training_augmented.append(new_utterance)
            y_training_augmented.append(1)

        # Resume after the partner, so an utterance is used in at most one pair.
        i = j
    i+=1

# BERT + LSTM

In [None]:
!pip install datasets transformers



In [None]:
from datasets import load_dataset
import torch.utils.data as data
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict



# Hold out 5% of the augmented samples for validation. The split is stratified
# on the labels and seeded so that it is reproducible.
# NOTE(review): the merged samples are built before this split, so a merged
# sample and the utterances it was built from can end up on different sides
# of the split - confirm this overlap is acceptable.
X_train, X_val, y_train, y_val = train_test_split(
    word_training_augmented,
    y_training_augmented,
    test_size=0.05,
    stratify=y_training_augmented,
    random_state=42,
)

# Wrap each split in a Dataset with a "text" and a "label" column.
train_data = Dataset.from_dict({"text": X_train, "label": y_train})
val_data = Dataset.from_dict({"text": X_val, "label": y_val})
#test_data = Dataset.from_dict({"text": X_test, "label": y_test})


# Print the sizes of the two splits
print("Train set size:", len(train_data))
print("Validation set size:", len(val_data))
#print("Test set size:", len(test_data))


Train set size: 75305
Validation set size: 3964


In [None]:
print(train_data[0])

{'text': 'PM: Right .', 'label': 0}


## Text embedding using the BERT tokenizer through a HuggingFace checkpoint

In [None]:
# Tokenizer of the 'intfloat/e5-base-v2' checkpoint. The encoder inside
# Bert_lstm is loaded from the same checkpoint name, so change both together.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')

def collote_fn(batch_samples):
    """Collate a list of samples into one tokenized batch and a label tensor.

    Args:
        batch_samples: Samples exposing a 'text' and a 'label' entry.

    Returns:
        A pair ``(X, y)``: ``X`` is the output of the module-level
        ``tokenizer`` called on the batch texts (padding and truncation
        enabled, PyTorch tensors), ``y`` is a tensor of the integer labels.
    """
    texts = [sample['text'] for sample in batch_samples]
    labels = [int(sample['label']) for sample in batch_samples]
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    return encoded, torch.tensor(labels)

# Training batches are reshuffled at every epoch.
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collote_fn)
# Validation batches keep a fixed order: the validation loss is averaged over
# batches and drives early stopping, so shuffling would make it vary between
# two evaluations of the same model.
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False, collate_fn=collote_fn)
#test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True, collate_fn=collote_fn)

## Model architecture design

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from torch import nn

class Bert_lstm(nn.Module):
    """Utterance classifier: pretrained encoder -> LSTM -> linear layer.

    Args:
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of logits produced for each input sequence.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.bert_encoder = AutoModel.from_pretrained("intfloat/e5-base-v2")
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(self.bert_encoder.config.hidden_size, self.hidden_size, batch_first=True)
        # Honour output_size instead of a hard-coded 2 (existing callers pass 2).
        self.fc = nn.Linear(self.hidden_size, output_size)

    def forward(self, x):
        """Return one row of logits per input sequence.

        Args:
            x: Mapping of tokenizer outputs; it is unpacked as keyword
                arguments into the encoder. When it contains an
                'attention_mask', that mask is used to locate the last real
                token of each sequence.

        Returns:
            Tensor of shape (batch, output_size).
        """
        bert_outputs = self.bert_encoder(**x)
        last_hidden_states = bert_outputs.last_hidden_state
        outputs, _ = self.lstm(last_hidden_states)

        attention_mask = x["attention_mask"] if "attention_mask" in x else None
        if attention_mask is None:
            # No padding information available: use the last time step.
            final_states = outputs[:, -1, :]
        else:
            # In a padded batch the last time step of a short sequence is a
            # padding position, so read the LSTM output at the last position
            # whose mask is 1 instead. Weighting the mask by 1-based positions
            # makes argmax return that position.
            positions = torch.arange(1, outputs.size(1) + 1, device=attention_mask.device)
            last_index = (attention_mask.long() * positions).argmax(dim=1)
            batch_index = torch.arange(outputs.size(0), device=outputs.device)
            final_states = outputs[batch_index, last_index.to(outputs.device)]

        return self.fc(final_states)


# Build the classifier (LSTM hidden size 256, 2 outputs), move it to the
# selected device, then display its structure.
model = Bert_lstm(hidden_size=256, output_size=2)
model = model.to(device)
print(model)

LSTM(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

## Train and evaluate

In [None]:
from tqdm.auto import tqdm

def train(dataloader, model, loss_fun, optimizer, epoch, total_loss):
    """Run one training epoch and return the updated cumulative loss.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; both are moved to ``device``.
        model: Module called as ``model(X)``.
        loss_fun: Callable ``loss_fun(pred, y)`` returning the batch loss.
        optimizer: Optimizer stepped once per batch.
        epoch: 1-based epoch number, used to count the batches already seen.
        total_loss: Sum of the batch losses accumulated before this epoch.

    Returns:
        ``total_loss`` increased by the loss of every batch of this epoch.
    """
    n_batches = len(dataloader)
    bar = tqdm(range(n_batches))
    bar.set_description(f'loss: {0:>7f}')
    # Number of batches processed in the previous epochs.
    batches_before = (epoch - 1) * n_batches

    model.train()
    for step, (inputs, targets) in enumerate(dataloader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_loss = loss_fun(model(inputs), targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        # The bar shows the mean loss over all batches since the first epoch.
        running_mean = total_loss / (batches_before + step)
        bar.set_description(f'loss: {running_mean:>7f}')
        bar.update(1)
    return total_loss


def test(dataloader, model, mode='Test', loss_fun=None):
    """Evaluate ``model`` on ``dataloader``; print and return loss and metrics.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; both are moved to ``device``.
        model: Module called as ``model(X)`` and returning per-class scores
            of shape (batch, n_classes).
        mode: 'Valid' or 'Test'; only used as the prefix of the printed line.
        loss_fun: Callable ``loss_fun(outputs, y)``. When omitted, the
            module-level ``loss_fun`` is used, which is what this function
            relied on before the parameter existed.

    Returns:
        Tuple ``(avg_val_loss, accuracy, avg_precision, avg_recall, avg_f1)``:
        the loss averaged over batches, the accuracy in percent (tensor of
        shape (1,)), and precision / recall / F1 averaged over the classes.

    Raises:
        NameError: If no ``loss_fun`` is passed and none is defined globally.
    """
    assert mode in ['Valid', 'Test']
    if loss_fun is None:
        # Backward compatibility with callers that rely on the global loss.
        if 'loss_fun' not in globals():
            raise NameError("name 'loss_fun' is not defined")
        loss_fun = globals()['loss_fun']

    # Per-class counters, allocated on the first batch so that their width
    # follows the number of classes the model actually outputs.
    target_num = None   # samples whose true label is the class
    predict_num = None  # samples predicted as the class
    acc_num = None      # samples correctly predicted as the class
    total_val_loss = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            total_val_loss += loss_fun(outputs, y).item()
            pred = outputs.argmax(1)

            if target_num is None:
                n_classes = outputs.size(1)
                target_num = torch.zeros((1, n_classes))
                predict_num = torch.zeros((1, n_classes))
                acc_num = torch.zeros((1, n_classes))

            # One-hot encode predictions and targets; their product marks the hits.
            pre_mask = torch.zeros(outputs.size()).scatter_(1, pred.cpu().view(-1, 1), 1.)
            predict_num += pre_mask.sum(0)
            tar_mask = torch.zeros(outputs.size()).scatter_(1, y.data.cpu().view(-1, 1), 1.)
            target_num += tar_mask.sum(0)
            acc_mask = pre_mask * tar_mask
            acc_num += acc_mask.sum(0)

        avg_val_loss = total_val_loss / len(dataloader)
        # A class that is never present / never predicted gives 0/0; report 0.
        recall = torch.nan_to_num(acc_num / target_num, nan=0.0)
        precision = torch.nan_to_num(acc_num / predict_num, nan=0.0)
        F1 = torch.nan_to_num(2 * recall * precision / (recall + precision), nan=0.0)
        accuracy = 100. * acc_num.sum(1) / target_num.sum(1)

        avg_precision = torch.mean(precision)
        avg_recall = torch.mean(recall)
        avg_f1 = torch.mean(F1)

        print('{},loss{}, Acc {}, recal {}, precision {}, F1-score {}'.format(mode, avg_val_loss,accuracy.tolist(), avg_recall.tolist(), avg_precision.tolist(), avg_f1.tolist()))
    return avg_val_loss,accuracy,avg_precision,avg_recall,avg_f1

## Class weights

In [None]:

from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Compute class weights inversely proportional to the class frequencies.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_training_augmented),
    y=y_training_augmented,
)
class_weights

array([0.66802346, 1.98788745])

## Define Early stopping strategy

In [None]:
import torch
from torch import nn

class EarlyStopping:
    """Track the validation loss, checkpoint the best model, flag when to stop.

    Call the instance once per epoch with the validation loss and the model.
    The score is the negated loss. A call counts as an improvement unless its
    score is below ``best_score + delta``; on improvement the model state
    dict is saved to ``path``. After ``patience`` consecutive calls without
    improvement, ``early_stop`` becomes True.
    """

    def __init__(self, patience=5, delta=0, verbose=False, path='checkpoint.pt'):
        # Configuration.
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.path = path
        # Running state.
        self.counter = 0                   # calls since the last improvement
        self.best_score = None             # best negated loss seen so far
        self.early_stop = False            # set once patience is exhausted
        self.val_loss_min = float('inf')   # loss of the last saved checkpoint

    def __call__(self, val_loss, model):
        score = -val_loss

        # First call: nothing to compare with, just record and save.
        if self.best_score is None:
            self._record_best(score, val_loss, model)
            return

        # No improvement: count it and stop once patience is exhausted.
        if score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # Improvement: record, save, and restart the patience counter.
        self._record_best(score, val_loss, model)
        self.counter = 0

    def _record_best(self, score, val_loss, model):
        """Store ``score`` as the new best and checkpoint the model."""
        self.best_score = score
        self.save_checkpoint(val_loss, model)

    def save_checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


## Training

In [None]:

import csv

learn_rate = 1e-5
num_epochs = 10

# Per-class loss weights. NOTE(review): these are the values displayed for
# `class_weights` in the "Class weights" cell, pasted by hand - they must be
# updated if the training data changes.
weights = torch.FloatTensor([0.66802346, 1.98788745]).to(device)
loss_fun = nn.CrossEntropyLoss(weight = weights)
optimizer = optim.AdamW(model.parameters(), lr=learn_rate,weight_decay=1.2)

total_loss = 0
best_f1 = 0
best_loss = 0.7
early_stopping = EarlyStopping(patience=2, verbose=True)
with open('result.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Epoch','vail_loss','vail_accuracy','vail_precision', 'vail_recall', 'vail_F1-score'])

    for t in range(num_epochs):
        total_loss = train(train_dataloader, model, loss_fun, optimizer, t+1, total_loss)
        avg_val_loss, vaild_acc, vaild_pre, vaild_recall, vaild_f1 = test(
            val_dataloader, model, mode='Valid'
        )
        # test() returns tensors for the metrics; write plain numbers so the
        # CSV holds values such as 79.76 rather than "tensor([79.76])".
        writer.writerow([
            t+1,
            avg_val_loss,
            vaild_acc.item(),
            vaild_pre.item(),
            vaild_recall.item(),
            vaild_f1.item(),
        ])
        # Saves a checkpoint on improvement, counts the epoch otherwise.
        early_stopping(avg_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

  0%|          | 0/1177 [00:00<?, ?it/s]

Valid,loss0.3424157048425367, Acc [79.76791381835938], recal 0.8355439901351929, precision 0.7612415552139282, F1-score 0.7714002132415771
Validation loss decreased (inf --> 0.342416).  Saving model ...


  0%|          | 0/1177 [00:00<?, ?it/s]

Valid,loss0.3384060626549105, Acc [80.87789916992188], recal 0.8392961025238037, precision 0.767810046672821, F1-score 0.7810805439949036
Validation loss decreased (0.342416 --> 0.338406).  Saving model ...


  0%|          | 0/1177 [00:00<?, ?it/s]

Valid,loss0.3378065843735972, Acc [81.58425903320312], recal 0.8353570699691772, precision 0.770062267780304, F1-score 0.7855423092842102
Validation loss decreased (0.338406 --> 0.337807).  Saving model ...


  0%|          | 0/1177 [00:00<?, ?it/s]

Valid,loss0.4233021147308811, Acc [84.3592300415039], recal 0.8249247074127197, precision 0.790749192237854, F1-score 0.8044216632843018
EarlyStopping counter: 1 out of 2


  0%|          | 0/1177 [00:00<?, ?it/s]

Valid,loss0.4691635002532313, Acc [81.71039581298828], recal 0.807563066482544, precision 0.7627055644989014, F1-score 0.7778096795082092
EarlyStopping counter: 2 out of 2
Early stopping


## Load best model checkpoint from the early stopping strategy

In [None]:
# Restore the weights saved by EarlyStopping. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU runtime
# can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('checkpoint.pt', map_location=device))
#avg_test_loss,test_acc,test_pre,test_recall,test_f1 = test(test_dataloader,model,mode='Test')

<All keys matched successfully>

## Classification on the Test set

In [None]:
# Predicted label list for every test transcription id.
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    # Same "<speaker>: <text>" format as the training utterances.
    X_test = [
        utterance["speaker"] + ": " + utterance["text"] for utterance in transcription
    ]

    # Tokenize the whole transcription as one batch and move it to the device.
    X_test_encoded = {
        key: val.to(device)
        for key, val in tokenizer(
            X_test, padding=True, truncation=True, return_tensors="pt"
        ).items()
    }

    # Forward pass without gradient tracking, model in evaluation mode.
    with torch.no_grad():
        model.eval()
        outputs = model(X_test_encoded)

    # Highest-scoring class per utterance, as a plain list for JSON export.
    predicted_labels = outputs.argmax(1).tolist()
    test_labels[transcription_id] = predicted_labels

with open("test_labels_text_submission3.json", "w") as file:
    json.dump(test_labels, file, indent=4)

In [None]:
# Number of test transcriptions that received a list of predictions.
len(test_labels)

## Submission

In [None]:
!pip install jsonargparse



In [None]:
"""
This script converts test_labels.json into submission.csv
python make_submission.py --json_path test_labels_naive_baseline.json
"""
import json
from pathlib import Path


def make_submission(
    json_path: Path = Path("test_labels_text_submission3.json"),
    output_path: Path = Path("submission_double_size_1-2v2.csv"),
) -> None:
    """Convert a JSON file of predicted labels into a submission CSV.

    The JSON file maps each key to a list of labels. The CSV starts with the
    header ``id,target_feature`` and then holds one row per label, whose id is
    ``<key>_<position of the label in its list>``.

    Args:
        json_path: JSON file to read.
        output_path: CSV file to write; it is overwritten if it exists.
    """
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    # The context manager closes the CSV even if writing a row fails.
    with open(output_path, "w") as out:
        out.write("id,target_feature\n")
        for key, value in test_labels.items():
            out.writelines(f"{key}_{i},{label}\n" for i, label in enumerate(value))

from jsonargparse import CLI

# Build the submission CSV from the predictions file written earlier in this
# notebook. The function is called directly; the imported jsonargparse `CLI`
# is not used in the visible code.
make_submission(Path("test_labels_text_submission3.json"))