<a href="https://colab.research.google.com/github/AirNicco8/NLP_Unibo_Project_Work/blob/main/NLI_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Utils

In [1]:
!pip install datasets
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 6.6 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 53.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 55.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 44.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 47.0 MB/s 
Installing collected 

# Init

In [2]:
import math
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random

import pandas as pd
import numpy as np

import torch

from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [4]:
def set_seed(seed):
  """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.

  Args:
    seed: integer passed unchanged to each library's seeding function.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# Configure root logging so progress/debug messages are printed with a
# timestamp; records at INFO level and above go through sentence_transformers'
# LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Data Preparation

### Download Dataset

In [5]:
!wget 'https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip'
!unzip './XNLI-1.0.zip'

--2022-10-23 14:37:15--  https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17865352 (17M) [application/zip]
Saving to: ‘XNLI-1.0.zip’


2022-10-23 14:37:17 (14.0 MB/s) - ‘XNLI-1.0.zip’ saved [17865352/17865352]

Archive:  ./XNLI-1.0.zip
   creating: XNLI-1.0/
  inflating: XNLI-1.0/.DS_Store      
   creating: __MACOSX/
   creating: __MACOSX/XNLI-1.0/
  inflating: __MACOSX/XNLI-1.0/._.DS_Store  
  inflating: XNLI-1.0/xnli.dev.tsv   
  inflating: __MACOSX/XNLI-1.0/._xnli.dev.tsv  
  inflating: XNLI-1.0/xnli.dev.jsonl  
  inflating: XNLI-1.0/README.md      
  inflating: __MACOSX/XNLI-1.0/._README.md  
  inflating: XNLI-1.0/xnli.test.jsonl  
  inflating: XNLI-1.0/xnli.test.tsv  
  inflating: __MACOSX/XNLI-1.0/._xnli.test.tsv  
  inflating: _

### Data Pre Processing

In [6]:
# Relative path: matches the directory the archive was unzipped into ('./').
XNLI_TEST_PATH = './XNLI-1.0/xnli.test.tsv'

# Column names used by the following cells. They are spelled out explicitly
# rather than derived from the positional order of the remaining columns.
label_col = 'gold_label'
sen1_col = 'sentence1'
sen2_col = 'sentence2'

xnli_raw = pd.read_table(XNLI_TEST_PATH, header=0)

# Keep only the German rows and the three columns we need. `.copy()` gives an
# independent frame so later column assignments do not act on a view.
df = xnli_raw.loc[xnli_raw['language'] == 'de', [label_col, sen1_col, sen2_col]].copy()

# Map each distinct label string to an integer id (np.unique returns the
# labels sorted, so the mapping is deterministic).
labels = np.unique(df[label_col])
class_dict = {label: idx for idx, label in enumerate(labels)}

print(class_dict)

{'contradiction': 0, 'entailment': 1, 'neutral': 2}


In [7]:
# Replace the label strings with their integer class ids.
# Guarded so the cell is idempotent: mapping the already-converted integers
# through class_dict a second time would turn every label into NaN.
if not pd.api.types.is_integer_dtype(df['gold_label']):
  df['gold_label'] = df['gold_label'].map(class_dict)

# A NaN here means a label was missing from class_dict (or the column was
# corrupted by an earlier double mapping).
assert df['gold_label'].notna().all(), "gold_label contains unmapped values"
df

Unnamed: 0,gold_label,sentence1,sentence2
10020,0,"Nun, daran dachte ich nicht einmal, aber ich w...",Ich habe nicht wieder mit ihm gesprochen.
10021,1,"Nun, daran dachte ich nicht einmal, aber ich w...","Ich war so mitgenommen, dass ich einfach wiede..."
10022,2,"Nun, daran dachte ich nicht einmal, aber ich w...",Wir hatten ein tolles Gespräch.
10023,2,"„Und ich dachte, das wäre ein Privileg, und da...","Mir war nicht bewusst, dass ich nicht die einz..."
10024,1,"„Und ich dachte, das wäre ein Privileg, und da...","Ich hatte den Eindruck, dass ich im AFCF Air F..."
...,...,...,...
15025,1,Davidson sollte nicht die Aussprache von Scone...,"Davidson sollte nicht so reden, dass Knochen u..."
15026,0,Davidson sollte nicht die Aussprache von Scone...,"Es wäre besser, wenn Davidson sich mit den Wör..."
15027,2,Der durchschnittliche Roman von 200.000 Wörter...,Ein 200.000 Wort Roman für 25 Dollar ist ein f...
15028,0,Der durchschnittliche Roman von 200.000 Wörter...,Ein 200.000 Wort Roman für $25 sind 4.000 Wört...


# Parallel Dataset

In [8]:
# Pull the two sentence columns and the integer labels out as plain lists.
x1_train = df[sen1_col].tolist()
x2_train = df[sen2_col].tolist()
y_train = df[label_col].tolist()

# One InputExample per (sentence1, sentence2, label) row.
train_samples = [
    InputExample(texts=[first_sentence, second_sentence], label=target)
    for first_sentence, second_sentence, target in zip(x1_train, x2_train, y_train)
]

In [9]:
from datasets import load_dataset  # Hugging Face `datasets`; provides the STS data used for evaluation

# German configuration of stsb_multi_mt: "dev" is used during training,
# "test" for the final evaluation.
dev_set, test_set = (
    load_dataset("stsb_multi_mt", 'de', split=split_name)
    for split_name in ("dev", "test")
)

Downloading builder script:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.05k [00:00<?, ?B/s]

Downloading and preparing dataset stsb_multi_mt/de (download: 1.22 MiB, generated: 1.25 MiB, post-processed: Unknown size, total: 2.47 MiB) to /root/.cache/huggingface/datasets/stsb_multi_mt/de/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/270k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Dataset stsb_multi_mt downloaded and prepared to /root/.cache/huggingface/datasets/stsb_multi_mt/de/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9. Subsequent calls will reuse this data.




In [11]:
def sts_to_examples(sts_split):
  """Convert an STS split into a list of InputExample sentence pairs.

  Args:
    sts_split: iterable of rows exposing 'sentence1', 'sentence2' and
      'similarity_score' keys.

  Returns:
    A list with one InputExample per row, labelled with the similarity score
    divided by 5.0 (presumably mapping the 0-5 STS scale onto 0-1; verify
    against the dataset card).
  """
  return [
      InputExample(texts=[row['sentence1'], row['sentence2']],
                   label=float(row['similarity_score']) / 5.0)
      for row in sts_split
  ]

# Both splits go through the same conversion so their labels are scaled identically.
dev_samples = sts_to_examples(dev_set)
test_samples = sts_to_examples(test_set)

# Evaluator built from the dev pairs; it is handed to model.fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Multilanguage model

## Params

In [12]:
# Colab form parameters. The numeric dropdowns are declared {type:"raw"} so
# that picking a value in the form writes an integer literal (e.g. 32) rather
# than a quoted string ("32"), which the DataLoader / fit calls could not use.
st_model = "airnicco8/xlm-roberta-en-it-de" #@param ["airnicco8/xlm-roberta-en-it-de","airnicco8/xlm-roberta-de"]
train_batch_size = 16 #@param ["8", "16", "32", "64", "128", "256", "512"] {type:"raw"}
num_epochs = 1 #@param ["1", "8", "16", "32", "64", "128", "256", "512"] {type:"raw"}

In [None]:
!mkdir output

In [None]:
# Directory passed as `output_path` to the test evaluator further down, which
# is where the evaluation results file is then read back from.
model_save_path = './output/'

## Multilanguage Model Training

In [None]:
# Instantiate the sentence-embedding model selected in the Params cell (`st_model`).
model = SentenceTransformer(st_model)

Downloading:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Seed all RNGs before the shuffled DataLoader and the loss module are
# created, so repeated runs of the notebook start from the same random state.
SEED = 42
set_seed(SEED)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# Classification loss over the NLI classes; one output per entry in class_dict.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(class_dict),
)

# Warm up over the first 10% of the total number of training steps
# (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

In [None]:
# Fine-tune on the NLI pairs with the softmax objective, scoring on the STS
# dev evaluator along the way.
# NOTE(review): the recorded run below shows 314 iterations per epoch, so the
# 1000-step evaluation interval is never reached within an epoch - confirm
# whether a smaller value was intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/314 [00:00<?, ?it/s]

## Multilanguage Model Evaluation

In [None]:
# Score the fine-tuned model on the held-out STS test pairs. Calling the
# evaluator returns the score displayed below and is given `model_save_path`
# as the location for its results file.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name='sts-test',
)
test_evaluator(model, output_path=model_save_path)

0.6412974138693235

In [None]:
# Read the results file back from the same directory that was passed to the
# test evaluator as `output_path`, instead of a hard-coded absolute Colab path.
test_df = pd.read_csv(Path(model_save_path) / 'similarity_evaluation_sts-test_results.csv')

In [None]:
# Display the per-metric correlation scores loaded from the results CSV.
test_df

Unnamed: 0,epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,-1,-1,0.544815,0.641297,0.560545,0.621046,0.565209,0.623012,0.405455,0.431346
