In [1]:
import os
from datetime import datetime
from functools import partial
from getpass import getpass
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from transformers import BertModel, BertTokenizer
from transformers import AdamW, get_constant_schedule_with_warmup, get_linear_schedule_with_warmup
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer
import datasets
# NOTE: this import must stay AFTER the torch.utils.data import above: both
# export a `Dataset`, and the notebook relies on the Hugging Face one
# (`Dataset.from_pandas`) being the name that wins.
from datasets import Dataset, DatasetDict

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

DATA_FOLDER = Path('/home/data')

OUTPUT_FOLDER = Path('/home/output')
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)


In [2]:
# --- Raw splits for both subtasks ---------------------------------------
df_train_a = pd.read_csv(DATA_FOLDER / 'subtaskA_train.csv')
df_train_b = pd.read_csv(DATA_FOLDER / 'subtaskB_train.csv')

df_test_a = pd.read_csv(DATA_FOLDER / 'subtaskA_test.csv')
df_test_b = pd.read_csv(DATA_FOLDER / 'subtaskB_test.csv')

# Test + train (in that order) per subtask. The frames loaded above are
# reused instead of re-reading the same CSV files; pd.concat copies the data,
# so the combined frames are independent of the per-split ones.
df_a = pd.concat([df_test_a, df_train_a])
df_b = pd.concat([df_test_b, df_train_b])

# --- Labels for the subtask A test set ----------------------------------
# A subtask A test comment is labelled 1 when its text also occurs anywhere in
# the subtask B data (test or train), otherwise 0.
# `isin` does the membership test in one vectorised pass; the previous list
# comprehension rebuilt `df_b.comment_text.to_list()` for every single row.
df_test_a_with_labels = df_test_a.copy()
df_test_a_with_labels['conspiratorial'] = (
    df_test_a_with_labels.comment_text.isin(df_b.comment_text).astype(int)
)
df_test_a_with_labels.to_csv(DATA_FOLDER / 'subtaskA_test_label.csv', index=False)

# --- Training set augmented with the davinci-003 paraphrases ------------
# random_state makes the shuffle reproducible across kernel restarts.
df_train_augmented = pd.concat([
      pd.read_csv(DATA_FOLDER / 'paraphrase_davinci_003.csv'),
      df_train_a,
]).sample(frac=1.0, random_state=42)
# Drop rows without text. This must be a row-wise dropna: after the concat the
# index labels are duplicated across the two source frames, so the previous
# `drop(<index of NaN rows>)` also removed valid rows that happened to share a
# label with a NaN row.
df_train_augmented = df_train_augmented.dropna(subset=['comment_text'])
# reset_index() keeps the old labels in an 'index' column (as before) and
# gives every row a unique label, which the later index-based split relies on.
df_train_augmented = df_train_augmented.reset_index()
df_train_augmented.to_csv(DATA_FOLDER / 'subtaskA_train_augmented.csv', index=False)

# --- Merged embedding tables --------------------------------------------
# Note: the merged file is written to DATA_FOLDER itself, not to the
# 'embeddings/' subfolder the inputs are read from, so no input is overwritten.
df_embeddings_augmented = pd.concat([
      pd.read_csv(DATA_FOLDER / 'embeddings/evalita_embeddings.csv'),
      pd.read_csv(DATA_FOLDER / 'embeddings/evalita_embeddings_augment.csv'),
])
df_embeddings_augmented.to_csv(DATA_FOLDER / 'evalita_embeddings_augment.csv', index=False)

# --- Training set augmented with the mT5 paraphrases --------------------
df_train_augmented_mt5 = pd.concat([
      pd.read_csv(DATA_FOLDER / 'paraphrase_mt5_base_all.csv'),
      df_train_a,
]).sample(frac=1.0, random_state=42)
# Unique row labels are required by the label-based filtering done later.
df_train_augmented_mt5 = df_train_augmented_mt5.reset_index()


In [3]:
#@title Login to HuggingFace 🤗

#@markdown Downloading the sentence-embedding models from the 🤗 Hugging Face Hub requires an access token of a registered user. Provide it through the `HUGGINGFACE_TOKEN` environment variable, or type it at the prompt when this cell runs.
# https://huggingface.co/settings/tokens

# SECURITY: never paste an access token into the notebook source. A token was
# previously hard-coded in this cell; it must be treated as compromised and
# revoked on the settings page linked above.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") or getpass("Hugging Face access token: ")
if not HUGGINGFACE_TOKEN:
  raise ValueError("A Hugging Face access token is required (set HUGGINGFACE_TOKEN).")

# Store the token at ~/.huggingface/token, the same location the previous
# shell commands wrote to. Pure Python avoids interpolating the secret into a
# shell command line; the file is written without a trailing newline.
huggingface_token_file = Path.home() / ".huggingface" / "token"
huggingface_token_file.parent.mkdir(parents=True, exist_ok=True)
huggingface_token_file.write_text(HUGGINGFACE_TOKEN)
# Keep the stored credential readable by the current user only.
huggingface_token_file.chmod(0o600)

In [4]:
# Candidate Hugging Face Hub checkpoints; TRAIN_CONFIG picks one by position.
MODEL_LIST = [
    "efederici/sentence-BERTino",
    "nickprock/sentence-bert-base-italian-uncased",
    "efederici/sentence-bert-base",
    "aiknowyou/aiky-sentence-bertino",
    "efederici/sentence-BERTino-3-64",
    "nickprock/sentence-bert-base-italian-xxl-uncased",
    "efederici/mmarco-sentence-BERTino",
    "efederici/sentence-it5-base",
    "efederici/sentence-it5-small",
]

# Settings of the current run. The whole dictionary is written to the run's
# log file by the evaluation cell.
TRAIN_CONFIG = dict(
    model_name=MODEL_LIST[5],   # -> "nickprock/sentence-bert-base-italian-xxl-uncased"
    num_iterations=10,          # forwarded to SetFitTrainer(num_iterations=...)
    batch_size=28,              # forwarded to SetFitTrainer(batch_size=...)
    epoch=1,                    # forwarded to SetFitTrainer(num_epochs=...)
    loss=CosineSimilarityLoss,  # loss class recorded for the run
    dataset="vanilla",          # one of "vanilla", "davinci", "mt5"
)

# Load the selected checkpoint as a SetFit model.
model = SetFitModel.from_pretrained(TRAIN_CONFIG["model_name"])




model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [5]:
# Schema shared by the train and validation datasets. '__index_level_0__' is
# listed because both frames carry a non-default pandas index after the split.
features = datasets.Features({
    '__index_level_0__': datasets.Value(dtype='int64'),
    'conspiratorial': datasets.ClassLabel(num_classes=2, names=[0,1]),
    'comment_text': datasets.Value('string'),
})

dataset_kind = TRAIN_CONFIG["dataset"]
assert dataset_kind in ["vanilla", "davinci", "mt5"]


def _holdout_split(frame, candidates):
  """Split `frame` into (train, validation).

  The validation part is a fixed-seed 20% sample of `candidates`; the train
  part is every row of `frame` whose index label was not sampled.
  """
  held_out = candidates.sample(frac=0.20, random_state=42)
  remaining = frame.loc[~frame.index.isin(held_out.index)]
  return remaining, held_out


# d1 = training rows, d2 = validation rows.
d1 = None
d2 = None

if dataset_kind == "vanilla":
  d1, d2 = _holdout_split(df_train_a, df_train_a)

elif dataset_kind == "davinci":
  # Validation rows are sampled only from rows with no `comment_text_orig`
  # (presumably the original, non-paraphrased comments -- TODO confirm).
  davinci_originals = df_train_augmented.loc[df_train_augmented.comment_text_orig.isna()]
  d1, d2 = _holdout_split(df_train_augmented, davinci_originals)

elif dataset_kind == "mt5":
  # Remove paraphrases that are too similar: first by `bleu`, then by `bleu_r`.
  high_bleu_rows = df_train_augmented_mt5.loc[df_train_augmented_mt5.bleu > 0.7]
  df_train_augmented_mt5.drop(index=high_bleu_rows.index, inplace=True)

  bleu_r_reject = (df_train_augmented_mt5.bleu_r < 0.80) | (df_train_augmented_mt5.bleu_r == 77.0)
  df_train_augmented_mt5.drop(index=df_train_augmented_mt5.loc[bleu_r_reject].index, inplace=True)

  # Same rule as the davinci branch: validate on rows without `comment_text_orig`.
  mt5_originals = df_train_augmented_mt5.loc[df_train_augmented_mt5.comment_text_orig.isna()]
  d1, d2 = _holdout_split(df_train_augmented_mt5, mt5_originals)

# Only the label and text columns are handed to the Hugging Face datasets.
model_columns = ['conspiratorial', 'comment_text']
train_ds = Dataset.from_pandas(d1[model_columns], features=features)
test_ds = Dataset.from_pandas(d2[model_columns], features=features)


In [None]:
# Fine-tune the SetFit model on the training split and evaluate it on the
# held-out validation split. Every hyper-parameter comes from TRAIN_CONFIG so
# that the configuration written to the log file is exactly what was trained.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # Read the loss from the config instead of hard-coding it here; previously
    # changing TRAIN_CONFIG["loss"] had no effect on training.
    loss_class=TRAIN_CONFIG["loss"],
    batch_size=TRAIN_CONFIG["batch_size"],
    num_iterations=TRAIN_CONFIG["num_iterations"], # Number of text pairs to generate for contrastive learning (20 gave better results)
    num_epochs=TRAIN_CONFIG["epoch"], # Number of epochs to use for contrastive learning
    # Map the dataset's column names onto the names the trainer expects.
    column_mapping={"comment_text": "text", "conspiratorial": "label"},
)

trainer.train()
metrics = trainer.evaluate()

# Results collected for the run log; the evaluation cell adds further entries.
TRAIN_RESULTS = {
    "validation_metric": metrics
}

print(f"Validation metrics: {metrics}")


Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/10 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 29480
  Num epochs = 1
  Total optimization steps = 1053
  Total train batch size = 28


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1053 [00:00<?, ?it/s]

In [None]:
# Number of comments sent through the model per forward call.
BATCH_EVAL = 48

# --- Predict the subtask A test set in batches --------------------------
test_texts = df_test_a_with_labels.comment_text.to_list()

results = []
# Step through the texts in strides of BATCH_EVAL so the final, possibly
# shorter, batch is predicted too. The previous `range(len(...) // BATCH_EVAL)`
# skipped it, leaving `results` shorter than the label column.
for batch_start in tqdm(range(0, len(test_texts), BATCH_EVAL)):
  batch_predictions = model(test_texts[batch_start:batch_start + BATCH_EVAL])
  results.extend(batch_predictions.tolist())

assert len(results) == len(df_test_a_with_labels), "one prediction per test comment expected"

TRAIN_RESULTS["classification_report"] = classification_report(df_test_a_with_labels['conspiratorial'], results)
print(TRAIN_RESULTS["classification_report"])

# --- Per-run output folder: <sanitised model name><timestamp> -----------
output_subfolder = TRAIN_CONFIG["model_name"].replace('/','_').replace('-','_')
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

output_subfolder += timestamp

output_subfolder = OUTPUT_FOLDER / output_subfolder
output_subfolder.mkdir(parents=True, exist_ok=True)

logfile = output_subfolder / f'logfile_{timestamp}.log'
result_file = output_subfolder / f'submission_{timestamp}'

# --- Run log: configuration, collected results, submission file name ----
with open(logfile, "w", encoding="utf-8") as f:
  f.write("Configuration: \n\n")
  for k in TRAIN_CONFIG:
    f.write(f"\t\t{k}: \t {TRAIN_CONFIG[k]}\n")
  f.write("\n\n\n\n")
  f.write("Train Results: \n\n")
  for k in TRAIN_RESULTS:
    f.write(f"\t{k}\n")
    f.write(f"\t\t{TRAIN_RESULTS[k]}\n")

  f.write(f"\n{result_file.name}\n")

# --- Submission file: one predicted label per test Id -------------------
df_test_a['Expected'] = results
df_test_a[['Id', 'Expected']].to_csv(result_file, index=False)


