In [1]:
%pip install transformers
%pip install setfit
%pip install pyarrow

from setfit import SetFitModel, SetFitTrainer
import pandas as pd
import pyarrow as pa
import datasets
import pandas as pd
from os import walk, makedirs
from google.colab import files

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting filelock
  Using cached filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm>=4.27
  Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
^C
[31mERROR: Operation cancelled 

##### Generic function for writing the output file

In [2]:
def predictions_to_csv(test_text: list, test_class: list, predictions: list, k: int, train_filename: str, num_epoch: int, pred_output_dir: str):
  """Save the test examples, their gold classes and the predictions as one CSV file.

  The file is written to
  ``predictions/k_is_<k>/<pred_output_dir>/<num_epoch>epoch_<train_filename>``
  (relative to the current working directory); missing directories are created.

  Args:
    test_text: texts of the test examples.
    test_class: gold class of each test example.
    predictions: predicted class of each test example.
    k: fold identifier, used in the output directory name.
    train_filename: name of the training set, used in the output file name.
    num_epoch: epoch count, used as the prefix of the output file name.
    pred_output_dir: sub-directory (below the fold directory) to write into.
  """
  # Build the frame one column at a time; the resulting column order is
  # (text, class, predicted).
  results = pd.DataFrame()
  for column_name, column_values in (
      ("text", test_text),
      ("class", test_class),
      ("predicted", predictions),
  ):
    results[column_name] = column_values

  # Create the target directory first, then write without the index column.
  makedirs(f"predictions/k_is_{k}/{pred_output_dir}/", exist_ok=True)
  results.to_csv(f"predictions/k_is_{k}/{pred_output_dir}/{num_epoch}epoch_{train_filename}", index=False)

In [9]:
def setfit_on_pretrained(test_text: list, test_class: list, train_text: list, train_class: list,
                        train_filename: str, k: int, pretrainied_model_string: str, num_epoch:int) -> None:
  """Fine-tune a pretrained model with SetFit, predict the test set and save the predictions.

  The predictions are written with ``predictions_to_csv`` into a directory
  named after the last path component of ``pretrainied_model_string``.

  Args:
    test_text: texts to predict after training.
    test_class: gold classes of the test texts (only written to the output file).
    train_text: training texts.
    train_class: training classes, one per training text.
    train_filename: name of the training set, forwarded to the output file name.
    k: fold identifier, forwarded to the output directory name.
    pretrainied_model_string: model identifier passed to ``SetFitModel.from_pretrained``.
    num_epoch: number of epochs to use for contrastive learning.

  Raises:
    ValueError: if ``train_text`` is empty.
  """
  # Validate before loading the model so that bad input fails fast.
  if len(train_text) == 0:
    raise ValueError("train_text is empty: cannot train a SetFit model without training examples")

  # total training data for each network is 320 per epoch
  # because training data size fluctuates, we need to generate equal number for setfit pairs to have fair training.
  # this way we are effectively testing if diversity in data matters.
  # for traintest2 : 8*x*2 = 320 , 160/8 = x  <- same math
  # round() returns 0 for large training sets (about 320 examples or more);
  # clamp to 1 so the trainer is never asked for zero iterations.
  num_iter = max(1, round(160/len(train_text)))

  train_dataset = datasets.Dataset(pa.Table.from_arrays([train_text, train_class], ["text","class"]))

  model = SetFitModel.from_pretrained(pretrainied_model_string)

  trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    metric="accuracy",
    batch_size=16,
    num_iterations=num_iter, # The number of text pairs to generate for contrastive learning
    num_epochs=num_epoch, # The number of epochs to use for contrastive learning
    column_mapping={"text": "text", "class": "label"} # Map dataset columns to text/label expected by trainer
  )
  # train network
  trainer.train()
  # predict with network
  predictions = trainer.model.predict(test_text)

  # output directory: last path component of the model identifier
  # (e.g. "org/model-name" -> "model-name")
  modelname = pretrainied_model_string.split('/')[-1]

  predictions_to_csv(test_text,
                    test_class,
                    predictions,
                    k,
                    train_filename,
                    num_epoch,
                    modelname
                    )


##### Function to read all data for one fold from data_ready/

In [2]:
def read_data_for_fold(k: int, seed: "int | None" = None) -> dict:
  """Load every CSV file of one fold as a shuffled DataFrame.

  Reads the ``.csv`` files located directly in ``data_ready/k_is_<k>/``
  (relative to the current working directory). Files in sub-directories and
  files without a ``.csv`` suffix are ignored.

  Args:
    k: fold identifier; selects the directory ``data_ready/k_is_<k>/``.
    seed: optional ``random_state`` for the row shuffle. The default ``None``
      leaves the shuffle unseeded.

  Returns:
    Dict mapping each file name without its ``.csv`` suffix to that file's
    rows in shuffled order. Empty if the directory does not exist or holds
    no CSV files.
  """
  k_fold_dir = f"data_ready/k_is_{str(k)}/"

  # Only the first item produced by walk() (the fold directory itself) is used:
  # names found deeper in the tree could not be opened as k_fold_dir + name.
  # A missing directory produces no items at all, hence the default.
  _, _, top_level_files = next(walk(k_fold_dir), (None, None, []))

  data_dict = {}
  # sorted() makes the key order independent of the file-system listing order.
  for fname in sorted(top_level_files):
    if not fname.endswith(".csv"):
      continue  # skip stray non-CSV files
    dataframe = pd.read_csv(k_fold_dir+fname).sample(frac=1, random_state=seed)
    # Strip only the trailing suffix so names containing ".csv" elsewhere stay intact.
    dict_key = fname[:-len(".csv")]
    data_dict[dict_key] = dataframe

  return data_dict

### Main Driver

In [None]:
# Pretrained models to fine-tune with SetFit.
modelnames = ['cardiffnlp/bert-base-multilingual-cased-sentiment-multilingual',
              'Hate-speech-CNERG/dehatebert-mono-french',
              'morit/french_xlm_xnli',
              'cardiffnlp/xlm-roberta-base-sentiment-multilingual']

NUM_FOLDS = 3               # folds 0..NUM_FOLDS-1 are read via read_data_for_fold
EPOCH_COUNTS = range(1, 6)  # contrastive-learning epoch settings to sweep (1..5)

# Load every fold once, before any training starts: the data does not depend on
# the model, and a fold without a test split is reported immediately instead of
# after earlier training runs have already completed.
fold_data = {}
for k in range(NUM_FOLDS):
  data_dict = read_data_for_fold(k)
  if "test" not in data_dict:
    raise FileNotFoundError(f"fold {k}: no 'test' data set was loaded, cannot evaluate this fold")
  test = data_dict.pop("test")
  # after the pop, data_dict holds only the training sets of this fold
  fold_data[k] = (test, data_dict)

for modelname in modelnames:
  for k, (test, train_sets) in fold_data.items(): # this will run for every fold we have data for.
    test_text = list(test["text"]) # test data X
    test_class = list(test["class"]) # test data Y
    for filename, train_frame in train_sets.items():
      train_text = list(train_frame["text"]) # train data X
      train_class = list(train_frame["class"]) # train data Y
      for epochs in EPOCH_COUNTS:
        setfit_on_pretrained(
          test_text,
          test_class,
          train_text,
          train_class,
          filename,
          k,
          modelname,
          epochs
        )