In [39]:
from os import walk, makedirs
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
import fasttext
import fasttext.util
from sentence_transformers import SentenceTransformer

##### Function to read all data from data_ready/  
The data is already split into k folds.

In [40]:
def read_data_for_fold(k: int) -> dict:
  """
    Load every CSV file of fold `k` into a dict of shuffled DataFrames.

    Reads the files located directly inside "../data_ready/k_is_<k>/"
    (relative to the current working directory).

    Args:
      k: index of the fold to load.

    Returns:
      dict mapping each file name, cut at ".csv" (e.g. "test.csv" -> "test"),
      to the DataFrame read from that file with its rows shuffled.
      The dict is empty when the fold directory does not exist or holds no CSV files.

    Note:
      The shuffle (`sample(frac=1)`) is not seeded, so the row order differs between runs.
  """
  k_fold_dir = f"../data_ready/k_is_{k}/"

  # walk() yields the top-level directory first. Only that entry is used:
  # files inside nested directories (e.g. ".ipynb_checkpoints") are not part of the fold,
  # and their names could not be resolved against k_fold_dir anyway.
  _, _, top_level_files = next(walk(k_fold_dir), (k_fold_dir, [], []))

  data_dict = {}
  # sorted() makes the load order independent of the directory listing order.
  for fname in sorted(top_level_files):
    if not fname.endswith(".csv"):
      continue  # ignore stray non-CSV files
    dataframe = pd.read_csv(k_fold_dir + fname).sample(frac=1)
    dict_key = fname.split(".csv")[0]
    data_dict[dict_key] = dataframe

  return data_dict

##### Generic output-file function ####

In [41]:
def predictions_to_csv(test_text: list, test_class: list, predictions: list, k: int, train_filename: str, pred_output_dir: str):
  """
    Write test texts, their true classes and the predicted classes to a CSV file.

    The file has the columns (text, class, predicted) and no index column.
    It is written to "../predictions/k_is_<k>/<pred_output_dir>/<train_filename>";
    the directory is created when it does not exist yet.

    Args:
      test_text: the test texts.
      test_class: the true class of each test text.
      predictions: the predicted class of each test text.
      k: fold index, used in the output path.
      train_filename: name of the output file.
      pred_output_dir: sub-directory of the fold's prediction directory.
  """
  # assign() adds the columns one after another, in this order.
  table_columns = {"text": test_text, "class": test_class, "predicted": predictions}
  output_df = pd.DataFrame().assign(**table_columns)

  destination_dir = f"../predictions/k_is_{k}/{pred_output_dir}/"
  destination_file = f"../predictions/k_is_{k}/{pred_output_dir}/{train_filename}"
  makedirs(destination_dir, exist_ok=True)
  output_df.to_csv(destination_file, index=False)

##### Baseline test function  
this is called once for every train/test pair

A note on the train files:  
train2.csv contains 8 training examples because it contains...
- (one English inclusive
- one English exclusive
- one French inclusive
- one French exclusive) * 2 = 8  
  
This information does not affect anything in this function; it is just good to know.

Using these extremely small datasets shows the power of few-shot learning with SetFit.  
For the few-shot experiments I have done, we get 90%+ with this small amount of data.  
Hopefully these baselines predict very poorly.

In [42]:
def _fit_predict_save(estimator, train_features, train_labels, test_features, test_text: list, test_class: list, k: int, train_filename: str, pred_output_dir: str):
  # Fit one estimator on one feature set and write its test-set predictions to CSV.
  estimator.fit(train_features, train_labels)
  predictions = estimator.predict(test_features)
  predictions_to_csv(test_text, test_class, predictions, k, train_filename, pred_output_dir)


def do_baseline_tests(test_text: list, test_class: list, train_text: list, train_class: list, k: int, train_filename: str):
  """
    Train every baseline classifier on one train file and save its test predictions.

    Two encodings are computed for the texts:
      - TF-IDF (at most 15000 features, vocabulary fitted on the train texts only),
      - sentence embeddings from 'bert-base-multilingual-uncased'.
    Every classifier is fitted once per encoding, and its predictions on the test
    texts are written with predictions_to_csv() into the output directory
    "<encoding>/<classifier>", where <encoding> is "TFidf" or "Bert".

    Args:
      test_text: test texts.
      test_class: true class of each test text.
      train_text: train texts.
      train_class: class of each train text (balanced, according to the original author).
      k: fold index, forwarded to predictions_to_csv().
      train_filename: name of the train set, used as the output file name.
  """
  ####  TF-IDF encodings  ####
  tfidf_vectorizer = TfidfVectorizer(max_features=15000)
  train_tfidf = tfidf_vectorizer.fit_transform(train_text).toarray()
  test_tfidf = tfidf_vectorizer.transform(test_text).toarray()

  #### fastText encodings ####
  # TODO: not implemented yet (fasttext is imported but unused here).

  ####   BERT encodings   ####
  # NOTE(review): the model is loaded again on every call; consider loading it once in the caller.
  sbert_model = SentenceTransformer('bert-base-multilingual-uncased')
  train_bert = sbert_model.encode(train_text)
  test_bert = sbert_model.encode(test_text)

  # Encoding name (= first part of the output directory) -> (train features, test features).
  feature_sets = {
    "TFidf": (train_tfidf, test_tfidf),
    "Bert": (train_bert, test_bert),
  }

  # Scaled copies of every feature set; only used by the classifiers flagged below.
  # One scaler per encoding, fitted on the train features only.
  scaled_feature_sets = {}
  for encoding_name, (train_features, test_features) in feature_sets.items():
    scaler = StandardScaler(with_mean=False)
    scaled_feature_sets[encoding_name] = (scaler.fit_transform(train_features), scaler.transform(test_features))

  # (output directory name, estimator, use scaled features?)
  baselines = [
    ("GaussianNB", GaussianNB(), False),
    ("LinearSVM", LinearSVC(C=1.0), False),
    ("LogisticRegression", LogisticRegression(C=11, class_weight='balanced'), False),
    ("RandomForest", RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0), True),
    ("DecisionTree", DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0), False),
    # NOTE(review): this directory is named "XGboost" but the model is an SGDClassifier.
    # The name is kept so that existing prediction paths do not change.
    ("XGboost", SGDClassifier(), False),
  ]

  train_labels = np.ravel(train_class)

  for model_name, estimator, use_scaled in baselines:
    chosen_sets = scaled_feature_sets if use_scaled else feature_sets
    # Each encoding is paired with its own features. Previously most "Bert" runs were
    # fitted on the TF-IDF matrices, so Bert/* merely duplicated the TFidf/* results.
    # The estimator is refitted per encoding (fit() was already reused this way for GaussianNB).
    for encoding_name, (train_features, test_features) in chosen_sets.items():
      _fit_predict_save(
        estimator, train_features, train_labels, test_features,
        test_text, test_class, k, train_filename, f"{encoding_name}/{model_name}",
      )

##### Driver code for all baseline tests.

In [43]:
# Fold to evaluate. It is defined once so that the data that is read, the prediction
# output path and the progress message always refer to the same fold.
# To run several folds, wrap the rest of this cell in `for FOLD in range(0, 3):`.
FOLD = 0

data_dict = read_data_for_fold(FOLD)

# The "test" entry is shared by all train files of the fold; remove it from the dict
# so that only train sets remain to iterate over.
test = data_dict.pop("test")
test_text = list(test["text"]) # test data X
test_class = list(test["class"]) # test data Y

for key, train_df in data_dict.items():
  train_text = list(train_df["text"]) # train data X
  train_class = list(train_df["class"]) # train data Y
  do_baseline_tests(test_text, test_class, train_text, train_class, FOLD, key)
  print(f"baselines done for {key} in fold {FOLD}")


No sentence-transformers model found with name /home/blake/.cache/torch/sentence_transformers/bert-base-multilingual-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/blake/.cache/torch/sentence_transformers/bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSeque

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling `cublasSgemmStridedBatched( handle, opa, opb, m, n, k, &alpha, a, lda, stridea, b, ldb, strideb, &beta, c, ldc, stridec, num_batches)`