In [None]:
import datasets

# Training split of the 'snli' dataset, fetched through the Hugging Face
# `datasets` package imported above.
snli = datasets.load_dataset(path='snli', split='train')

# Bare expression: lets the notebook render the dataset summary.
snli

In [None]:
# Training split of the GLUE 'mnli' configuration; the 'idx' column is dropped
# right away (presumably so the columns line up with SNLI for the later
# concatenation — confirm against the dataset features).
mnli = (
    datasets.load_dataset('glue', 'mnli', split='train')
    .remove_columns(['idx'])
)
mnli

In [None]:
# Stack both NLI corpora (SNLI first, then MNLI) into one training dataset.
nli_parts = [snli, mnli]
dataset = datasets.concatenate_datasets(nli_parts)
dataset

In [None]:
# Keep only the rows whose label is 0.
# NOTE(review): presumably 0 is the "entailment" class in both corpora, giving
# the positive (premise, hypothesis) pairs the ranking loss needs — confirm
# against the dataset's label names.
dataset = dataset.filter(lambda example: example['label'] == 0)
dataset

In [None]:
from sentence_transformers import InputExample
 

# One InputExample per remaining row, holding the (premise, hypothesis) pair.
train_samples = [
    InputExample(texts=[example['premise'], example['hypothesis']])
    for example in dataset
]

In [None]:
# Import the loader class directly instead of the `sentence_transformers.datasets`
# sub-module: binding that module to the name `datasets` would shadow the
# Hugging Face `datasets` package imported at the top of the notebook, so
# re-running any of the `datasets.load_dataset` / `concatenate_datasets` cells
# after this one would fail.
from sentence_transformers.datasets import NoDuplicatesDataLoader

batch_size = 16
# Batches of `batch_size` training samples built from `train_samples`.
loader = NoDuplicatesDataLoader(train_samples, batch_size=batch_size)

In [None]:
from sentence_transformers import models,SentenceTransformer

# NOTE(review): the checkpoint name is an empty string — it looks like a
# placeholder that must be replaced with a real model name/path before this
# cell can run.
bert = models.Transformer('')

# Mean-pool the token embeddings into a single sentence vector.
embedding_dim = bert.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

# Sentence encoder = transformer followed by the pooling layer.
model = SentenceTransformer(modules=[bert, pooler])
model

In [None]:
from sentence_transformers import losses
# Multiple-negatives ranking loss bound to the sentence encoder built above.
loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
epochs = 1

# Warm up over the first 10% of all training steps (batches per epoch * epochs).
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# Fine-tune with the single (loader, loss) objective and write the result to
# `output_path`.
model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./MNR_biobert_snli_mnli',
)

In [None]:
import pandas as pd
# NOTE(review): both file paths below are empty placeholders and must be filled
# in before this cell can run.
# SECURITY: unpickling can execute arbitrary code — only load pickle files from
# a trusted source.
data = pd.read_pickle('')
# Shuffle the rows. The seed is fixed so the row order is reproducible across
# runs; downstream cross-validation splits depend on that order.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)

data_test = pd.read_csv('')
data_test = data_test.drop_duplicates().reset_index(drop=True)

# Class list in order of first appearance in the (shuffled) training frame.
classes = data['Category'].unique().tolist()
nb_classes = len(classes)
print(nb_classes)
print(classes)

# Encode every category as its position in `classes`, in a single pass.
# The former nested loops rewrote the column once per class with row-wise
# `.loc` writes: O(rows * classes), and rows that had already been encoded could
# be re-mapped by a later class whenever the raw labels were themselves small
# integers. A dict lookup on the original value avoids both problems.
class_to_id = {label: idx for idx, label in enumerate(classes)}


def _encode_category(label):
    """Return the integer id of `label`; labels not in `classes` pass through unchanged."""
    return class_to_id.get(label, label)


data['Category'] = data['Category'].map(_encode_category)
# Test labels that never occur in the training data are left as they are
# (same as before), so they stay visible downstream instead of silently
# turning into NaN.
data_test['Category'] = data_test['Category'].map(_encode_category)

# Encode all texts of a frame in one batched call rather than one
# `model.encode` call per row; each row then stores its own embedding vector.
data['embeddings'] = pd.Series(
    list(model.encode(data['text'].tolist())), index=data.index)
data_test['embeddings'] = pd.Series(
    list(model.encode(data_test['text'].tolist())), index=data_test.index)

In [None]:
from sklearn.model_selection import train_test_split
# Features are the sentence embeddings, targets the encoded categories.
# No random split is made here: the evaluation set comes from the separate
# `data_test` frame.
X_train, y_train = data['embeddings'].tolist(), data['Category'].tolist()
X_test, y_test = data_test['embeddings'].tolist(), data_test['Category'].tolist()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Neighbour count of the KNN baseline. The display label below is derived from
# it so the two cannot drift apart (the label used to read 'KNN 5' while the
# estimator was built with 4 neighbours).
# NOTE(review): if 5 neighbours were intended, change this constant instead.
KNN_NEIGHBORS = 4

# (display name, unfitted estimator) pairs compared on the sentence embeddings.
clfs = [
    ('LogisticRegression', LogisticRegression(max_iter=3000,
                                              class_weight='balanced')
    ),
    ('RandomForest', RandomForestClassifier(max_depth=18,
                                            n_estimators=75,
                                            random_state=0)
    ),
    (f'KNN {KNN_NEIGHBORS}', KNeighborsClassifier(n_neighbors=KNN_NEIGHBORS)
    ),
    ('SVM C1', SVC(C=1,
                   class_weight='balanced')
    )]

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score


def print_val_scores(scores: list[float]) -> None:
  """Print the mean and the individual cross-validation scores.

  Args:
    scores: Per-fold scores; a plain list or a NumPy array both work.

  The mean is formatted with three decimals and every individual score is
  rounded to three decimals.
  """
  # Cast to built-in float before rounding: NumPy scalars would otherwise be
  # printed as `np.float64(...)` inside the list under NumPy >= 2.
  rounded = [round(float(score), 3) for score in scores]

  print(f'Cross validation scores: mean: {np.mean(scores):.3f}, '
        f'all: {rounded}')


# Default fold count shared by `n_splits` and `cv`; also used to detect whether
# a caller overrode `cv`.
_DEFAULT_FOLDS = 5


def print_stratified_kfold(clfs: list[tuple[str, object]], X_train: pd.DataFrame,
                           y_train: pd.Series, n_splits: int = _DEFAULT_FOLDS,
                           cv: int = _DEFAULT_FOLDS,
                           ) -> None:
  """Cross-validate every classifier with stratified folds and print its scores.

  Args:
    clfs: (display name, estimator) pairs.
    X_train: Training features, forwarded unchanged to `cross_val_score`.
    y_train: Training targets, forwarded unchanged to `cross_val_score`.
    n_splits: Number of stratified folds.
    cv: Legacy way to set the fold count. Previously this was the only value
      that reached `cross_val_score` (`n_splits` was silently ignored), so it
      still wins whenever it is set to something other than the default.

  For each classifier a header line is printed, followed by the summary
  produced by `print_val_scores`.
  """
  # Keep `cv` authoritative for callers that already override it; otherwise
  # honour `n_splits`, which used to have no effect.
  n_folds = cv if cv != _DEFAULT_FOLDS else n_splits

  # One splitter is enough: it is created without shuffling, so every
  # classifier is evaluated on the same folds.
  skf = StratifiedKFold(n_splits=n_folds)

  for name, estimator in clfs:
    print(f'\nStratifiedKFold - classifier: {name}:\n')

    scores = cross_val_score(estimator,
                             X_train,
                             y_train,
                             cv=skf)

    print_val_scores(scores)

In [None]:
# Cross-validate every candidate classifier on the training embeddings,
# using the function's default fold settings.
print_stratified_kfold(clfs=clfs, X_train=X_train, y_train=y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Hold-out evaluation: fit an SVM with the same settings as the 'SVM C1' entry
# of the comparison list on the full training set, then predict the test set.
svm_params = {'C': 1, 'class_weight': 'balanced'}
clf = SVC(**svm_params)

y_pred = clf.fit(X_train, y_train).predict(X_test)
# Class probabilities are not computed here.
# NOTE(review): SVC presumably needs `probability=True` before `predict_proba`
# can be used — confirm if probabilities are wanted.

# Share of test items whose predicted label equals the true label.
accuracy = np.mean(y_pred == y_test)

# Confusion matrix of true vs. predicted labels, titled with the accuracy.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
title = f'SVM - acc {accuracy:.3f}'
plt.title(title, size=15)
plt.show()