In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the embeddings table; later cells read its `label` column and the
# columns whose names match "pre_trained" / "fine_tuned".
EMBEDDINGS_CSV = "tifeeds_embeds_columns.csv"
df = pd.read_csv(EMBEDDINGS_CSV)

In [3]:
# Build one feature frame per embedding family by selecting columns whose
# name matches the corresponding pattern.
X_pre_trained, X_fine_tuned = (
    df.filter(regex=pattern) for pattern in ("pre_trained.*", "fine_tuned.*")
)

In [4]:
# Sanity check: each feature set should expose 768 embedding columns.
for features in (X_pre_trained, X_fine_tuned):
    print(features.shape)

(13859, 768)
(13859, 768)


In [4]:
def test_clf(X, y, test_size=0.3, random_state=2022, stratify=False):
    """Fit a default ``SVC`` on a train split of ``(X, y)`` and score the hold-out.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, one row per sample.
    y : array-like or Series
        Labels aligned with the rows of ``X``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 2022
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    stratify : bool, default False
        When true, the split is made with ``stratify=y`` so train and test
        keep the class proportions of ``y``. The default keeps the original
        unstratified split.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    clf = SVC()
    clf.fit(X_train, y_train)
    # scikit-learn metrics take (y_true, y_pred); accuracy happens to be
    # symmetric, but keeping the documented order avoids surprises if the
    # metric is ever swapped for an asymmetric one (precision, recall, ...).
    return accuracy_score(y_test, clf.predict(X_test))

In [5]:
# Evaluate the SVC on the pre-trained embedding features (full dataset).
pre_trained_acc = test_clf(X_pre_trained, df["label"])
print("Pre-trained acc: ", pre_trained_acc)

Pre-trained acc:  0.9126984126984127


In [6]:
# Evaluate the SVC on the fine-tuned embedding features (full dataset).
fine_tuned_acc = test_clf(X_fine_tuned, df["label"])
print("Fine-tuned acc: ", fine_tuned_acc)

Fine-tuned acc:  0.9564694564694565


In [7]:
def balance(df, random_state=None):
    """Return a class-balanced subset of ``df`` based on its ``label`` column.

    Rows with ``label == 1`` are the positive class; every other row is
    treated as negative. The larger class is cut down to the size of the
    smaller one, so the result holds the same number of rows of each.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column.
    random_state : int or None, default None
        ``None`` keeps the first rows of the larger class in their existing
        order (the original behaviour). Any other value is passed to
        ``DataFrame.sample`` so the larger class is randomly undersampled
        instead, which avoids depending on the row order of ``df``.

    Returns
    -------
    pandas.DataFrame
        Positive rows followed by negative rows, original index preserved.
    """
    # Split with a boolean mask rather than by index membership: with a
    # non-unique index, `~df.index.isin(pos.index)` would also discard
    # negative rows that merely share an index label with a positive row.
    is_pos = df['label'] == 1
    pos = df[is_pos]
    neg = df[~is_pos]

    # Target size is the smaller class, so the result is balanced even when
    # positives outnumber negatives.
    n = min(len(pos), len(neg))

    def _downsample(part):
        # Reduce `part` to n rows; parts already at size n are left untouched.
        if len(part) <= n:
            return part
        if random_state is None:
            return part.iloc[:n]
        return part.sample(n=n, random_state=random_state)

    return pd.concat([_downsample(pos), _downsample(neg)])

In [8]:
# Repeat the comparison on a class-balanced subset of the data.
# The trailing expression displays how many rows each label value has.

df_balanced = balance(df)
df_balanced["label"].value_counts()

0.0    2534
1.0    2534
Name: label, dtype: int64

In [9]:
# NOTE: this cell rebinds X_pre_trained / X_fine_tuned to the balanced subset.
# Re-running the earlier full-dataset evaluation cells after this one would
# therefore pair balanced features with the unbalanced `df.label`.
X_pre_trained, X_fine_tuned = (
    df_balanced.filter(regex=pattern) for pattern in ("pre_trained.*", "fine_tuned.*")
)
y_balanced = df_balanced["label"]
print("Pre-trained: ", test_clf(X_pre_trained, y_balanced))
print("Fine-tuned: ", test_clf(X_fine_tuned, y_balanced))

Pre-trained:  0.9769888231426693
Fine-tuned:  0.980276134122288
