In [1]:
import pickle, datetime
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
from tqdm import tqdm

In [2]:
# Read the tab-separated corpus, keep only rows whose label is non-zero, and
# renumber the rows. reset_index() is called without drop=True, so the previous
# row ids are preserved in an "index" column.
df = (
    pd.read_csv("../data/figlang_all.tsv", sep="\t", encoding="utf-8")
    .loc[lambda frame: frame['label'] != 0]
    .reset_index()
)

# Sanity checks: frame dimensions, then the number of rows per label.
print(df.shape)
print(df["label"].value_counts())
df.head()

(4342, 5)
1    2212
2     884
3     625
4     621
Name: label, dtype: int64


Unnamed: 0,index,text,label,label_binary,source
0,4,He pulls a giant disc out and flashes it like ...,3,1,Simile_hypothesis
1,9,Some bright young thing had gotten ahold of a ...,2,1,Idiom_hypothesis
2,13,"“I might be mistaken, but Sean's father looked...",2,1,Idiom_hypothesis
3,14,Her movements like a strange strip tease .,3,1,Simile_hypothesis
4,16,"I had to leave my childhood home, and am grate...",1,1,Sarcasm_hypothesis


In [3]:
# Seed passed as random_state to the random forest, logistic regression and
# decision tree classifiers in this notebook.
RANDOM_STATE = 45

# 1. Transform the text into TF-IDF vectors

In [4]:
# Unigram-only TF-IDF vectorizer (ngram_range=(1, 1)).
tfidf_multi = TfidfVectorizer(ngram_range=(1, 1))

# X: dense TF-IDF matrix built from the text column; y: the multi-class label column.
# NOTE(review): the vectorizer is fitted on the full corpus before any fold is
# split off, so its vocabulary/IDF weights also see the rows later used for
# testing — confirm this is acceptable for the reported scores.
X, y = tfidf_multi.fit_transform(df['text']).toarray(), df['label']

# Show both shapes, one per line.
print(X.shape, y.shape, sep="\n")

(4342, 6891)
(4342,)


# 2. K-fold cross-validation

In [5]:
def _take_rows(data, positions):
    """Select rows of ``data`` by integer position for pandas objects and arrays alike."""
    # Plain [] on a pandas Series/DataFrame is label- or column-based; .iloc
    # guarantees positional row selection regardless of the index.
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def k_fold_validation(X, y, clf, k):
    """Run k-fold cross-validation for ``clf`` and print the mean macro-F1.

    Args:
        X: Feature matrix with one row per sample (e.g. a NumPy array, or a
            pandas object, which is indexed positionally).
        y: Labels aligned with the rows of ``X`` (pandas Series or array).
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is
            called again on every fold's training split.
        k: Number of folds.

    Returns:
        None. The mean of the per-fold macro-averaged F1 scores is printed.
    """
    # shuffle=False: folds are taken in the existing row order.
    # NOTE(review): if the rows are grouped by source or label, unshuffled
    # folds may be unrepresentative — confirm the ordering of the input.
    kf = KFold(n_splits=k, shuffle=False)

    # One slot per fold. A fixed size of 10 would dilute the mean with unused
    # zeros when k < 10 and raise IndexError when k > 10.
    f1s = np.zeros(k)

    for i, (train_index, test_index) in tqdm(enumerate(kf.split(X)), total=k, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}'):
        # KFold yields positional indices, so select rows by position.
        x_train, x_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

        clf.fit(x_train, y_train)
        preds = clf.predict(x_test)
        f1s[i] = f1_score(y_test, preds, average='macro')

    print("Mean F1: ", f1s.mean())

## 2.1 Random forest

In [6]:
# Random forest with n_estimators=100, seeded via RANDOM_STATE; scored with 10 folds.
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
k_fold_validation(X, y, clf=rf, k=10)

100%|██████████| 10/10 [00:44<00:00,  4.46s/it]

Mean F1:  0.8484869888615432





## 2.2 Logistic regression

In [7]:
# Multinomial logistic regression using the 'sag' solver, seeded via RANDOM_STATE.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the version this notebook is pinned to.
lr = LogisticRegression(
    solver='sag',
    multi_class="multinomial",
    random_state=RANDOM_STATE,
)
k_fold_validation(X, y, clf=lr, k=10)

100%|██████████| 10/10 [02:29<00:00, 14.94s/it]

Mean F1:  0.8681840657103749





## 2.3 Decision tree

In [8]:
# Single decision tree with default settings, seeded via RANDOM_STATE; scored with 10 folds.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
k_fold_validation(X, y, clf=dt, k=10)

100%|██████████| 10/10 [00:22<00:00,  2.22s/it]

Mean F1:  0.7673530712288912





## 2.4 Naive Bayes

In [9]:
# Gaussian naive Bayes with default settings, scored with the same 10 folds.
nb = GaussianNB()
k_fold_validation(X, y, clf=nb, k=10)

100%|██████████| 10/10 [00:02<00:00,  3.52it/s]

Mean F1:  0.6782524290183078



