In [23]:
%matplotlib inline

import bz2
import json
# import matplotlib.pyplot as plt
import pandas as pd
# import seaborn as sns

# from collections import Counter
# from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

import pickle

In [24]:
# Single seed passed as random_state to every estimator in this notebook so
# that repeated runs produce the same models and scores.
RANDOM_SEED = 42

In [25]:
def _read_split(path):
    """Read one gzip-compressed, tab-separated split and drop rows with missing values."""
    frame = pd.read_csv(path, sep='\t', compression='gzip')
    return frame.dropna()


# Training split plus the two evaluation splits (imbalanced and balanced).
train_df = _read_split('assets/sarcasm.train.tsv.gz')
test_imb_df = _read_split('assets/sarcasm.test-imb.tsv.gz')
test_bal_df = _read_split('assets/sarcasm.test-bal.tsv.gz')

In [26]:
# Pull the target column out of each split in one pass.
y_train, y_test_imb, y_test_bal = (
    frame['label'] for frame in (train_df, test_imb_df, test_bal_df)
)

In [27]:
# TF-IDF features over 1- to 4-grams, keeping only n-grams that appear in at
# least 100 training documents.
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the same set exposed as ENGLISH_STOP_WORDS). The frozenset itself is not
# passed because scikit-learn >= 1.2 validates estimator parameters and only
# accepts 'english', a list, or None for stop_words.
vectorizer = TfidfVectorizer(min_df=100, stop_words='english', ngram_range=(1, 4))

# Fit the vocabulary and IDF weights on the training text only, then reuse the
# fitted vectorizer to transform both test splits.
X_train = vectorizer.fit_transform(train_df['text'])
X_test_bal = vectorizer.transform(test_bal_df['text'])
X_test_imb = vectorizer.transform(test_imb_df['text'])

In [28]:
# Candidate models; both receive the shared seed so fits are reproducible.
# 'multi_class' is intentionally left at its default: the argument is
# deprecated since scikit-learn 1.5 (passing it raises a FutureWarning and it
# is scheduled for removal), and 'auto' has been the default since 0.22.
lr_clf = LogisticRegression(solver='lbfgs', random_state=RANDOM_SEED)
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=15, random_state=RANDOM_SEED)

In [41]:
# Train both classifiers on the TF-IDF training matrix. fit() returns the
# estimator itself, so the names lr_clf / rf_clf keep pointing at the fitted
# objects without being rebound. The prints bracket the slow step.
print('started fitting')
for _estimator in (lr_clf, rf_clf):
    _estimator.fit(X_train, y_train)
print('finished fitting')

started fitting
finished fitting


In [30]:
# Chance-level baseline: predicts classes uniformly at random (seeded).
_baseline = DummyClassifier(strategy='uniform', random_state=RANDOM_SEED)
random_clf = _baseline.fit(X_train, y_train)

## Predict and score

In [31]:
# Each model predicts on the balanced split first, then the imbalanced one.
_test_matrices = (X_test_bal, X_test_imb)

y_pred_bal, y_pred_imb = (lr_clf.predict(X) for X in _test_matrices)
rf_y_pred_bal, rf_y_pred_imb = (rf_clf.predict(X) for X in _test_matrices)
random_y_pred_bal, random_y_pred_imb = (random_clf.predict(X) for X in _test_matrices)

In [32]:
# F1 on the balanced test split, one score per model.
lr_bal_f1 = f1_score(y_true=y_test_bal, y_pred=y_pred_bal)
rf_bal_f1 = f1_score(y_true=y_test_bal, y_pred=rf_y_pred_bal)
rand_bal_f1 = f1_score(y_true=y_test_bal, y_pred=random_y_pred_bal)

# F1 on the imbalanced test split, one score per model.
lr_imb_f1 = f1_score(y_true=y_test_imb, y_pred=y_pred_imb)
rf_imb_f1 = f1_score(y_true=y_test_imb, y_pred=rf_y_pred_imb)
rand_imb_f1 = f1_score(y_true=y_test_imb, y_pred=random_y_pred_imb)

In [33]:
# Print order: imbalanced split (logreg, random forest, random baseline),
# then the balanced split in the same model order.
_scores_in_order = (
    lr_imb_f1, rf_imb_f1, rand_imb_f1,
    lr_bal_f1, rf_bal_f1, rand_bal_f1,
)
for _score in _scores_in_order:
    print(_score)

0.09441458712139227
0.11707638410635686
0.05049984866759725
0.6045505212772933
0.45640737737300646
0.501399563892798


## Save models

In [45]:
from joblib import dump, load

In [46]:
# Persist the fitted TF-IDF vectorizer; dump() returns the list of files
# written, which the notebook displays as this cell's output.
dump(value=vectorizer, filename='././models/sarcasm_vectorizer.joblib')

['././models/sarcasm_vectorizer.joblib']

In [47]:
# Persist the fitted logistic-regression model (written with joblib despite
# the .pkl extension).
dump(value=lr_clf, filename="././models/logreg_clf.pkl")

['././models/logreg_clf.pkl']

In [48]:
# Persist the fitted random-forest model (written with joblib despite the
# .pkl extension).
dump(value=rf_clf, filename="././models/rf_clf.pkl")

['././models/rf_clf.pkl']