In [32]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score


# Data Summary

In [2]:
dataset = pd.read_csv("./data/dataset.csv")
dataset.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    200000 non-null  object
 1   humor   200000 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 1.7+ MB


In [4]:
# Lower all letters
dataset["text"] = dataset["text"].str.lower()
# Remove punctuations
dataset["text"] =  dataset['text'].str.replace('[^\w\s]', '', regex=True)
# Remove numbers
dataset['text'] = dataset['text'].str.replace('\d', '', regex=True)
dataset.head()

Unnamed: 0,text,humor
0,joe biden rules out bid guys im not running,False
1,watch darvish gave hitter whiplash with slow p...,False
2,what do you call a turtle without its shell dead,True
3,reasons the election feels so personal,False
4,pasco police shot mexican migrant from behind ...,False


In [5]:
# Remove stop words from dataset
stop_words = stopwords.words('english')
dataset['text'] = dataset['text'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop_words))
dataset.head()

Unnamed: 0,text,humor
0,joe biden rules bid guys im running,False
1,watch darvish gave hitter whiplash slow pitch,False
2,call turtle without shell dead,True
3,reasons election feels personal,False
4,pasco police shot mexican migrant behind new a...,False


In [6]:
# Rare words are eliminated
# Rare word definition: count of word will be less then quantile(.25)
freq_words = pd.Series(" ".join(dataset['text']).split()).value_counts()
freq_filter = freq_words[freq_words<=freq_words.quantile(.25)]
dataset["text"] = dataset["text"].apply(lambda x: " ".join(x for x in x.split() if x not in freq_filter))
dataset.head()

Unnamed: 0,text,humor
0,joe biden rules bid guys im running,False
1,watch gave hitter whiplash slow pitch,False
2,call turtle without shell dead,True
3,reasons election feels personal,False
4,police shot mexican migrant behind new autopsy...,False


In [23]:
dataset["humor"] = dataset["humor"].astype(int)
X_train, X_test, y_train, y_test = train_test_split(dataset["text"], dataset["humor"], test_size=0.2, random_state=5)

# Count Vectorization

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

array(['__', '_____', '______', ..., 'zurich', 'zzs', 'zzzs'],
      dtype=object)

In [25]:
from sklearn.linear_model import LogisticRegression

In [29]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vectorized, y_train)

LogisticRegression(max_iter=1000)

In [34]:
preds = lr.predict(X_test_vectorized)
print(f"Test Accuracy: {accuracy_score(y_test, preds)}")
print(f"Test ROC-AUC Score: {roc_auc_score(y_test, preds)}")
print(f"Test F1 Score: {f1_score(y_test, preds)}")


Test Accuracy: 0.911275
Test ROC-AUC Score: 0.9112774048246117
Test F1 Score: 0.9109164386656291


In [51]:
#  Let's learn which words have highest values of training coefficient 
important_feature_indexes = [i[0] for i in sorted(enumerate(np.abs(lr.coef_[0])), key=lambda x: x[1], reverse=True)]
important_features = vectorizer.get_feature_names_out()[important_feature_indexes]

In [54]:
# Most important 50 words
important_features[:50]

array(['favourite', 'fuck', 'photos', 'allegedly', 'call', 'huffpost',
       'shit', 'norris', 'recipes', 'fucking', 'reportedly', 'experts',
       'video', 'rescued', 'yo', 'heres', 'photo', 'ways', 'cuff',
       'dyslexic', 'walks', 'snl', 'joke', 'tiers', 'opposite', 'reasons',
       'reveals', 'viagra', 'knock', 'mexicans', 'cross', 'infographic',
       'toupee', 'lightbulb', 'erection', 'til', 'redneck', 'obamacare',
       'rjokes', 'midgets', 'samantha', 'queer', 'cows', 'recently',
       'health', 'alleged', 'proves', 'diarrhea', 'feds', 'festival'],
      dtype=object)