In [1]:
import pandas as pd, numpy as np, re, time
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
data = pd.read_csv('train.csv')

#data = dataset[['text', 'is_humor']]

print(data.head())
print('-------------------------------------------------------------------------')
print(data.isnull().any(axis = 0))

   id                                               text  is_humor  \
0   1  TENNESSEE: We're the best state. Nobody even c...         1   
1   2  A man inserted an advertisement in the classif...         1   
2   3  How many men does it take to open a can of bee...         1   
3   4  Told my mom I hit 1200 Twitter followers. She ...         1   
4   5  Roses are dead. Love is fake. Weddings are bas...         1   

   humor_rating  humor_controversy  offense_rating  
0          2.42                1.0             0.2  
1          2.50                1.0             1.1  
2          1.95                0.0             2.4  
3          2.11                1.0             0.0  
4          2.78                0.0             0.1  
-------------------------------------------------------------------------
id                   False
text                 False
is_humor             False
humor_rating          True
humor_controversy     True
offense_rating       False
dtype: bool


In [7]:
# Relacing special symbols and digits in headline column
# re stands for Regular Expression
data['text'] = data['text'].apply(lambda s : re.sub('[^a-zA-Z]', ' ', s))

In [8]:
print(data.head())
print('-------------------------------------------------------------------------')
print(data.isnull().any(axis = 0))

# getting features and labels
features = data['text']
labels = data['is_humor']

   id                                               text  is_humor  \
0   1  TENNESSEE  We re the best state  Nobody even c...         1   
1   2  A man inserted an advertisement in the classif...         1   
2   3  How many men does it take to open a can of bee...         1   
3   4  Told my mom I hit      Twitter followers  She ...         1   
4   5  Roses are dead  Love is fake  Weddings are bas...         1   

   humor_rating  humor_controversy  offense_rating  
0          2.42                1.0             0.2  
1          2.50                1.0             1.1  
2          1.95                0.0             2.4  
3          2.11                1.0             0.0  
4          2.78                0.0             0.1  
-------------------------------------------------------------------------
id                   False
text                 False
is_humor             False
humor_rating          True
humor_controversy     True
offense_rating       False
dtype: bool


In [9]:
# Stemming our data
ps = PorterStemmer()
features = features.apply(lambda x: x.split())

features = features.apply(lambda x : ' '.join([ps.stem(word) for word in x]))
print(features)

0       tennesse We re the best state nobodi even come...
1       A man insert an advertis in the classifi wife ...
2       how mani men doe it take to open a can of beer...
3       told my mom I hit twitter follow she point out...
4       rose are dead love is fake wed are basic funer...
                              ...                        
7995    lack of awar of the pervas of racism in our so...
7996         whi are aspirin white becaus they work sorri
7997    today we american celebr our independ from bri...
7998    how to keep the fli off the bride at an italia...
7999    each ounc of sunflow seed give you of your dai...
Name: text, Length: 8000, dtype: object


In [10]:
# vectorizing the data with maximum of 5000 features
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 3223)
features = list(features)
features = tv.fit_transform(features).toarray()
print(features)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# getting training and testing data
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

In [15]:
# model 1:-
# Using linear support vector classifier
lsvc = LinearSVC()
# training the model
lsvc.fit(features_train, labels_train)
#getting the score of train and test data
print(lsvc.score(features_train, labels_train)) # 90.93
print(lsvc.score(features_test, labels_test))   # 83.75
# model 2:-
# Using Gaussuan Naive Bayes
# gnb = GaussianNB()
# gnb.fit(features_train, labels_train)
# print(gnb.score(features_train, labels_train))  # 78.86
# print(gnb.score(features_test, labels_test))    # 73.80
# # model 3:-
# # Logistic Regression
# lr = LogisticRegression()
# lr.fit(features_train, labels_train)
# print(lr.score(features_train, labels_train))   # 88.16
# print(lr.score(features_test, labels_test))     # 83.08
# # model 4:-
# # Random Forest Classifier
#rfc = RandomForestClassifier(n_estimators = 10, random_state = 0)
#rfc.fit(features_train, labels_train)
#print(rfc.score(features_train, labels_train))  # 98.82
#print(rfc.score(features_test, labels_test))    # 79.71

0.96328125
0.851875


In [16]:
eval_data = pd.read_csv('public_dev.csv')
print(eval_data)

       id                                               text  is_humor
0    8001  What's the difference between a Bernie Sanders...         1
1    8002     Vodka, whisky, tequila. I'm calling the shots.         0
2    8003     French people don't masturbate They Jacque off         1
3    8004  A lot of Suicide bombers are Muslims - I don't...         0
4    8005  What happens when you fingerbang a gypsy on he...         0
..    ...                                                ...       ...
995  8996  boss: what are you doing inventor of the bagpi...         0
996  8997  I told him his views were pretty extreme and i...         1
997  8998  "Mum, all the black kids call each other Nigga...         0
998  8999  In honor of Fathers Day, I'm gonna bring you "...         1
999  9000  I don't know why Coca-Cola and Pepsi are fight...         1

[1000 rows x 3 columns]


In [17]:
eval_df = eval_data['text'].apply(lambda s : re.sub('[^a-zA-Z]', ' ', s))
print(eval_df)
print("------------------------------------------------------------------")
eval_df_features = eval_df.apply(lambda x: x.split())

eval_df_features = eval_df_features.apply(lambda x : ' '.join([ps.stem(word) for word in x]))
print(eval_df_features)

0      What s the difference between a Bernie Sanders...
1         Vodka  whisky  tequila  I m calling the shots 
2         French people don t masturbate They Jacque off
3      A lot of Suicide bombers are Muslims   I don t...
4      What happens when you fingerbang a gypsy on he...
                             ...                        
995    boss  what are you doing inventor of the bagpi...
996    I told him his views were pretty extreme and i...
997     Mum  all the black kids call each other Nigga...
998    In honor of Fathers Day  I m gonna bring you  ...
999    I don t know why Coca Cola and Pepsi are fight...
Name: text, Length: 1000, dtype: object
------------------------------------------------------------------
0      what s the differ between a berni sander suppo...
1                 vodka whiski tequila I m call the shot
2              french peopl don t masturb they jacqu off
3      A lot of suicid bomber are muslim I don t blam...
4      what happen when you fingerbang

In [18]:
eval_df_features = list(eval_df_features)
tv1 = TfidfVectorizer(max_features = 3223)

eval_df_features = tv1.fit_transform(eval_df_features).toarray()
print(eval_df_features)
print(len(eval_df_features))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
1000


In [19]:
# eval_df_features=tv1.transform(eval_df_features)
#classifier.predict(X_test)
humor_predict = lsvc.predict(eval_df_features)

In [20]:
print(humor_predict)

[1 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 1 1
 0 1 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 0 0 1
 1 1 1 1 0 1 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 1
 1 1 0 1 1 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1
 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 0 0 0 0 1 1 0 1 1 1 1
 0 0 1 0 1 0 0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1
 1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1
 0 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 0 1 0 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1
 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0
 1 0 0 0 0 1 1 1 0 1 0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 1 0
 0 0 0 1 1 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 1 0 0 1 0 1
 1 0 1 0 0 1 0 1 1 0 1 0 