In [12]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nisha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
np.random.seed(1111)

In [14]:
Corpus = pd.read_csv('train.csv',encoding='latin-1')

In [15]:
# Step - a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)
print(Corpus['text'])
#Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

0       TENNESSEE: We're the best state. Nobody even c...
1       A man inserted an advertisement in the classif...
2       How many men does it take to open a can of bee...
3       Told my mom I hit 1200 Twitter followers. She ...
4       Roses are dead. Love is fake. Weddings are bas...
                              ...                        
7995    Lack of awareness of the pervasiveness of raci...
7996      Why are aspirins white? Because they work sorry
7997    Today, we Americans celebrate our independence...
7998    How to keep the flies off the bride at an Ital...
7999    "Each ounce of sunflower seeds gives you 37% o...
Name: text, Length: 8000, dtype: object


In [16]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['is_humor'],test_size=0.3)

In [17]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [18]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [19]:
print(Tfidf_vect.vocabulary_)



In [20]:
print(Train_X_Tfidf)

  (0, 4621)	0.37939574757945677
  (0, 3861)	0.1902926961319235
  (0, 3013)	0.3130501195053941
  (0, 2948)	0.20553054543252014
  (0, 2331)	0.3547108071566338
  (0, 1886)	0.16237285677869215
  (0, 1796)	0.23320529720400054
  (0, 1680)	0.3084026129339185
  (0, 1570)	0.31824558982749407
  (0, 1553)	0.3615104729804161
  (0, 1303)	0.24254984489979237
  (0, 18)	0.29516484490635336
  (1, 4219)	0.4291981641103078
  (1, 3997)	0.3780235312329958
  (1, 3768)	0.3472266585623229
  (1, 3186)	0.4968618667337591
  (1, 2807)	0.40014738148320683
  (1, 1856)	0.20784341162864095
  (1, 1521)	0.31958139505087907
  (2, 4841)	0.5294267194308631
  (2, 4334)	0.6404124857098099
  (2, 2895)	0.3699867586794394
  (2, 2574)	0.41555865446593576
  (3, 4968)	0.12318433111725523
  (3, 4560)	0.1947006478500387
  :	:
  (5597, 4725)	0.4832889214454998
  (5597, 4394)	0.2841864600391989
  (5597, 3074)	0.25503415180640715
  (5597, 2468)	0.2671854471279554
  (5597, 2005)	0.3638435241405786
  (5597, 1687)	0.3246213983413133
  (5

In [21]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  81.45833333333333


In [22]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  83.45833333333333


In [23]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Corpus['is_humor'])

In [25]:
Corpus_1 = pd.read_csv('public_dev.csv',encoding='latin-1')

In [27]:
# Step - a : Remove blank rows if any.
Corpus_1['text'].dropna(inplace=True)
print(Corpus_1['text'])
#Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus_1['text'] = [entry.lower() for entry in Corpus_1['text']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus_1['text']= [word_tokenize(entry) for entry in Corpus_1['text']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus_1['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus_1.loc[index,'text_final'] = str(Final_words)

0      What's the difference between a Bernie Sanders...
1         Vodka, whisky, tequila. I'm calling the shots.
2         French people don't masturbate They Jacque off
3      A lot of Suicide bombers are Muslims - I don't...
4      What happens when you fingerbang a gypsy on he...
                             ...                        
995    boss: what are you doing inventor of the bagpi...
996    I told him his views were pretty extreme and i...
997    "Mum, all the black kids call each other Nigga...
998    In honor of Fathers Day, I'm gonna bring you "...
999    I don't know why Coca-Cola and Pepsi are fight...
Name: text, Length: 1000, dtype: object


In [28]:
Tfidf_vect = TfidfVectorizer(max_features=3223)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Corpus['text_final'])
Test_X_Tfidf = Tfidf_vect.transform(Corpus_1['text_final'])

In [30]:
print(Tfidf_vect.vocabulary_)



In [31]:
print(Train_X_Tfidf)

  (0, 3093)	0.2064065302698588
  (0, 2860)	0.677988157858026
  (0, 2710)	0.24587361230685795
  (0, 2552)	0.24888316665521046
  (0, 2427)	0.24587361230685795
  (0, 1987)	0.23343348029991481
  (0, 1947)	0.27678073585623253
  (0, 972)	0.20725358634478755
  (0, 567)	0.1847219674803227
  (0, 527)	0.253852710979787
  (0, 265)	0.20557647312825789
  (1, 3155)	0.16717947467163988
  (1, 3100)	0.17685464858374217
  (1, 2721)	0.28640072818759493
  (1, 2370)	0.235207379145334
  (1, 2323)	0.2930345506951854
  (1, 2315)	0.31623008328758023
  (1, 1934)	0.2361532339819564
  (1, 1823)	0.2930345506951854
  (1, 1733)	0.19218681717224548
  (1, 1141)	0.25082489464940155
  (1, 848)	0.24635803464649644
  (1, 761)	0.3228639057951707
  (1, 733)	0.18023902903967956
  (1, 181)	0.352693260895156
  :	:
  (7995, 157)	0.2034004771459316
  (7996, 3187)	0.5009223623873094
  (7996, 3148)	0.5815030740616536
  (7996, 2654)	0.6410389705136218
  (7997, 2924)	0.2776486007943055
  (7997, 2128)	0.3914938901336313
  (7997, 1437

In [32]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
print(predictions_NB)

[1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 0 1
 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 0 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1
 1 1 1 0 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0
 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 1 

In [33]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
print(predictions_SVM)

[1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1
 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1
 0 1 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 0 0 1
 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1
 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 0
 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1
 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 0 1 0 1
 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 0 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 1 