In [2]:
import pandas as pd
import random
import vocab_helpers as helper
import punctuation as punc
import re, emoji
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import ngrams

Data source---https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection --
Dataset author--- Rishabh Misra
Short description about Dataset: 
            News headlines Sarcasm dataset was developed from two news websites 'The Onion' and 'Huffpost'. First site provides the sarcastic or satirical way of current news and whereas the second provides non sarcastic news headlines.

In [3]:
#--------Inputing the file---
news = pd.read_json('news.json',lines= True)
news = news.rename(columns={'headline':'comment','is_sarcastic':'label'})
news = news[['label','comment']]
news = news.sample(n = 26709, replace = "False",random_state=2)
sample = news
sample.dropna(inplace=True)                         #--------removing NULL values in dataset if any---
sample.reset_index(drop=True, inplace=True)         #--------resetting the index after removing NULL values---

In [4]:
#--------Checking Class Balance in the dataset using crosstab-------
pd.crosstab(sample['label'], columns= "count")

col_0,count
label,Unnamed: 1_level_1
0,15072
1,11637


In [5]:
#---------Removing Numbers from the text and leaving all other features like capital words, punctuation marks etc as it plays role in sarcasm detection-------
sample['no_numb'] = sample.comment.str.replace('[0-9]','')

In [6]:
#--------Tokenizing the words for POS tag extration-----------------
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
sample['tokenized'] = sample['comment'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darshan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Darshan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
#---------Counting different POS tags present in the sentences---------

#----Adjective Count---
sample['adjective'] = sample['tokenized'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("JJ","JJR", "JJS")]))

#----Adverbs Count------
sample['adverb'] = sample['tokenized'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("RB", "RBR", "RBS")]))

In [8]:
#------Extrating Positive and Negative score from VaderSentiment module----
#------Refered from http://t-redactyl.io/blog/2017/04/applying-sentiment-analysis-with-vader-and-the-twitter-api.html

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

pos = []
neg = []

for i in range(0, len(sample)):
    pos.append(analyzer.polarity_scores(sample.comment[i])['pos'])
    neg.append(analyzer.polarity_scores(sample.comment[i])['neg'])

sample['positive_score'] = pos
sample['negative_score'] = neg

In [9]:
#------Function to Count Intensifier and Capital words-----
#------Refered from https://github.com/MirunaPislar/Sarcasm-Detection

def get_pragmatic_features(text_tokens):
    capitalized_words = intensifiers =tweet_len_ch= 0
    for t in text_tokens:
        tweet_len_ch += len(t)
        if t.isupper() and len(t) > 1:
            capitalized_words += 1       # count of capitalized words
        if t in helper.strong_negations:
            intensifiers += 1           # count-based feature of strong negations
        if t in helper.strong_affirmatives:
            intensifiers += 1           # count-based feature of strong affirmatives
        if t in helper.interjections:
            intensifiers += 1           # count-based feature of relevant interjections
        if t in helper.intensifiers:
            intensifiers += 1           # count-based feature of relevant intensifiers
    feature_list = {'capitalized': capitalized_words,'intensifiers': intensifiers}
    return feature_list

In [10]:
#-------Function to count number of punctuation marks used----
def get_punc_features(text_tokens):
    pat = r'[.?\",!*]+'
    punc_param = 0
    for t in text_tokens:
        pattern = re.findall(pat,t)
        for i in pattern:
            punc_param += 1           # count-based feature of strong negations
    punc_list = {'punc_param':punc_param}
    return punc_list

#-------Function to count Emoji-----------
def get_emoji_features(text_tokens):
    emo_pat = r'[:)\(]+'
    emoji_param = 0
    for t in text_tokens:
        pattern_e = re.findall(emo_pat,t)
        for i in pattern_e:
            emoji_param += 1           # count-based feature of strong negations
    emoji_list = {'emoji_param':emoji_param}
    return emoji_list

In [11]:
#-------Function to count Comparison words-----------

import comparison as comp
def get_like_features(text_tokens):
    like_param = 0
    for t in text_tokens:
        if t in comp.comparison:
            like_param += 1           # count-based feature of strong negations
    like_list = {'like_param':like_param}
    return like_list

In [12]:
#--------Calling functions for Capital, Intensifier, Comparison, Punctuation and Emoji-------

cap = []
intensifier = []
vs_comp = []
punc = []
emoji = []

for i in range(0, len(sample)):
        cap.append(get_pragmatic_features(sample.tokenized[i])['capitalized'])
        intensifier.append(get_pragmatic_features(sample.tokenized[i])['intensifiers'])
        vs_comp.append(get_like_features(sample.tokenized[i])['like_param'])
        punc.append(get_punc_features(sample.tokenized[i])['punc_param'])
        emoji.append(get_emoji_features(sample.tokenized[i])['emoji_param'])
    
sample['capital'] = cap
sample['intensifier'] = intensifier
sample['like_param'] = vs_comp
sample['punctuation'] = punc
sample['emoji'] = emoji


In [13]:
#---- Checking the presence of unigrams-------

from sklearn.feature_extraction.text import CountVectorizer
vectorizer1 = CountVectorizer(ngram_range=(1,1), max_features = 100, max_df=1.0, min_df=0.0)
count_vectors = vectorizer1.fit_transform(list(sample['no_numb']))

# reshape to pandas
from scipy import sparse
vectors = pd.DataFrame(count_vectors.todense())
vectors.columns = vectorizer1.get_feature_names()
unigram_features = vectorizer1.get_feature_names()
sample = pd.concat([sample.reset_index(drop=True),vectors.reset_index(drop=True)], axis=1)

In [14]:
#---- Checking the presence of bigrams-------

from sklearn.feature_extraction.text import CountVectorizer
vectorizer2 = CountVectorizer(ngram_range=(2,2), max_features = 50, max_df=1.0, min_df=0.0)
count_vectors2 = vectorizer2.fit_transform(list(sample['no_numb']))

# reshape to pandas
from scipy import sparse
vectors2 = pd.DataFrame(count_vectors2.todense())
vectors2.columns = vectorizer2.get_feature_names()
bigram_features = vectorizer2.get_feature_names()
sample = pd.concat([sample.reset_index(drop=True),vectors2.reset_index(drop=True)], axis=1)

In [15]:
model_input = sample[['label', 'adjective', 'adverb',
       'positive_score', 'negative_score', 'capital', 'intensifier',
       'like_param', 'punctuation', 'emoji']+unigram_features
                     +bigram_features]

In [16]:
from sklearn.model_selection import train_test_split  

#-------Separating Labels and Independent columns--------
X = model_input.drop('label', axis=1)  
y = model_input['label']  

#-------Dividing input into training and testing set------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [22]:
#-----------Parameter Tuning using RandomizedSearchCV-------------
#-----------refered from https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html--

from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

clf_r= SVC()  
param_dist_r = {"C": [1,2,3,4,5,6,7,8,9,10],
              "kernel": ["rbf"],
              "gamma":[0.1,0.2,0.3,0.4,0.5,0.6]
              }

# run randomized search
n_iter_search = 2
random_search_r = RandomizedSearchCV(clf_r, param_distributions=param_dist_r,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_rbf = random_search_r.fit(X_train, y_train)

In [23]:
from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

clf_l = SVC()  
param_dist_l = {"C": [1,2,3,4,5,6,7,8,9,10],
              "kernel": ["linear"]
              }

# run randomized search
n_iter_search = 2
random_search_l = RandomizedSearchCV(clf_l, param_distributions=param_dist_l,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_linear = random_search_l.fit(X_train, y_train)

In [24]:
print(random_result_rbf.best_params_)
print(random_result_rbf.best_score_)
print(random_result_linear.best_params_)
print(random_result_linear.best_score_)

{'kernel': 'rbf', 'gamma': 0.2, 'C': 8}
0.8372522449168773
{'kernel': 'linear', 'C': 3}
0.7701060712400329


In [16]:
#-------Modeling SVC with rbf kernel-------
from sklearn.svm import SVC  
from sklearn.metrics import classification_report

svclassifier = SVC(kernel='rbf',C=8,gamma=0.2, random_state= 22)  
rbf_model_s = svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

#-------Predicting Accuracy-----
from sklearn.metrics import accuracy_score
print('Accuracy_score:',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy_score: 0.8361784965558551
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      3737
           1       0.79      0.85      0.82      2941

   micro avg       0.84      0.84      0.84      6678
   macro avg       0.83      0.84      0.84      6678
weighted avg       0.84      0.84      0.84      6678



In [18]:
#-------Modeling SVC with linear kernel-------
from sklearn.svm import SVC  

svclassifier = SVC(kernel='linear',C=3, random_state= 22)  
linear_model_s = svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

#-------Predicting Accuracy-----
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy_score:',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy_score: 0.7680443246480982
              precision    recall  f1-score   support

           0       0.82      0.75      0.78      3737
           1       0.71      0.79      0.75      2941

   micro avg       0.77      0.77      0.77      6678
   macro avg       0.77      0.77      0.77      6678
weighted avg       0.77      0.77      0.77      6678



In [22]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(rbf_model_s, 'rbf_model_s.pkl') 

joblib.dump(linear_model_s, 'linear_model_s.pkl') 


# Load the pickle file
#clf_load = joblib.load('svc_model_mf.pkl')

['linear_model_s.pkl']

In [31]:
from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

rfclassifier = RandomForestClassifier()  
param_dist_rf = {'n_estimators':[100,400,600,1000,1200],'max_features':['sqrt'],'criterion':['gini','entropy'], 
               'max_depth' :[10,20,50,100,150] }

# run randomized search
n_iter_search = 2
random_search_rscv = RandomizedSearchCV(rfclassifier, param_distributions=param_dist_rf,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_rf = random_search_rscv.fit(X_train, y_train)

In [32]:
print(random_result_rf.best_params_)
print(random_result_rf.best_score_)

{'n_estimators': 100, 'max_features': 'sqrt', 'max_depth': 100, 'criterion': 'entropy'}
0.877989055847225


In [19]:
#--------Modelling Random Forest------

from sklearn.ensemble import RandomForestClassifier

# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, bootstrap = True,max_features = 'sqrt',criterion='entropy',max_depth=100, random_state = 22)
# Fit on training data
rf_model = model.fit(X_train,y_train)

rf_predictions = model.predict(X_test)
print('Accuracy_score:',accuracy_score(y_test,rf_predictions))

Accuracy_score: 0.8679245283018868


In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test,rf_predictions))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3737
           1       0.85      0.85      0.85      2941

   micro avg       0.87      0.87      0.87      6678
   macro avg       0.87      0.87      0.87      6678
weighted avg       0.87      0.87      0.87      6678



In [23]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(rf_model, 'rf_model_s.pkl')

['rf_model_s.pkl']

In [64]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LogisticRegression


lrclassifier1 = LogisticRegression()  
param_dist_lr1 = {'max_iter':[20,60,100,150,200,300,400,500],'solver':['newton-cg', 'sag','lbfgs'],'penalty':['l2']}

# run randomized search
n_iter_search = 2
random_search_lrcv1 = RandomizedSearchCV(lrclassifier1, param_distributions=param_dist_lr1,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False, n_jobs= -1, random_state= 22)

random_result_lr1 = random_search_lrcv1.fit(X_train, y_train)

In [65]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LogisticRegression


lrclassifier2 = LogisticRegression()  
param_dist_lr2 = {'max_iter':[20,60,100,150,200,300,400,500],'solver':['liblinear', 'saga'],'penalty':['l1']}

# run randomized search
n_iter_search = 2
random_search_lrcv2 = RandomizedSearchCV(lrclassifier2, param_distributions=param_dist_lr2,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False, n_jobs= -1, random_state= 22)

random_result_lr2 = random_search_lrcv2.fit(X_train, y_train)



In [66]:
print(random_result_lr1.best_params_)
print(random_result_lr1.best_score_)
print(random_result_lr2.best_params_)
print(random_result_lr2.best_score_)

{'solver': 'newton-cg', 'penalty': 'l2', 'max_iter': 150}
0.7707050978311668
{'solver': 'saga', 'penalty': 'l1', 'max_iter': 60}
0.7366578906284945


In [21]:
#--------Modelling Logistic Regression------
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='newton-cg', penalty='l2',max_iter=150, random_state= 223)

lr_model = classifier.fit(X_train, y_train)

lr_predictions = classifier.predict(X_test)
print('Accuracy_score:',accuracy_score(y_test,lr_predictions))
print(classification_report(y_test,lr_predictions))

Accuracy_score: 0.77088948787062
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      3737
           1       0.75      0.72      0.74      2941

   micro avg       0.77      0.77      0.77      6678
   macro avg       0.77      0.77      0.77      6678
weighted avg       0.77      0.77      0.77      6678

