In [2]:
import pandas as pd
import numpy as np
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec,TaggedDocument 
import string as string

Data source---https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection --
Dataset author--- Rishabh Misra
Short description about Dataset: 
            News headlines Sarcasm dataset was developed from two news websites 'The Onion' and 'Huffpost'. First site provides the sarcastic or satirical way of current news and whereas the second provides non sarcastic news headlines.

In [3]:
#--------Inputing the file---
news = pd.read_json('news.json',lines= True)
news = news.rename(columns={'headline':'comment','is_sarcastic':'label'})
news = news[['label','comment']]
news = news.sample(n = 26709, replace = "False",random_state=2)
sample = news
sample.dropna(inplace=True)                         #--------removing NULL values in dataset if any---
sample.reset_index(drop=True, inplace=True)         #--------resetting the index after removing NULL values---

#--------Checking Class Balance in the dataset using crosstab-------
pd.crosstab(sample['label'],len(sample['label']))

col_0,26709
label,Unnamed: 1_level_1
0,15072
1,11637


In [4]:
#---------Removing Numbers from the text and leaving all other features like capital words, punctuation marks etc as it plays role in sarcasm detection-------
sample['no_numb'] = sample.comment.str.replace('[0-9]','')

In [5]:
#----Function to create Paragraph vector--------
#----Refered https://gist.github.com/susanli2016/801918d5ff37f52b55cce0a46088706c

from gensim.models import doc2vec

def tag_text(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [6]:
#-------Spliting the data to train and test also assigning tags to each row----------

from sklearn.model_selection import train_test_split  

X_train, X_test, y_train, y_test = train_test_split(sample.no_numb, sample.label, random_state=2, test_size=0.25)
X_train = tag_text(X_train, 'Train')
X_test = tag_text(X_test, 'Test')
all_text = X_train + X_test

In [7]:
#-------Building Paragraph vector based on Distributed Bag of words(PV-DBOW) model---

from tqdm import tqdm
dbow_method = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065, window = 11)
dbow_method.build_vocab([x for x in tqdm(all_text)])

100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1979391.93it/s]


In [8]:
#------Training the PV-DBOW model-----------
from sklearn import utils

for epoch in range(30):
    dbow_method.train(utils.shuffle([x for x in tqdm(all_text)]), total_examples=len(all_text), epochs=1)
    dbow_method.alpha -= 0.002
    dbow_method.min_alpha = dbow_method.alpha

100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 2213595.98it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1536092.17it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1978517.96it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1604975.22it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1477911.15it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1264155.47it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1391277.52it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1505822.51it/s]
100%|███████████████████████████████████

In [8]:
#-----Function to extract Vectors-----------
#-----Refered https://github.com/susanli2016/NLP-with-Python

def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [9]:
train_dbow = get_vectors(dbow_method, len(X_train), 300, 'Train')
test_dbow = get_vectors(dbow_method, len(X_test), 300, 'Test')

In [10]:
#-----------Parameter Tuning using RandomizedSearchCV-------------
#-----------refered from https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html--


from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

clf_r= SVC()  
param_dist_r = {"C": [1,2,3,4,5,6,7,8,9,10],
              "kernel": ["rbf"],
              "gamma":[0.1,0.2,0.3,0.4,0.5,0.6]
              }

# run randomized search
n_iter_search = 2
random_search_r = RandomizedSearchCV(clf_r, param_distributions=param_dist_r,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_rbf = random_search_r.fit(train_dbow,y_train)

In [11]:
from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

clf_l = SVC()  
param_dist_l = {"C": [1,2,3,4,5,6,7,8,9,10],
              "kernel": ["linear"]
              }

# run randomized search
n_iter_search = 2
random_search_l = RandomizedSearchCV(clf_l, param_distributions=param_dist_l,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_linear = random_search_l.fit(train_dbow,y_train)

In [12]:
print(random_result_rbf.best_params_)
print(random_result_rbf.best_score_)
print(random_result_linear.best_params_)
print(random_result_linear.best_score_)

{'kernel': 'rbf', 'gamma': 0.5, 'C': 5}
0.8947369896437749
{'kernel': 'linear', 'C': 3}
0.8126874267906


In [10]:
#-------Modeling SVC-------

from sklearn.svm import SVC 
from sklearn.metrics import classification_report

svclassifier = SVC(kernel='rbf',C=5,gamma=0.5)  
rbf_vector = svclassifier.fit(train_dbow, y_train)
y_pred = svclassifier.predict(test_dbow)

from sklearn.metrics import accuracy_score
print('Accuracy_score:',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy_score: 0.9126984126984127
              precision    recall  f1-score   support

           0       0.92      0.93      0.92      3766
           1       0.90      0.89      0.90      2912

   micro avg       0.91      0.91      0.91      6678
   macro avg       0.91      0.91      0.91      6678
weighted avg       0.91      0.91      0.91      6678



In [11]:
#-------Modeling SVC-------

from sklearn.svm import SVC 
from sklearn.metrics import classification_report

svclassifier = SVC(kernel='linear',C=3)  
linear_vector = svclassifier.fit(train_dbow, y_train)
y_pred = svclassifier.predict(test_dbow)

from sklearn.metrics import accuracy_score
print('Accuracy_score:',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy_score: 0.8153638814016172
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      3766
           1       0.80      0.77      0.78      2912

   micro avg       0.82      0.82      0.82      6678
   macro avg       0.81      0.81      0.81      6678
weighted avg       0.81      0.82      0.81      6678



In [19]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(rbf_vector, 'rbf_vector_s.pkl') 

#joblib.dump(linear_vector, 'linear_vector_s.pkl') 


['rbf_vector_s.pkl']

In [18]:

from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier


rfclassifier = RandomForestClassifier()  
param_dist_rf = {'n_estimators':[100,400,600,1000,1200],'max_features':['sqrt'],'criterion':['gini','entropy'], 
               'max_depth' :[10,20,50,100,150] }

# run randomized search
n_iter_search = 2
random_search_rscv = RandomizedSearchCV(rfclassifier, param_distributions=param_dist_rf,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_rf = random_search_rscv.fit(train_dbow,y_train)

print(random_result_rf.best_params_)
print(random_result_rf.best_score_)

{'n_estimators': 1000, 'max_features': 'sqrt', 'max_depth': 50, 'criterion': 'entropy'}
0.869384047037084


In [11]:
#--------Modelling Random Forest------
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Create the model with 1000 trees
model = RandomForestClassifier(n_estimators=1000, bootstrap = True, max_features = 'sqrt', max_depth=50, criterion='entropy')
# Fit on training data
rf_vector = model.fit(train_dbow,y_train)

rf_predictions = model.predict(test_dbow)
print('Accuracy_score:',accuracy_score(y_test,rf_predictions))
print(classification_report(y_test,rf_predictions))

Accuracy_score: 0.8845462713387242
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      3766
           1       0.89      0.84      0.86      2912

   micro avg       0.88      0.88      0.88      6678
   macro avg       0.89      0.88      0.88      6678
weighted avg       0.88      0.88      0.88      6678



In [20]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(rf_vector, 'rf_vector_s.pkl')

['rf_vector_s.pkl']

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LogisticRegression


lrclassifier1 = LogisticRegression()  
param_dist_lr1 = {'max_iter':[20,60,100,150,200,300,400,500],'solver':['newton-cg', 'sag','lbfgs'],'penalty':['l2']}

# run randomized search
n_iter_search = 2
random_search_lrcv1 = RandomizedSearchCV(lrclassifier1, param_distributions=param_dist_lr1,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False, n_jobs= -1, random_state= 22)

random_result_lr1 = random_search_lrcv1.fit(train_dbow,y_train)

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LogisticRegression


lrclassifier2 = LogisticRegression()  
param_dist_lr2 = {'max_iter':[20,60,100,150,200,300,400,500],'solver':['liblinear', 'saga'],'penalty':['l1']}

# run randomized search
n_iter_search = 2
random_search_lrcv2 = RandomizedSearchCV(lrclassifier2, param_distributions=param_dist_lr2,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False, n_jobs= -1, random_state= 22)

random_result_lr2 = random_search_lrcv2.fit(train_dbow,y_train)



In [24]:
print(random_result_lr1.best_params_)
print(random_result_lr1.best_score_)
print(random_result_lr2.best_params_)
print(random_result_lr2.best_score_)

{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 200}
0.8107084453076618
{'solver': 'saga', 'penalty': 'l1', 'max_iter': 60}
0.8100665767534858


In [12]:
#--------Modelling Logistic Regression------

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=200, random_state=22)
lr_vector = classifier.fit(train_dbow, y_train)

lr_predictions = classifier.predict(test_dbow)
print('Accuracy_score:',accuracy_score(y_test,lr_predictions))
print(classification_report(y_test,lr_predictions))

Accuracy_score: 0.8138664270739743
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      3766
           1       0.80      0.77      0.78      2912

   micro avg       0.81      0.81      0.81      6678
   macro avg       0.81      0.81      0.81      6678
weighted avg       0.81      0.81      0.81      6678



In [21]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(lr_vector, 'lr_vector_s.pkl')

['lr_vector_s.pkl']

In [13]:
#-------Building Paragraph vector based on Distributed memory(PV-DM) model---

dm_method = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=8, negative=5,
                   min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
dm_method.build_vocab([x for x in tqdm(all_text)])

100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 3800701.12it/s]


In [14]:
#------Training the PV-DM model-----------
for epoch in range(30):
    dm_method.train(utils.shuffle([x for x in tqdm(all_text)]), total_examples=len(all_text), epochs=1)
    dm_method.alpha -= 0.002
    dm_method.min_alpha = dm_method.alpha

100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 3334593.41it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1373873.75it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 1795311.87it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 3347547.15it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 4462996.12it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 4486949.39it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 3821184.48it/s]
100%|███████████████████████████████████████████████████████████████████████| 26709/26709 [00:00<00:00, 2200810.69it/s]
100%|███████████████████████████████████

In [15]:
train_dm = get_vectors(dm_method, len(X_train), 300, 'Train')
test_dm = get_vectors(dm_method, len(X_test), 300, 'Test')

In [31]:
from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

clf_r= SVC()  
param_dist_r = {"C": [1,2,3,4,5,6,7,8,9,10],
              "kernel": ["rbf"],
              "gamma":[0.1,0.2,0.3,0.4,0.5,0.6]
              }

# run randomized search
n_iter_search = 2
random_search_r = RandomizedSearchCV(clf_r, param_distributions=param_dist_r,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_rbf_dm = random_search_r.fit(train_dm,y_train)

In [32]:
from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

clf_l = SVC()  
param_dist_l = {"C": [1,2,3,4,5,6,7,8,9,10],
              "kernel": ["linear"]
              }

# run randomized search
n_iter_search = 2
random_search_l = RandomizedSearchCV(clf_l, param_distributions=param_dist_l,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_linear_dm = random_search_l.fit(train_dm,y_train)

In [33]:
print(random_result_rbf_dm.best_params_)
print(random_result_rbf_dm.best_score_)
print(random_result_linear_dm.best_params_)
print(random_result_linear_dm.best_score_)

{'kernel': 'rbf', 'gamma': 0.2, 'C': 6}
0.820335944438803
{'kernel': 'linear', 'C': 10}
0.7733743329810224


In [16]:
#-------Modeling SVC-------

from sklearn.svm import SVC 
from sklearn.metrics import classification_report

svclassifier = SVC(kernel='rbf',C=6,gamma=0.2)  
rbf_vector_dm_rbf = svclassifier.fit(train_dm, y_train)
y_pred = svclassifier.predict(test_dm)

from sklearn.metrics import accuracy_score
print('Accuracy_score:',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy_score: 0.8511530398322851
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      3766
           1       0.84      0.82      0.83      2912

   micro avg       0.85      0.85      0.85      6678
   macro avg       0.85      0.85      0.85      6678
weighted avg       0.85      0.85      0.85      6678



In [18]:
#-------Modeling SVC-------

from sklearn.svm import SVC 
from sklearn.metrics import classification_report

svclassifier = SVC(kernel='linear',C=10)  
rbf_vector_dm_linear = svclassifier.fit(train_dm, y_train)
y_pred = svclassifier.predict(test_dm)

from sklearn.metrics import accuracy_score
print('Accuracy_score:',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy_score: 0.7791254866726565
              precision    recall  f1-score   support

           0       0.79      0.83      0.81      3766
           1       0.76      0.71      0.74      2912

   micro avg       0.78      0.78      0.78      6678
   macro avg       0.78      0.77      0.77      6678
weighted avg       0.78      0.78      0.78      6678



In [22]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(rbf_vector_dm_rbf, 'rbf_vector_s_dm.pkl') 

#joblib.dump(rbf_vector_dm_linear, 'linear_vector_s_dm.pkl') 


['rbf_vector_s_dm.pkl']

In [38]:
from sklearn.svm import SVC  
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier


rfclassifier = RandomForestClassifier()  
param_dist_rf = {'n_estimators':[100,400,600,1000,1200],'max_features':['sqrt'],'criterion':['gini','entropy'], 
               'max_depth' :[10,20,50,100,150] }

# run randomized search
n_iter_search = 2
random_search_rscv = RandomizedSearchCV(rfclassifier, param_distributions=param_dist_rf,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False)

random_result_rf_dm = random_search_rscv.fit(train_dm,y_train)

print(random_result_rf_dm.best_params_)
print(random_result_rf_dm.best_score_)

{'n_estimators': 1000, 'max_features': 'sqrt', 'max_depth': 100, 'criterion': 'gini'}
0.7886181211768425


In [17]:
#--------Modelling Random Forest------
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Create the model with 1000 trees
model = RandomForestClassifier(n_estimators=1000, bootstrap = True, max_features = 'sqrt', max_depth=100, criterion='gini')
# Fit on training data
rf_vector_dm = model.fit(train_dm,y_train)

rf_predictions_dm = model.predict(test_dm)
print('Accuracy_score:',accuracy_score(y_test,rf_predictions_dm))
print(classification_report(y_test,rf_predictions_dm))

Accuracy_score: 0.7999401018268942
              precision    recall  f1-score   support

           0       0.79      0.88      0.83      3766
           1       0.82      0.70      0.75      2912

   micro avg       0.80      0.80      0.80      6678
   macro avg       0.80      0.79      0.79      6678
weighted avg       0.80      0.80      0.80      6678



In [23]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(rf_vector_dm, 'rf_vector_s_dm.pkl')

['rf_vector_s_dm.pkl']

In [43]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LogisticRegression


lrclassifier1 = LogisticRegression()  
param_dist_lr1 = {'max_iter':[20,60,100,150,200,300,400,500],'solver':['newton-cg', 'sag','lbfgs'],'penalty':['l2']}

# run randomized search
n_iter_search = 2
random_search_lrcv1 = RandomizedSearchCV(lrclassifier1, param_distributions=param_dist_lr1,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False, n_jobs= -1, random_state= 22)

random_result_lr1_dm = random_search_lrcv1.fit(train_dm,y_train)

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LogisticRegression


lrclassifier2 = LogisticRegression()  
param_dist_lr2 = {'max_iter':[20,60,100,150,200,300,400,500],'solver':['liblinear', 'saga'],'penalty':['l1']}

# run randomized search
n_iter_search = 2
random_search_lrcv2 = RandomizedSearchCV(lrclassifier2, param_distributions=param_dist_lr2,
                                   n_iter=n_iter_search, scoring= 'accuracy', cv=5, iid=False, n_jobs= -1, random_state= 22)

random_result_lr2_dm = random_search_lrcv2.fit(train_dm,y_train)



In [46]:
print(random_result_lr1_dm.best_params_)
print(random_result_lr1_dm.best_score_)
print(random_result_lr2_dm.best_params_)
print(random_result_lr2_dm.best_score_)

{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 200}
0.7687211256405599
{'solver': 'saga', 'penalty': 'l1', 'max_iter': 60}
0.7699513582086777


In [18]:
#--------Modelling Logistic Regression------

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='saga',penalty='l1',max_iter=60, random_state=22)
lr_vector_dm = classifier.fit(train_dm, y_train)

lr_predictions_dm = classifier.predict(test_dm)
print('Accuracy_score:',accuracy_score(y_test,lr_predictions_dm))
print(classification_report(y_test,lr_predictions_dm))



Accuracy_score: 0.7707397424378556
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      3766
           1       0.76      0.70      0.73      2912

   micro avg       0.77      0.77      0.77      6678
   macro avg       0.77      0.76      0.76      6678
weighted avg       0.77      0.77      0.77      6678

