In [1]:
import pandas as pd 
import datetime
import numpy as np
import json
from pandas.io.json import json_normalize

from nltk.corpus import stopwords

import cufflinks as cf
cf.set_config_file(offline = True)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
import sklearn.feature_extraction
from sklearn.pipeline import Pipeline

import string  
import seaborn as sns
sns.set(style="white")

In [2]:
data = [json.loads(line) for line in open('news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json', 'r')]
data = json_normalize(data)
data = data.drop(['article_link'], axis=1)

In [3]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [4]:
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(data['headline'].values)

messages_bow = bow_transformer.transform(data['headline'].values)

In [5]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf_final = tfidf_transformer.transform(messages_bow)

## Naive Bayes Approach

In [6]:
spam_detect_model = MultinomialNB().fit(tfidf_final,data['is_sarcastic'])
all_predictions = spam_detect_model.predict(tfidf_final)

print (classification_report(data['is_sarcastic'], all_predictions))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92     14985
           1       0.96      0.82      0.89     11724

    accuracy                           0.91     26709
   macro avg       0.92      0.90      0.90     26709
weighted avg       0.91      0.91      0.91     26709



In [8]:
x_train, x_test, y_train, y_test = train_test_split(data['headline'], data['is_sarcastic'], test_size=0.2)
print (len(x_train), len(x_test), len(y_train) + len(y_test))

21367 5342 26709


## Logistic Regression Approach

In [9]:
pipeline = Pipeline([('bow',CountVectorizer(analyzer =text_process)),
                    ('tfidf',TfidfTransformer()),
                    ('classifier',LogisticRegression())])

In [10]:
pipeline.fit(x_train,y_train)





Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7f5e44161730>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                            

In [11]:
predictions = pipeline.predict(x_test)

In [12]:
print (classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83      3299
           1       0.71      0.82      0.76      2043

    accuracy                           0.80      5342
   macro avg       0.79      0.80      0.79      5342
weighted avg       0.81      0.80      0.80      5342

