In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the headlines dataset from the local JSON file into a DataFrame.
df = pd.read_json('sarcasm.json')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
# Dataset dimensions, plus the headline and its label at position 76 as a sample.
df.shape, df['headline'].iloc[76], df['is_sarcastic'].iloc[76]

((26709, 3), 'breast implants found to cause problems in laboratory mice', 1)

In [5]:
# Class balance: number of rows for each value of the label column.
df['is_sarcastic'].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [6]:
# First, strip every character that is neither a word character nor whitespace
# from the headlines.
# `regex=True` must be passed explicitly: without it newer pandas versions treat
# the pattern as a literal string and nothing is removed. The raw string keeps
# `\w` / `\s` from being read as (invalid) Python string escapes.
df['headline'] = df['headline'].str.replace(r'[^\w\s]', '', regex=True)

  df['headline'] = df['headline'].str.replace('[^\w\s]', '')


In [28]:
# The headlines look lower-cased already; normalise once more to be safe.
# Splitting and re-joining also collapses each run of whitespace to one space.
df['headline'] = df['headline'].map(
    lambda text: " ".join(token.lower() for token in text.split())
)

In [20]:
# Removing the stopwords
from nltk.corpus import stopwords

In [21]:
# English stop-word list taken from the NLTK stopwords corpus.
# NOTE(review): the list contains contractions such as "you're", while the
# headlines had non-word characters stripped in an earlier cell, so those
# entries presumably can never match — confirm whether the stop words should
# be normalised the same way.
stopword = stopwords.words('english')

In [26]:
# Display the stop-word list for inspection.
stopword

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [29]:
def remove_stopwords(sentence):
    """Drop every stop word from a whitespace-separated sentence.

    Args:
        sentence: Text to filter; it is tokenised with ``str.split()``.

    Returns:
        The tokens that are not in the notebook-level ``stopword`` collection,
        in their original order, joined by single spaces.
    """
    kept_tokens = [token for token in sentence.split() if token not in stopword]
    return " ".join(kept_tokens)

In [33]:
# Spot-check the helper on a single headline before applying it to the column.
remove_stopwords(df['headline'].iloc[887])

'group friends chanting shots make compelling point'

In [35]:
# Filter the stop words out of every headline.
df['headline'] = df['headline'].apply(remove_stopwords)

In [39]:
# Stem the words. This uses NLTK's Porter stemmer, so it is stemming, not lemmatization.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem(sentence):
    """Stem each whitespace-separated token of a sentence.

    Args:
        sentence: Text to process; it is tokenised with ``str.split()``.

    Returns:
        The result of ``stemmer.stem`` for every token, in order, joined by
        single spaces.
    """
    return " ".join(map(stemmer.stem, sentence.split()))

In [42]:
# Spot-check the stemming helper on a single headline.
stem(df['headline'].iloc[31])

'gillian jacob like kiss adam brodi'

In [44]:
# Stem every headline.
df['headline'] = df['headline'].apply(stem)

In [45]:
# Preview the headlines after the cleaning, stop-word and stemming steps.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versac store clerk sue secret black cod...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,roseann reviv catch thorni polit mood better wors,0
2,https://local.theonion.com/mom-starting-to-fea...,mom start fear son web seri closest thing gran...,1
3,https://politics.theonion.com/boehner-just-wan...,boehner want wife listen come altern debtreduc...,1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,jk rowl wish snape happi birthday magic way,0


In [46]:
## Feature engineering
# The processed headline text is the model input; the sarcasm flag is the target.
X = df['headline']
y = df['is_sarcastic']

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)

In [50]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count vectorizer: fitted on the training headlines only, then the fitted
# vectorizer is reused to transform the test headlines.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


# TF-IDF vectorizer: same fit-on-train / transform-on-test pattern.
tf = TfidfVectorizer()
X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

# Candidate classifiers, keyed by display name; each is built with no
# constructor arguments.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
}

def test_model():
    best_score = 0
    best = None
    
    for name, model in models.items():
        print("Count Vectorizer")
        model.fit(X_train_cv.toarray(), y_train)
        pred = model.predict(X_test_cv.toarray())
        acc = accuracy_score(pred, y_test)
        print("{} : {}".format(name, acc))
        print("TFid Vectorizer")
        
        if acc > best_score:
            best_score = acc
            best = (name, 'CV', acc)
        
        model.fit(X_train_tf.toarray(), y_train)
        pred = model.predict(X_test_tf.toarray())
        acc = accuracy_score(pred, y_test)
        print("{} : {}".format(name, acc))
        
        if acc > best_score:
            best_score = acc
            best = (name, 'Tfidf', acc)
        print("\n----------------------------------------------------\n")
    print("Best: ",best)

In [None]:
# Run the model comparison; per-model accuracies are printed as it goes.
test_model()

Count Vectorizer
Logistic Regression : 0.7873455634593786
TFid Vectorizer
Logistic Regression : 0.781604892050418

----------------------------------------------------

Count Vectorizer
