In [32]:
import pandas as pd
import numpy as np

In [33]:
data = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines = True)

In [34]:
data.drop(['article_link'], axis=1)

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [35]:
import re, time
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [36]:
data['headline'] = data['headline'].apply(lambda s : re.sub('r[^a-zA-Z]+', '', s))

In [37]:
X = data['headline']
y = data['is_sarcastic']

In [38]:
X

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2          eat youveggies: 9 deliciously different recipes
3         inclement weatheprevents liafrom getting to work
4        mothecomes pretty close to using word 'streami...
                               ...                        
28614           jews to celebrate rosh hashasha osomething
28615    internal affairs investigatodisappointed consp...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object

In [39]:
ps = PorterStemmer()
X = X.apply(lambda x: x.split())
X = X.apply(lambda x : ' '.join([ps.stem(word) for word in x]))

In [40]:
X

0        thirtysometh scientist unveil doomsday clock o...
1        dem rep. total nail whi congress is fall short...
2                    eat youveggies: 9 delici differ recip
3                 inclement weatheprev liafrom get to work
4        mothecom pretti close to use word 'streaming' ...
                               ...                        
28614                  jew to celebr rosh hashasha osometh
28615    intern affair investigatodisappoint conspiraci...
28616    the most beauti accept speech thi week came fr...
28617    mar probe destroy by orbit spielberg-g space p...
28618                      dad clarifi thi not a food stop
Name: headline, Length: 28619, dtype: object

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 5000)
X = list(X)
X = tv.fit_transform(X).toarray()

In [42]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [44]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
print(lsvc.score(X_train, y_train))
print(lsvc.score(X_test, y_test))

0.9059355199403653
0.8190076869322153


In [54]:
lsvc.predict(tv.transform(['green']))[0]

0

In [50]:
import pickle

pickle.dump(tv, open("tfidf_sarcasm.pickle", "wb"))
pickle.dump(lsvc, open("lsvc_sarcasm.pickle", "wb"))

In [1]:
#when actually pickling the LSVC, train it on the whole dataset!