In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_table('../Datasets/liar_dataset/train.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

val_data = pd.read_table('../Datasets/liar_dataset/valid.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

test_data = pd.read_table('../Datasets/liar_dataset/test.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])
print (train_data.shape, val_data.shape, test_data.shape)
print (train_data.label.unique())
train_data.head()

(10240, 14) (1284, 14) (1267, 14)
['false' 'half-true' 'mostly-true' 'true' 'barely-true' 'pants-fire']


Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [2]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
id             10240 non-null object
label          10240 non-null object
statement      10240 non-null object
subject        10238 non-null object
speaker        10238 non-null object
job            7343 non-null object
state          8032 non-null object
party          10238 non-null object
barely-true    10238 non-null float64
false          10238 non-null float64
half-true      10238 non-null float64
mostly-true    10238 non-null float64
pants-fire     10238 non-null float64
venue          10138 non-null object
dtypes: float64(5), object(9)
memory usage: 1.1+ MB


In [3]:
#####################
### OUTPUT LABELS ###
#####################
y_label_dict = {"pants-fire" : 0, "false" : 0, "barely-true" : 0, "half-true" : 0, "mostly-true" : 0, "true" : 1}
print (y_label_dict)

train_data['output'] = train_data['label'].apply(lambda x: y_label_dict[x])
val_data['output'] = val_data['label'].apply(lambda x: y_label_dict[x])
test_data['output'] = test_data['label'].apply(lambda x: y_label_dict[x])

num_classes = 6

{'pants-fire': 0, 'false': 0, 'barely-true': 0, 'half-true': 0, 'mostly-true': 0, 'true': 1}


In [4]:
val_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue,output
0,12134.json,barely-true,We have less Americans working now than in the...,"economy,jobs",vicky-hartzler,U.S. Representative,Missouri,republican,1,0,1,0,0,an interview with ABC17 News,0
1,238.json,pants-fire,"When Obama was sworn into office, he DID NOT u...","obama-birth-certificate,religion",chain-email,,,none,11,43,8,5,105,,0
2,7891.json,false,Says Having organizations parading as being so...,"campaign-finance,congress,taxes",earl-blumenauer,U.S. representative,Oregon,democrat,0,1,1,1,0,a U.S. Ways and Means hearing,0
3,8169.json,half-true,Says nearly half of Oregons children are poor.,poverty,jim-francesconi,Member of the State Board of Higher Education,Oregon,none,0,1,1,1,0,an opinion article,0
4,929.json,half-true,On attacks by Republicans that various program...,"economy,stimulus",barack-obama,President,Illinois,democrat,70,71,160,163,9,interview with CBS News,0


In [5]:
test_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue,output
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview,1
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference,0
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.,0
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show,0
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video,0


In [6]:
X_train = train_data['statement']
Y_train = train_data['output']
X_test = test_data['statement']
Y_test = test_data['output']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_classification_model = Pipeline([('tfidf', TfidfVectorizer()), ('bayes', MultinomialNB())])
text_classification_model = text_classification_model.fit(X_train, Y_train)
predicted = text_classification_model.predict(X_test)
np.mean(predicted==Y_test)

0.8358326756116812

In [8]:
from sklearn.linear_model import SGDClassifier
tc_model_svm = Pipeline([('tfidf', TfidfVectorizer()), ('svm', SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2',random_state=42))])
tc_model_svm = tc_model_svm.fit(X_train, Y_train)
predicted_svm = tc_model_svm.predict(X_test)
np.mean(predicted_svm == Y_test)

0.8358326756116812