In [3]:
import pandas as pd
import nltk
from nltk.probability import *
from itertools import chain

In [4]:
# Folder holding the pre-tokenized spreadsheets (relative to the notebook).
base_path = "tokenized_data/"

# Read the cleaned ISOT spreadsheet into a DataFrame.
train_data = pd.read_excel(base_path + "cleaned_ISOT_data.xlsx")

In [5]:
# Discard rows that have no article body.
train_data = train_data.dropna(axis="index", how="any", subset=["text"])

In [6]:
# Peek at the first five rows.
train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,4349,plans sanctions syria future treasury's mnuchin,palm beach fla reuters treasury secretary stev...,politicsnews,"april 7, 2017",1
1,15627,hezbollah saudi arabia forced lebanese quit,beirut reuters lebanon shi ite hezbollah sunda...,worldnews,"november 5, 2017",1
2,35496,brilliant liberal senator embarrass priest sch...,father robert sirico president co-founder acto...,politics,"apr 19, 2016",0
3,24375,tiffany trump mother expensive service rich,freelancers love entitled people entitled peop...,news,"january 18, 2017",0
4,19209,australian hindus protest meat advertisement f...,sydney reuters hundreds people attended rallie...,worldnews,"september 24, 2017",1


In [7]:
# Remove the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier export - confirm). Reassigning instead of using inplace=True, and
# passing errors='ignore', keeps this cell idempotent: re-running it after
# the column is already gone no longer raises KeyError.
train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Confirm the column was removed by viewing the first five rows.
train_data.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,plans sanctions syria future treasury's mnuchin,palm beach fla reuters treasury secretary stev...,politicsnews,"april 7, 2017",1
1,hezbollah saudi arabia forced lebanese quit,beirut reuters lebanon shi ite hezbollah sunda...,worldnews,"november 5, 2017",1
2,brilliant liberal senator embarrass priest sch...,father robert sirico president co-founder acto...,politics,"apr 19, 2016",0
3,tiffany trump mother expensive service rich,freelancers love entitled people entitled peop...,news,"january 18, 2017",0
4,australian hindus protest meat advertisement f...,sydney reuters hundreds people attended rallie...,worldnews,"september 24, 2017",1


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [13]:
# Exploratory TF-IDF fit over every article body in train_data.
v = TfidfVectorizer()
transformed_output = v.fit_transform(raw_documents=train_data['text'])

# Show the first ten (term, column index) pairs of the learned vocabulary.
print([*v.vocabulary_.items()][:10])

[('palm', 67998), ('beach', 8001), ('fla', 32482), ('reuters', 78242), ('treasury', 95107), ('secretary', 82746), ('steve', 88708), ('mnuchin', 60027), ('friday', 33958), ('told', 94141)]


In [15]:
all_feature_names = v.get_feature_names_out()

# Pair every vocabulary term with its inverse-document-frequency (IDF) value
# learned from the documents `v` was fitted on. These are per-term IDF values,
# not per-document TF-IDF weights.
# get_feature_names_out() is ordered by feature index, so position i lines up
# with v.idf_[i]; no per-word lookup through v.vocabulary_ is required.
indexes = [f"{word} {idf}" for word, idf in zip(all_feature_names, v.idf_)]

In [16]:
# First thirty "term idf" strings (terms are in alphabetical order).
indexes[0:30]

['aa 8.395448738691082',
 'aaa 9.20637895490741',
 'aaaaaaaand 10.998138424135465',
 'aaaaackkk 10.998138424135465',
 'aaaahhhh 10.998138424135465',
 'aaaand 10.998138424135465',
 'aaaarrgh 10.998138424135465',
 'aaab 10.998138424135465',
 'aaarf 10.998138424135465',
 'aab 10.998138424135465',
 'aaba 10.998138424135465',
 'aabfsv 10.998138424135465',
 'aabo 10.998138424135465',
 'aaccording 10.998138424135465',
 'aachen 10.5926733160273',
 'aadhaar 10.30499124357552',
 'aadhar 10.998138424135465',
 'aadl 10.5926733160273',
 'aaf 10.30499124357552',
 'aahd 10.08184769226131',
 'aai 10.998138424135465',
 'aaja 10.998138424135465',
 'aal 9.388700511701364',
 'aalberg 10.998138424135465',
 'aalberts 10.998138424135465',
 'aaldef 10.998138424135465',
 'aaliyah 10.998138424135465',
 'aamer 9.899526135467354',
 'aamin 10.30499124357552',
 'aammir 10.998138424135465']

In [19]:
# Features are the article bodies; targets are the 'label' column.
X, y = train_data['text'], train_data['label']

In [39]:
from sklearn.model_selection import train_test_split

In [41]:
# Hold out 20% of the rows for evaluation, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=7,
    test_size=0.2,
)

In [43]:
# Report how many documents landed in each split.
print("Shape of X_train: ", X_train.shape, sep=" ", end="\n")
print("Shape of X_test: ", X_test.shape, sep=" ", end="\n")

Shape of X_train:  (35176,)
Shape of X_test:  (8794,)


In [45]:
# Absolute number of training rows per label value.
y_train.value_counts(normalize=False)

label
0    18208
1    16968
Name: count, dtype: int64

In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# Evaluation report used for every classifier (the first classifier, KNeighborsClassifier, is imported above)
from sklearn.metrics import classification_report
# Second classifier for testing
from sklearn.naive_bayes import MultinomialNB
# Third classifier for testing
from sklearn.ensemble import RandomForestClassifier

### KNeighbors Classifier Results For ISOT

In [50]:
import numpy as np

In [52]:
# TF-IDF features feeding a k-nearest-neighbours classifier (library defaults)
clf_KNN = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])

In [54]:
# Train the KNN pipeline on the text-only training split
clf_KNN.fit(X=X_train, y=y_train)

In [56]:
# Predict labels for the held-out documents and keep them in y_pred_KNN
y_pred_KNN = clf_KNN.predict(X=X_test)

In [57]:
# Per-class precision / recall / F1 for the KNN predictions
print(classification_report(y_true=y_test, y_pred=y_pred_KNN))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86      4552
           1       0.82      0.93      0.87      4242

    accuracy                           0.87      8794
   macro avg       0.87      0.87      0.87      8794
weighted avg       0.87      0.87      0.87      8794



In [60]:
# Write true and predicted labels side by side, one test row per line.
np.savetxt(
    "KNN_pred_results.txt",
    X=np.column_stack([y_test, y_pred_KNN]),
    header="y_test y_pred",
    fmt="%s",
)

In [62]:
# First ten true labels of the test split (by position).
y_test.head(10)

5459     0
440      1
5226     1
32423    1
19734    1
33105    1
21743    0
32591    0
41727    0
4451     1
Name: label, dtype: int64

In [64]:
# First ten KNN predictions, for comparison with the true labels.
y_pred_KNN[0:10]

array([0, 1, 1, 1, 1, 1, 0, 0, 1, 1], dtype=int64)

### MultinomialNB Classifier Results For ISOT

In [67]:
# TF-IDF features feeding a multinomial naive Bayes classifier (library defaults)
clf_MNB = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
])

In [69]:
# Train the naive Bayes pipeline on the text-only training split
clf_MNB.fit(X=X_train, y=y_train)

In [70]:
# Predict labels for the held-out documents and keep them in y_pred_MNB
y_pred_MNB = clf_MNB.predict(X=X_test)

In [71]:
# Per-class precision / recall / F1 for the naive Bayes predictions
print(classification_report(y_true=y_test, y_pred=y_pred_MNB))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4552
           1       0.93      0.93      0.93      4242

    accuracy                           0.93      8794
   macro avg       0.93      0.93      0.93      8794
weighted avg       0.93      0.93      0.93      8794



In [72]:
# Write true and predicted labels side by side, one test row per line.
np.savetxt(
    "MNB_pred_results.txt",
    X=np.column_stack([y_test, y_pred_MNB]),
    header="y_test y_pred",
    fmt="%s",
)

### Random Forest Classifier Results For ISOT

In [78]:
# Create pipeline for Random Forest Classifier.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and per-split feature selection are randomized;
# without a seed the reported metrics change on every Restart & Run All.
clf_RFC = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=7))
])

In [80]:
# Train the random forest pipeline on the text-only training split
clf_RFC.fit(X=X_train, y=y_train)

In [81]:
# Predict labels for the held-out documents and keep them in y_pred_RFC
y_pred_RFC = clf_RFC.predict(X=X_test)

In [82]:
# Per-class precision / recall / F1 for the random forest predictions
print(classification_report(y_true=y_test, y_pred=y_pred_RFC))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4552
           1       0.99      0.99      0.99      4242

    accuracy                           0.99      8794
   macro avg       0.99      0.99      0.99      8794
weighted avg       0.99      0.99      0.99      8794



In [83]:
# Write true and predicted labels side by side, one test row per line.
np.savetxt(
    "RFC_pred_results.txt",
    X=np.column_stack([y_test, y_pred_RFC]),
    header="y_test y_pred",
    fmt="%s",
)

### Combine Title with Text to Improve Results

In [89]:
# Combine columns: title, a single space, then the article body.
# A missing title is replaced by an empty string first. NaN + str yields NaN,
# which would discard the article body for that row (and a later .astype(str)
# would turn the NaN into the literal token "nan").
train_data['Combined Text'] = train_data['title'].fillna('') + " " + train_data['text']

In [91]:
# Check the new 'Combined Text' column on the first five rows.
train_data.head(n=5)

Unnamed: 0,title,text,subject,date,label,Combined Text
0,plans sanctions syria future treasury's mnuchin,palm beach fla reuters treasury secretary stev...,politicsnews,"april 7, 2017",1,plans sanctions syria future treasury's mnuchi...
1,hezbollah saudi arabia forced lebanese quit,beirut reuters lebanon shi ite hezbollah sunda...,worldnews,"november 5, 2017",1,hezbollah saudi arabia forced lebanese quit be...
2,brilliant liberal senator embarrass priest sch...,father robert sirico president co-founder acto...,politics,"apr 19, 2016",0,brilliant liberal senator embarrass priest sch...
3,tiffany trump mother expensive service rich,freelancers love entitled people entitled peop...,news,"january 18, 2017",0,tiffany trump mother expensive service rich fr...
4,australian hindus protest meat advertisement f...,sydney reuters hundreds people attended rallie...,worldnews,"september 24, 2017",1,australian hindus protest meat advertisement f...


In [95]:
# Features are the title+text strings (cast to str); targets are unchanged.
X_combined, y_combined = (
    train_data['Combined Text'].astype(str),
    train_data['label'],
)

In [97]:
# 80/20 split of the title+text data with the same seed as the text-only split.
# Stratify on y_combined (the labels actually being split) rather than the
# unrelated global `y`, so this cell does not depend on state from the
# text-only section and stays correct if either label series changes.
X_combined_train, X_combined_test, y_combined_train, y_combined_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=7, stratify=y_combined)

### Combined KNeighbors Classifier Results For ISOT

In [100]:
# TF-IDF + k-nearest-neighbours pipeline for the title+text inputs
clf_Combined_KNN = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])

In [102]:
# Train on the title+text training split (X_combined_train / y_combined_train)
clf_Combined_KNN.fit(X=X_combined_train, y=y_combined_train)

In [103]:
# Predict labels for X_combined_test and keep them in y_pred_combined_KNN
y_pred_combined_KNN = clf_Combined_KNN.predict(X=X_combined_test)

In [104]:
# Per-class precision / recall / F1 for the combined-input KNN predictions
print(classification_report(y_true=y_combined_test, y_pred=y_pred_combined_KNN))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86      4552
           1       0.82      0.93      0.87      4242

    accuracy                           0.87      8794
   macro avg       0.87      0.87      0.87      8794
weighted avg       0.87      0.87      0.87      8794



In [105]:
# Write true and predicted labels side by side, one test row per line.
np.savetxt(
    "KNN_combined_pred_results.txt",
    X=np.column_stack([y_combined_test, y_pred_combined_KNN]),
    header="y_test y_pred",
    fmt="%s",
)

### Combined MultinomialNB Classifier Results For ISOT

In [111]:
# TF-IDF + multinomial naive Bayes pipeline for the title+text inputs
clf_Combined_MNB = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
])

In [113]:
# Train on the title+text training split (X_combined_train / y_combined_train)
clf_Combined_MNB.fit(X=X_combined_train, y=y_combined_train)

In [114]:
# Predict labels for X_combined_test and keep them in y_pred_combined_MNB
y_pred_combined_MNB = clf_Combined_MNB.predict(X=X_combined_test)

In [115]:
# Per-class precision / recall / F1 for the combined-input naive Bayes predictions
print(classification_report(y_true=y_combined_test, y_pred=y_pred_combined_MNB))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4552
           1       0.93      0.93      0.93      4242

    accuracy                           0.93      8794
   macro avg       0.93      0.93      0.93      8794
weighted avg       0.93      0.93      0.93      8794



In [116]:
# Write true and predicted labels side by side, one test row per line.
np.savetxt(
    "MNB_combined_pred_results.txt",
    X=np.column_stack([y_combined_test, y_pred_combined_MNB]),
    header="y_test y_pred",
    fmt="%s",
)

### Combined Random Forest Classifier Results For ISOT

In [118]:
# Create pipeline for Random Forest Classifier on the title+text inputs.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and per-split feature selection are randomized;
# without a seed the reported metrics change on every Restart & Run All.
clf_Combined_RFC = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=7))
])

In [119]:
# Train on the title+text training split (X_combined_train / y_combined_train)
clf_Combined_RFC.fit(X=X_combined_train, y=y_combined_train)

In [120]:
# Predict labels for X_combined_test and keep them in y_pred_combined_RFC
y_pred_combined_RFC = clf_Combined_RFC.predict(X=X_combined_test)

In [121]:
# Per-class precision / recall / F1 for the combined-input random forest predictions
print(classification_report(y_true=y_combined_test, y_pred=y_pred_combined_RFC))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4552
           1       0.99      0.99      0.99      4242

    accuracy                           0.99      8794
   macro avg       0.99      0.99      0.99      8794
weighted avg       0.99      0.99      0.99      8794



In [122]:
# Write true and predicted labels side by side, one test row per line.
np.savetxt(
    "RFC_combined_pred_results.txt",
    X=np.column_stack([y_combined_test, y_pred_combined_RFC]),
    header="y_test y_pred",
    fmt="%s",
)