## **Text pre-processing & train/test set construction**

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import nltk
import itertools

In [2]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [3]:
# Read the data

In [4]:
with open("op_spam_v1.4/negative_polarity/deceptive_train.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

deceptive_train = pd.DataFrame(lines, columns=["Text"])
deceptive_train["Label"] = 0

In [5]:
with open("op_spam_v1.4/negative_polarity/truthful_train.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

truthful_train = pd.DataFrame(lines, columns=["Text"])
truthful_train["Label"] = 1

In [6]:
with open("op_spam_v1.4/negative_polarity/deceptive_test.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

deceptive_test = pd.DataFrame(lines, columns=["Text"])
deceptive_test["Label"] = 0

In [7]:
with open("op_spam_v1.4/negative_polarity/truthful_test.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

truthful_test = pd.DataFrame(lines, columns=["Text"])
truthful_test["Label"] = 1

In [8]:
# Concat deceptive and truthful dataframes

In [None]:
df = pd.concat([deceptive_train, truthful_train, deceptive_test, truthful_test], axis=0).reset_index(drop=True)
df
#640-800 = test set

#### Further text pre-processing 

- Tokenization
- Lower-casing
- Punctuation & Special character removal
- Spelling correction
- Stop-word removal
- (Stemming (Porter)) *Skip for now

In [3]:
from nltk.corpus import stopwords # stopwords.words('english')
import string
import re
from textblob import TextBlob

In [10]:
df["Text"] = df["Text"].apply(lambda x: [word for word in nltk.word_tokenize(x)]) # Tokenize
df["Text"] = df["Text"].apply(lambda x: [word.lower() for word in x]) # Apply lower-casing
df["Text"] = df["Text"].apply(lambda x: [word for word in x if word not in string.punctuation]) # Punctuation removal
df["Text"] = df["Text"].apply(lambda x: [word for word in x if word not in ' '.join(stopwords.words('english'))]) # Stop word removal
df["Text"] = df["Text"].apply(lambda x: [re.sub("(?:\W|\d)+", "", word) for word in x]) # Removing special chars and numbers
df["Text"] = df["Text"].apply(lambda x: [word for word in x if word != ""]) # Remove empty strings
df["Text"] = df["Text"].apply(lambda x: [str(TextBlob(word).correct()) for word in x]) # Spelling correction

In [4]:
# Unpickle the df. Pickled the df in case the kernel dies when fitting the models, because the spelling correction above takes a while to execute
df = pd.read_pickle("./df.pkl")
df.head()

Unnamed: 0,Text,Label
0,"[hotel, located, mile, train, station, quite, ...",0
1,"[made, reservation, hilton, chicago, believing...",0
2,"[people, think, hilton, think, luxury, know, w...",0
3,"[husband, recently, stayed, stayed, hilton, ch...",0
4,"[wife, booked, room, hilton, chicago, three, w...",0


In [5]:
# Convert back to str to construct dtm with sklearn CountVectorizer
df["Text"] = df["Text"].apply(lambda x: ' '.join(x))

In [6]:
df.head()

Unnamed: 0,Text,Label
0,hotel located mile train station quite like tr...,0
1,made reservation hilton chicago believing goin...,0
2,people think hilton think luxury know wish hal...,0
3,husband recently stayed stayed hilton chicago ...,0
4,wife booked room hilton chicago three weekend ...,0


In [7]:
print(df["Text"][6])

high hopes hilton chicago sad say disappointed outrageous expensive two people one night expect pay park car offer free wife instead pay get internet room wait pm check even though flight morning rent car airport hotel offer transportation stress hilton chicago hotel bar either doubt stay


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

#Set ngram=1,1, 1,2 and 2,2 (only bigrams) and min_df (float 0-1) for Naive Bayes when needed (use diff thresholds 0.005 increment starting at 1% -> 10%).
vectorizer = CountVectorizer(ngram_range=(1,1), min_df=1, lowercase=False)
X = vectorizer.fit_transform(df["Text"])
# X.toarray()

In [9]:
X.shape

(800, 6504)

In [10]:
vectorizer.get_feature_names_out()[5]

'abrupt'

#### Seperate train and test set

In [11]:
X_train, y_train = X[0:640], df["Label"][0:640].to_numpy()
X_test, y_test = X[640:], df["Label"][640:].to_numpy()

## **Models using unigram only**

In [12]:
from sklearn.model_selection import GridSearchCV

In [62]:
from sklearn.metrics import confusion_matrix

def performance_metrics(y_true, y_pred):
    cf_matrix = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cf_matrix.ravel()
    
    accuracy = (tn+tp)/(tn+tp+fp+fn)
    recall =  (tp)/(tp+fn)
    precision = (tp)/(tp+fp)
    f1 = 2 * ((precision*recall)/(precision+recall))
    
    print(cf_matrix)
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'F1: {f1:.2f}')
    
    return (accuracy, recall, precision, f1)

#### Naive Bayes

In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [94]:
def NB():
    
    # Find best threshold
    
    thresholds = list(range(1,11))
    cv_scores = []
    for thresh in thresholds:
        vectorizer = CountVectorizer(ngram_range=(1,1), min_df=thresh, lowercase=False)
        X = vectorizer.fit_transform(df["Text"]) 
        X_train = X[0:640] # Only need to change X_train/test, y_train/test remains same (only removing/adding columns, not rows)
        X_test = X[640:]
        score = cross_val_score(MultinomialNB(), X_train, y_train, n_jobs=-1)
        cv_scores.append((score.mean(), X_train, X_test, thresh))
    
    return max(cv_scores, key=lambda x: x[0]) # Return X_train/X_test for which CV score was best (best doc frequency threshold)
        
best_thresh = NB()  # (score.mean(), X_train, X_test, {ngram: (x,y), min_df: x})

In [95]:
best_thresh

(0.8421875,
 <640x1601 sparse matrix of type '<class 'numpy.int64'>'
 	with 37473 stored elements in Compressed Sparse Row format>,
 <160x1601 sparse matrix of type '<class 'numpy.int64'>'
 	with 8891 stored elements in Compressed Sparse Row format>,
 6)

In [96]:
# Retrain on whole training set
X_train_nb, X_test_nb = best_thresh[1], best_thresh[2]
nb_clf = MultinomialNB()
nb_clf.fit(X_train_nb, y_train) 

MultinomialNB()

In [101]:
y_pred = nb_clf.predict(X_test_nb)
performance_metrics(y_test, y_pred)

[[68 12]
 [12 68]]
Accuracy: 0.85
Recall: 0.85
Precision: 0.85
F1: 0.85


(0.85, 0.85, 0.85, 0.85)

In [102]:
_file = open('nb_uni.pkl', "wb")
pickle.dump(nb_clf, _file)
_file.close()

#### Logistic regression

In [66]:
from sklearn.linear_model import LogisticRegression

In [67]:
alphas = np.arange(0, 10, 0.01)
alphas = np.append(alphas, [10, 20, 30, 40, 50, 100])

params = {'C': alphas}

lr_clf = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'), param_grid=params, verbose=1, n_jobs=-1)

In [None]:
lr_clf.fit(X_train, y_train)

In [69]:
lr_clf.best_params_

{'C': 0.64}

In [70]:
y_pred = lr_clf.predict(X_test)
performance_metrics(y_test, y_pred)

[[68 12]
 [11 69]]
Accuracy: 0.86
Recall: 0.86
Precision: 0.85
F1: 0.86


(0.85625, 0.8625, 0.8518518518518519, 0.8571428571428572)

In [284]:
_file = open('lr_uni.pkl', "wb")
pickle.dump(lr_clf, _file)
_file.close()

#### Decision Tree

In [294]:
from sklearn.tree import DecisionTreeClassifier

In [295]:
# Hyperparameters to search
alphas = np.arange(0, 10, 0.01)
alphas = np.append(alphas, [10, 20, 30, 40, 50, 100])
alphas = [round(a, 2) for a in alphas]
nmin = list(range(2, 21))
minleaf = list(range(1, 11))

params = {'min_samples_split': nmin, 'min_samples_leaf': minleaf, 'ccp_alpha': alphas}

dt_clf = GridSearchCV(DecisionTreeClassifier(), param_grid=params, verbose=1, n_jobs=-1)

In [296]:
dt_clf.fit(X_train, y_train)

Fitting 5 folds for each of 191140 candidates, totalling 955700 fits


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'ccp_alpha': [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06,
                                       0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13,
                                       0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2,
                                       0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27,
                                       0.28, 0.29, ...],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14, 15, 16, 17, 18, 19,
                                               20]},
             verbose=1)

In [300]:
dt_clf.best_params_

{'ccp_alpha': 0.01, 'min_samples_leaf': 6, 'min_samples_split': 2}

In [302]:
_file = open('dt_uni.pkl', "wb")
pickle.dump(dt_clf, _file)
_file.close()

In [72]:
_file = open('dt_uni.pkl', 'rb')
dt_clf = pickle.load(_file)
_file.close()

In [73]:
y_pred = dt_clf.predict(X_test)
performance_metrics(y_test, y_pred)

[[40 40]
 [13 67]]
Accuracy: 0.67
Recall: 0.84
Precision: 0.63
F1: 0.72


(0.66875, 0.8375, 0.6261682242990654, 0.7165775401069518)

#### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
import itertools
from IPython.display import clear_output
import multiprocessing as mp

In [42]:
nmin = list(range(2, 21))
minleaf = list(range(1, 11))
ntrees = [50, 100, 200, 300, 500]
nfeats = list(range(10, 201, 10))

cartesian_product = list(itertools.product(nmin, minleaf, ntrees, nfeats))

params = [p for p in cartesian_product if p[1] * 2 <= p[0]]

def train_rf(params):
    
    # Estimate best RF hyperparameters using OOB performance instead of CV
    
    best_rf_clf = None
    i = 0
    for p in params:
        rf_clf = RandomForestClassifier(n_estimators=p[2], min_samples_split=p[0],
                                    min_samples_leaf=p[1], max_features=p[3],
                                    oob_score=True, n_jobs=-1)
        rf_clf.fit(X_train, y_train)
        
        if best_rf_clf is None:
            best_rf_clf = rf_clf
            
        if rf_clf.oob_score_ > best_rf_clf.oob_score_:
            best_rf_clf = rf_clf
            
        clear_output(wait=True)
    
        i+=1
        
        print(f'Iteration: {i}, OOB: {round(rf_clf.oob_score_, 2)}') 
    
    return (best_rf_clf, best_rf_clf.oob_score_)

best_rf_clf = train_rf(params)

Iteration: 10000, OOB: 0.81


In [44]:
len(params)

10000

In [45]:
best_rf_clf

(RandomForestClassifier(max_features=40, min_samples_leaf=3,
                        min_samples_split=12, n_estimators=500, n_jobs=-1,
                        oob_score=True),
 0.875)

In [47]:
_file = open('rf_uni.pkl', "wb")
pickle.dump(best_rf_clf, _file)
_file.close()

In [78]:
_file = open('rf_uni.pkl', 'rb')
rf_clf = pickle.load(_file)
_file.close()

In [80]:
y_pred = rf_clf[0].predict(X_test)
performance_metrics(y_test, y_pred)

[[66 14]
 [ 6 74]]
Accuracy: 0.88
Recall: 0.93
Precision: 0.84
F1: 0.88


(0.875, 0.925, 0.8409090909090909, 0.8809523809523809)

## **Models using bigrams**

#### Naive Bayes

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [113]:
def NB():
    
    # Find best threshold
    
    thresholds = list(range(1,11))
    cv_scores = []
    for thresh in thresholds:
        vectorizer = CountVectorizer(ngram_range=(1,2), min_df=thresh, lowercase=False)
        X = vectorizer.fit_transform(df["Text"]) 
        X_train = X[0:640] # Only need to change X_train/test, y_train/test remains same (only removing/adding columns, not rows)
        X_test = X[640:]
        score = cross_val_score(MultinomialNB(), X_train, y_train, n_jobs=-1)
        cv_scores.append((score.mean(), X_train, X_test, thresh))
    
    return max(cv_scores, key=lambda x: x[0]) # Return X_train/X_test for which CV score was best (best doc frequency threshold)
        
best_thresh = NB()  # (score.mean(), X_train, X_test, {ngram: (x,y), min_df: x})

In [114]:
best_thresh

(0.85,
 <640x2746 sparse matrix of type '<class 'numpy.int64'>'
 	with 45444 stored elements in Compressed Sparse Row format>,
 <160x2746 sparse matrix of type '<class 'numpy.int64'>'
 	with 10849 stored elements in Compressed Sparse Row format>,
 5)

In [115]:
# Retrain on whole training set
X_train_nb, X_test_nb = best_thresh[1], best_thresh[2]
nb_clf = MultinomialNB()
nb_clf.fit(X_train_nb, y_train) 

MultinomialNB()

In [116]:
y_pred = nb_clf.predict(X_test_nb)
performance_metrics(y_test, y_pred)

[[63 17]
 [ 4 76]]
Accuracy: 0.87
Recall: 0.95
Precision: 0.82
F1: 0.88


(0.86875, 0.95, 0.8172043010752689, 0.8786127167630059)

In [117]:
_file = open('nb_bi.pkl', "wb")
pickle.dump(nb_clf, _file)
_file.close()

#### Logistic Regression

In [131]:
from sklearn.linear_model import LogisticRegression

In [132]:
alphas = np.arange(0, 10, 0.01)
alphas = np.append(alphas, [10, 20, 30, 40, 50, 100])

params = {'C': alphas}

lr_clf = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'), param_grid=params, verbose=1, n_jobs=-1)

In [133]:
vectorizer_bi = CountVectorizer(ngram_range=(1,2), min_df=1, lowercase=False)
X_bi = vectorizer_bi.fit_transform(df["Text"])

In [134]:
X_train_lr, X_test_lr = X_bi[0:640], X_bi[640:]

In [None]:
lr_clf.fit(X_train_lr, y_train)

In [136]:
lr_clf.best_params_

{'C': 0.52}

In [137]:
y_pred = lr_clf.predict(X_test_lr)
performance_metrics(y_test, y_pred)

[[69 11]
 [ 9 71]]
Accuracy: 0.88
Recall: 0.89
Precision: 0.87
F1: 0.88


(0.875, 0.8875, 0.8658536585365854, 0.8765432098765432)

In [138]:
_file = open('lr_bi.pkl', "wb")
pickle.dump(lr_clf, _file)
_file.close()

In [164]:
lr_test = LogisticRegression(C=0.52, penalty='l1', solver='liblinear')
lr_test.fit(X_train_lr, y_train)

LogisticRegression(C=0.52, penalty='l1', solver='liblinear')

In [165]:
y_pred_test = lr_test.predict(X_test_lr)
performance_metrics(y_test, y_pred_test)

[[69 11]
 [ 9 71]]
Accuracy: 0.88
Recall: 0.89
Precision: 0.87
F1: 0.88


(0.875, 0.8875, 0.8658536585365854, 0.8765432098765432)

In [227]:
coefficients = lr_test.coef_.flatten()
nonzero_feature_idxs = lr_test.coef_.flatten().nonzero()[0]
nonzero_feature_idxs

array([ 1188,  1566,  2128,  2448,  2592,  2607,  3300,  4282,  4539,
        4768,  5651,  5670,  5937,  6085,  6604,  6682,  6968,  7589,
        7694,  7923,  8040,  8295,  8426,  8643,  8993,  9093,  9193,
        9223,  9250,  9568,  9836, 10076, 10417, 10667, 10865, 10964,
       11862, 12242, 12381, 12834, 13433, 13906, 14856, 14910, 14967,
       15040, 15694, 16004, 16341, 16428, 16591, 17560, 17739, 18332,
       18597, 18914, 19098, 19354, 19726, 19872, 20688, 21088, 21466,
       22092, 22333, 22547, 22668, 22768, 23020, 23121, 23375, 23757,
       24283, 25031, 25992, 26426, 27183, 27498, 27785, 27968, 28505,
       28782, 29101, 29336, 29436, 29678, 30095, 30220, 30238, 31074,
       31639, 31779, 32056, 32321, 33324, 33431, 33800, 34759, 34981,
       35141, 35468, 35632, 35700, 36888, 36910, 37065, 37127, 37602,
       37833, 37986, 38573, 38585, 38691, 39004, 39161, 39455, 39505,
       39632, 40007, 40085, 40432, 41728, 42009, 42416, 42552, 42908,
       43507, 44747,

In [229]:
len(nonzero_feature_idxs)

163

Only 163 features non-zero after training LR with lasso penalty

In [290]:
# Top 5 deceptive
top5_deceptive_coeffs = np.sort(coefficients[nonzero_feature_idxs])[0:5]
top5_deceptive_coeffs

array([-1.24203991, -0.97158517, -0.93682403, -0.90479762, -0.89272768])

In [303]:
# Top 5 deceptive idxs
top5_deceptive_idxs = []
for coeff in top5_deceptive_coeffs:
    top5_deceptive_idxs.append(np.where(coefficients == coeff)[0][0])
top5_deceptive_idxs

[23121, 39004, 28782, 8295, 17739]

In [308]:
vectorizer_bi.get_feature_names_out()[[23121, 39004, 28782, 8295, 17739]]

array(['hotel chicago', 'relax', 'luxury', 'chicago', 'finally'],
      dtype=object)

In [320]:
# Top 5 truthful
top5_truthful_coeffs = np.sort(coefficients[nonzero_feature_idxs])[-5:][::-1]
top5_truthful_coeffs

array([1.4309576 , 0.82024089, 0.80038729, 0.73804644, 0.73420736])

In [321]:
# Top 5 truthful
top5_truthful_idxs = []
for coeff in top5_truthful_coeffs:
    top5_truthful_idxs.append(np.where(coefficients == coeff)[0][0])
top5_truthful_idxs

[46381, 10417, 47429, 27968, 10865]

In [322]:
vectorizer_bi.get_feature_names_out()[[46381, 10417, 47429, 27968, 10865]]

array(['star', 'conference', 'street', 'location', 'converge'],
      dtype=object)

#### Decision Tree

In [139]:
from sklearn.tree import DecisionTreeClassifier

In [140]:
# Hyperparameters to search
alphas = np.arange(0, 10, 0.01)
alphas = np.append(alphas, [10, 20, 30, 40, 50, 100])
alphas = [round(a, 2) for a in alphas]
nmin = list(range(2, 21))
minleaf = list(range(1, 11))

params = {'min_samples_split': nmin, 'min_samples_leaf': minleaf, 'ccp_alpha': alphas}

dt_clf = GridSearchCV(DecisionTreeClassifier(), param_grid=params, verbose=1, n_jobs=-1)

In [141]:
vectorizer_bi = CountVectorizer(ngram_range=(1,2), min_df=1, lowercase=False)
X_bi = vectorizer_bi.fit_transform(df["Text"])

In [142]:
X_train_dt, X_test_dt = X_bi[0:640], X_bi[640:]

In [143]:
dt_clf.fit(X_train_dt, y_train)

Fitting 5 folds for each of 191140 candidates, totalling 955700 fits


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'ccp_alpha': [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06,
                                       0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13,
                                       0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2,
                                       0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27,
                                       0.28, 0.29, ...],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14, 15, 16, 17, 18, 19,
                                               20]},
             verbose=1)

In [144]:
dt_clf.best_params_

{'ccp_alpha': 0.01, 'min_samples_leaf': 9, 'min_samples_split': 2}

In [145]:
_file = open('dt_bi.pkl', "wb")
pickle.dump(dt_clf, _file)
_file.close()

In [146]:
y_pred = dt_clf.predict(X_test_dt)
performance_metrics(y_test, y_pred)

[[40 40]
 [13 67]]
Accuracy: 0.67
Recall: 0.84
Precision: 0.63
F1: 0.72


(0.66875, 0.8375, 0.6261682242990654, 0.7165775401069518)

#### Random Forest

In [118]:
from sklearn.ensemble import RandomForestClassifier
import itertools
from IPython.display import clear_output

In [122]:
nmin = list(range(2, 21))
minleaf = list(range(1, 11))
ntrees = [50, 100, 200, 300, 500]
nfeats = list(range(10, 500, 10))

cartesian_product = list(itertools.product(nmin, minleaf, ntrees, nfeats))

params = [p for p in cartesian_product if p[1] * 2 <= p[0]]

def train_rf(p):
    
    # Estimate best RF hyperparameters using OOB performance instead of CV
    
    vectorizer = CountVectorizer(ngram_range=(1,2), min_df=1, lowercase=False)
    X = vectorizer.fit_transform(df["Text"]) 
    X_train = X[0:640]
    
    best_rf_clf = None
    i = 0
    for p in params:
    
        rf_clf = RandomForestClassifier(n_estimators=p[2], min_samples_split=p[0],
                                    min_samples_leaf=p[1], max_features=p[3],
                                    oob_score=True, n_jobs=-1)
        rf_clf.fit(X_train, y_train)
    
        if best_rf_clf is None:
            best_rf_clf = rf_clf
            
        if rf_clf.oob_score_ > best_rf_clf.oob_score_:
            best_rf_clf = rf_clf
            
        clear_output(wait=True)
    
        i+=1
        
        print(f'Iteration: {i}, OOB: {rf_clf.oob_score_}')
    
    return best_rf_clf


best_rf_clf = train_rf(params)

Iteration: 24500, OOB: 0.834375


In [123]:
len(params)

24500

In [124]:
best_rf_clf

RandomForestClassifier(max_features=200, min_samples_leaf=3,
                       min_samples_split=6, n_estimators=500, n_jobs=-1,
                       oob_score=True)

In [125]:
vectorizer_bi = CountVectorizer(ngram_range=(1,2), min_df=1, lowercase=False)
X_bi = vectorizer_bi.fit_transform(df["Text"])

In [126]:
X_test_rf = X_bi[640:]

In [129]:
y_pred = best_rf_clf.predict(X_test_rf)
performance_metrics(y_test, y_pred)

[[62 18]
 [ 6 74]]
Accuracy: 0.85
Recall: 0.93
Precision: 0.80
F1: 0.86


(0.85, 0.925, 0.8043478260869565, 0.8604651162790697)

In [130]:
_file = open('rf_bi.pkl', "wb")
pickle.dump(best_rf_clf, _file)
_file.close()

## **Statistical significance testing**