In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np


In [None]:
!pip install gdown==4.6.0
!gdown --folder https://drive.google.com/drive/u/1/folders/15Wn46r7gidaiZbx2ArFYsd7rjYH4y7JM

In [None]:

# Read the fake and the true news articles from the News_dataset directory.
news_dir = os.path.join('./', 'News_dataset')
csv_path_fake = os.path.join(news_dir, 'Fake.csv')
csv_path_true = os.path.join(news_dir, 'True.csv')

df_fake, df_true = map(pd.read_csv, (csv_path_fake, csv_path_true))


In [None]:
# Tag every article with its class; the column is added in place on both frames.
for frame, tag in ((df_fake, 'fake'), (df_true, 'true')):
    frame['label'] = tag

# Stack the fake articles on top of the true ones under a fresh 0..n-1 index.
df = pd.concat([df_fake, df_true], ignore_index=True)

# Persist the merged table (index omitted) so the next cell can reload it.
df.to_csv('news.csv', index=False)

In [None]:
# Reload the merged news table from disk.
df = pd.read_csv('news.csv')

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.1,
    random_state=7,
)

# TF-IDF features: English stop words removed, terms present in more than 90%
# of the documents ignored.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# The vocabulary is learned from the training texts only and then applied
# unchanged to the test texts.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
def calculate_and_save_metrics(y_test, y_pred, model_name, label, first = False, last = False, results = None, savename = ''):
    """Print and record the classification scores of one model.

    Accuracy and the weighted precision, recall and F1 are printed together
    with the confusion matrix, then stored as the row ``model_name`` of a
    results table.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    model_name : str
        Index label of the row written to the results table.
    label : list
        Class order for the rows/columns of the printed confusion matrix.
    first : bool, default False
        Start a new results table instead of extending ``results``.
    last : bool, default False
        Write the table to ``savename`` as CSV after adding the row.
    results : pandas.DataFrame or None
        Table to extend. It is modified in place. When it is None a new
        table is started, exactly as with ``first=True``.
    savename : str
        Destination CSV path; required when ``last`` is True.

    Returns
    -------
    pandas.DataFrame
        The results table including the row for ``model_name``.

    Raises
    ------
    ValueError
        If ``last`` is True but ``savename`` is empty.
    """
    # Fail before doing any work instead of with an obscure I/O error at the end.
    if last and not savename:
        raise ValueError("savename must be a non-empty path when last=True")

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same scores as the default ('warn') but without
    # the UndefinedMetricWarning raised when a class is never predicted.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1: ', f1)

    print(confusion_matrix(y_test, y_pred, labels=label))

    scores = [accuracy, precision, recall, f1]
    # A missing table is treated like first=True; previously this path crashed
    # with AttributeError on `None.loc`.
    if first or results is None:
        results = pd.DataFrame([scores], columns=['accuracy', 'precision', 'recall', 'f1'], index=[model_name])
    else:
        results.loc[model_name] = scores

    if last:
        results.to_csv(savename)

    return results



In [None]:
# Class order used for the rows/columns of the printed confusion matrices.
label = ['fake', 'true']

# PassiveAggressiveClassifier shuffles the training data between epochs, so a
# fixed random_state is needed for the scores to be reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=0)
pac.fit(tfidf_train, y_train)

# Predict the held-out set.
y_pred = pac.predict(tfidf_test)

# first=True starts a new results table for this dataset.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='PassiveAggressiveClassifier',
    label=label,
    first=True,
)

Accuracy:  0.9944320712694877
Precision:  0.9944339701357502
Recall:  0.9944320712694877
F1:  0.9944317848085675
[[2336   10]
 [  15 2129]]


In [None]:
# Logistic regression with default settings.
# fit() returns the estimator, so `logreg` is the fitted model.
logreg = LogisticRegression().fit(tfidf_train, y_train)
y_pred = logreg.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='LogisticRegression',
    label=label,
    results=results,
)

Accuracy:  0.9866369710467706
Precision:  0.9866368213705528
Recall:  0.9866369710467706
F1:  0.9866367000464401
[[2317   29]
 [  31 2113]]


In [None]:
# Multinomial naive Bayes with default settings.
# fit() returns the estimator, so `nb` is the fitted model.
nb = MultinomialNB().fit(tfidf_train, y_train)
y_pred = nb.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='MultinomialNB',
    label=label,
    results=results,
)

Accuracy:  0.9396436525612473
Precision:  0.939656760061686
Recall:  0.9396436525612473
F1:  0.9396295976850317
[[2221  125]
 [ 146 1998]]


In [None]:
# k-nearest neighbours with default settings.
# fit() returns the estimator, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(tfidf_train, y_train)
y_pred = knn.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='KNeighborsClassifier',
    label=label,
    results=results,
)


Accuracy:  0.6561247216035635
Precision:  0.773658716218475
Recall:  0.6561247216035635
F1:  0.6060047436568158
[[2319   27]
 [1517  627]]


In [None]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed to make the reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train, y_train)
y_pred = rf.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='RandomForestClassifier',
    label=label,
    results=results,
)

Accuracy:  0.9904231625835189
Precision:  0.9904265610194284
Recall:  0.9904231625835189
F1:  0.9904236313996031
[[2322   24]
 [  19 2125]]


In [None]:
# Linear support vector machine; dual='auto' lets scikit-learn choose the
# formulation and random_state fixes the solver's seed.
svc = LinearSVC(random_state=0, dual='auto').fit(tfidf_train, y_train)
y_pred = svc.predict(tfidf_test)

# Last model for this dataset: add its row and write the table to CSV.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='LinearSVC',
    label=label,
    results=results,
    last=True,
    savename='results_kaggle.csv',
)

Accuracy:  0.9946547884187082
Precision:  0.9946576855359367
Recall:  0.9946547884187082
F1:  0.9946544568146701
[[2337    9]
 [  15 2129]]


In [None]:
# LIAR dataset: tab-separated train/test files read without a header row.
liar_dir = os.path.join('./', 'liar_dataset')
csv_path_liar_train = os.path.join(liar_dir, 'train.tsv')
csv_path_liar_test = os.path.join(liar_dir, 'test.tsv')

df_liar_train = pd.read_csv(csv_path_liar_train, sep='\t', header=None)
df_liar_test = pd.read_csv(csv_path_liar_test, sep='\t', header=None)

# The files carry no column names, so the same names are assigned to both frames.
liar_columns = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party',
    'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context',
]
df_liar_train.columns = liar_columns
df_liar_test.columns = liar_columns

# TF-IDF on the statement text only; the vocabulary comes from the training file.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
tfidf_train = tfidf_vectorizer.fit_transform(df_liar_train['statement'])
tfidf_test = tfidf_vectorizer.transform(df_liar_test['statement'])

# Class order used for the rows/columns of the printed confusion matrices.
label = ['true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire']

In [None]:
# PassiveAggressiveClassifier shuffles the training data between epochs, so a
# fixed random_state is needed for the scores to be reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0)
pac.fit(tfidf_train, df_liar_train['label'])

# Predict the LIAR test file.
y_pred = pac.predict(tfidf_test)

# first=True starts a new results table for this dataset.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='PassiveAggressiveClassifier',
    label=label,
    first=True,
)

Accuracy:  0.23362273086029992
Precision:  0.23245313772747705
Recall:  0.23362273086029992
F1:  0.2323593773721207
[[43 38 46 20 52  9]
 [50 52 50 37 37 15]
 [50 56 58 40 40 21]
 [33 45 29 38 46 21]
 [36 49 34 34 86 10]
 [13 10 15 15 20 19]]


In [None]:
# Logistic regression, allowed up to 1000 solver iterations.
# fit() returns the estimator, so `logreg` is the fitted model.
logreg = LogisticRegression(max_iter=1000).fit(tfidf_train, df_liar_train['label'])
y_pred = logreg.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LogisticRegression',
    label=label,
    results=results,
)


Accuracy:  0.250197316495659
Precision:  0.24541341568325675
Recall:  0.250197316495659
F1:  0.24157143838446754
[[47 50 46 13 50  2]
 [50 56 65 25 44  1]
 [28 65 84 37 47  4]
 [22 37 53 44 55  1]
 [29 50 57 26 84  3]
 [ 7 14 25 13 31  2]]


In [None]:
# Multinomial naive Bayes with default settings.
# fit() returns the estimator, so `nb` is the fitted model.
nb = MultinomialNB().fit(tfidf_train, df_liar_train['label'])
y_pred = nb.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='MultinomialNB',
    label=label,
    results=results,
)

Accuracy:  0.23756906077348067
Precision:  0.2308894522430316
Recall:  0.23756906077348067
F1:  0.20623444528178886
[[ 15  59  79   3  52   0]
 [ 14  65 115   7  40   0]
 [  7  68 126  17  47   0]
 [  8  37  93  18  56   0]
 [ 12  51  99  10  77   0]
 [  2  18  36   7  29   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# k-nearest neighbours with default settings.
# fit() returns the estimator, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(tfidf_train, df_liar_train['label'])
y_pred = knn.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='KNeighborsClassifier',
    label=label,
    results=results,
)

Accuracy:  0.21862667719021311
Precision:  0.2148326623861786
Recall:  0.21862667719021311
F1:  0.2133969072351237
[[31 48 39 33 48  9]
 [34 46 56 39 59  7]
 [24 49 64 59 63  6]
 [18 29 41 54 61  9]
 [26 38 52 46 76 11]
 [ 9  6 19 25 27  6]]


In [None]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed to make the reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train, df_liar_train['label'])
y_pred = rf.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='RandomForestClassifier',
    label=label,
    results=results,
)

Accuracy:  0.2565114443567482
Precision:  0.26151832390741203
Recall:  0.2565114443567482
F1:  0.24337528869577962
[[ 28  61  37  12  68   2]
 [ 32  72  56  20  60   1]
 [ 21  69  81  15  75   4]
 [ 17  35  44  36  80   0]
 [ 18  40  56  26 103   6]
 [  6  14  11  11  45   5]]


In [None]:
# Linear support vector machine; dual='auto' lets scikit-learn choose the
# formulation and random_state fixes the solver's seed.
svc = LinearSVC(random_state=0, dual='auto').fit(tfidf_train, df_liar_train['label'])
y_pred = svc.predict(tfidf_test)

# Last model for this dataset: add its row and write the table to CSV.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LinearSVC',
    label=label,
    results=results,
    last=True,
    savename='results_liar.csv',
)

Accuracy:  0.24546172059984214
Precision:  0.24380461077723303
Recall:  0.24546172059984214
F1:  0.24410137719093702
[[53 44 51 15 38  7]
 [52 49 53 36 42  9]
 [31 68 67 51 38 10]
 [24 31 47 56 44 10]
 [35 47 46 30 76 15]
 [10 12 21 17 22 10]]


In [None]:
# Collapse the LIAR labels into a binary target:
#   'true', 'mostly-true'                -> 'true'
#   'false', 'barely-true', 'pants-fire' -> 'false'
# 'half-true' belongs to neither group, so those rows are removed. Leaving them
# in (as before) kept a third class in the target: it counted towards accuracy,
# precision, recall and F1 but was silently left out of the 2x2 confusion matrix.
# NOTE(review): mapping 'half-true' to 'true' instead is a common alternative.
binary_label_map = {'mostly-true': 'true', 'barely-true': 'false', 'pants-fire': 'false'}

# .copy() gives independent frames, so the label assignment below is safe.
df_liar_train = df_liar_train[df_liar_train['label'] != 'half-true'].copy()
df_liar_test = df_liar_test[df_liar_test['label'] != 'half-true'].copy()

df_liar_train['label'] = df_liar_train['label'].replace(binary_label_map)
df_liar_test['label'] = df_liar_test['label'].replace(binary_label_map)

# Rebuild the TF-IDF matrices so their rows match the filtered frames.
tfidf_train = tfidf_vectorizer.fit_transform(df_liar_train['statement'])
tfidf_test = tfidf_vectorizer.transform(df_liar_test['statement'])

# Class order used for the rows/columns of the printed confusion matrices.
label = ['true', 'false']

In [None]:
# PassiveAggressiveClassifier shuffles the training data between epochs, so a
# fixed random_state is needed for the scores to be reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0)
pac.fit(tfidf_train, df_liar_train['label'])

# Predict the LIAR test file.
y_pred = pac.predict(tfidf_test)

# first=True starts a new results table for this experiment.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='PassiveAggressiveClassifier',
    label=label,
    first=True,
)

Accuracy:  0.4127861089187056
Precision:  0.41427441941641197
Recall:  0.4127861089187056
F1:  0.41300643629512673
[[196 151]
 [192 268]]


In [None]:
# Logistic regression, allowed up to 1000 solver iterations.
# fit() returns the estimator, so `logreg` is the fitted model.
logreg = LogisticRegression(max_iter=1000).fit(tfidf_train, df_liar_train['label'])
y_pred = logreg.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LogisticRegression',
    label=label,
    results=results,
)

Accuracy:  0.4846093133385951
Precision:  0.44625772367506816
Recall:  0.4846093133385951
F1:  0.4455332077814675
[[217 213]
 [149 381]]


In [None]:
# Multinomial naive Bayes with default settings.
# fit() returns the estimator, so `nb` is the fitted model.
nb = MultinomialNB().fit(tfidf_train, df_liar_train['label'])
y_pred = nb.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='MultinomialNB',
    label=label,
    results=results,
)

Accuracy:  0.4996053670086819
Precision:  0.603060107348248
Recall:  0.4996053670086819
F1:  0.4327770131024204
[[193 256]
 [114 439]]


In [None]:
# k-nearest neighbours with default settings.
# fit() returns the estimator, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(tfidf_train, df_liar_train['label'])
y_pred = knn.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='KNeighborsClassifier',
    label=label,
    results=results,
)

Accuracy:  0.4506708760852407
Precision:  0.4352822744630089
Recall:  0.4506708760852407
F1:  0.43511527535387057
[[164 227]
 [122 355]]


In [None]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed to make the reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train, df_liar_train['label'])
y_pred = rf.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='RandomForestClassifier',
    label=label,
    results=results,
)

Accuracy:  0.48303078137332284
Precision:  0.4685549554923833
Recall:  0.48303078137332284
F1:  0.4288359724279499
[[205 240]
 [148 401]]


In [None]:
# Linear support vector machine; dual='auto' lets scikit-learn choose the
# formulation and random_state fixes the solver's seed.
svc = LinearSVC(random_state=0, dual='auto').fit(tfidf_train, df_liar_train['label'])
y_pred = svc.predict(tfidf_test)

# Last model for this experiment: add its row and write the table to CSV.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LinearSVC',
    label=label,
    results=results,
    last=True,
    savename='results_liar_binary.csv',
)

Accuracy:  0.4554064719810576
Precision:  0.43187177447611486
Recall:  0.4554064719810576
F1:  0.4394366790071621
[[213 178]
 [167 329]]


In [None]:
# WELFake dataset: load the CSV and discard rows whose 'text' is missing,
# since the vectorizer needs a string for every document.
csv_path_welfake = os.path.join('./', 'WELFake', 'WELFake_Dataset.csv')
df_welfake = pd.read_csv(csv_path_welfake).dropna(subset=['text'])

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_welfake['text'],
    df_welfake['label'],
    test_size=0.1,
    random_state=7,
)


In [None]:
# TF-IDF features: English stop words removed, terms present in more than 90%
# of the documents ignored.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# The vocabulary is learned from the training texts only and then applied
# unchanged to the test texts.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Class order used for the rows/columns of the printed confusion matrices.
label = [0, 1]

In [None]:
# PassiveAggressiveClassifier shuffles the training data between epochs, so a
# fixed random_state is needed for the scores to be reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=0)
pac.fit(tfidf_train, y_train)

# Predict the held-out set.
y_pred = pac.predict(tfidf_test)

# first=True starts a new results table for this dataset.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='PassiveAggressiveClassifier',
    label=label,
    first=True,
)

Accuracy:  0.9588072122052704
Precision:  0.9588822640330955
Recall:  0.9588072122052704
F1:  0.9588004132730461
[[3369  173]
 [ 124 3544]]


In [None]:
# Logistic regression with default settings.
# fit() returns the estimator, so `logreg` is the fitted model.
logreg = LogisticRegression().fit(tfidf_train, y_train)
y_pred = logreg.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='LogisticRegression',
    label=label,
    results=results,
)

Accuracy:  0.9439667128987518
Precision:  0.9443510758208267
Recall:  0.9439667128987518
F1:  0.9439387007893646
[[3285  257]
 [ 147 3521]]


In [None]:
# Multinomial naive Bayes with default settings.
# fit() returns the estimator, so `nb` is the fitted model.
nb = MultinomialNB().fit(tfidf_train, y_train)
y_pred = nb.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='MultinomialNB',
    label=label,
    results=results,
)

Accuracy:  0.8690707350901525
Precision:  0.8690674737650315
Recall:  0.8690707350901525
F1:  0.8690680341744951
[[3066  476]
 [ 468 3200]]


In [None]:
# k-nearest neighbours with default settings.
# fit() returns the estimator, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(tfidf_train, y_train)
y_pred = knn.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='KNeighborsClassifier',
    label=label,
    results=results,
)

Accuracy:  0.6300970873786408
Precision:  0.750457903727056
Recall:  0.6300970873786408
F1:  0.5755096553135339
[[ 951 2591]
 [  76 3592]]


In [None]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed to make the reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train, y_train)
y_pred = rf.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='RandomForestClassifier',
    label=label,
    results=results,
)

Accuracy:  0.9360610263522885
Precision:  0.936470123059127
Recall:  0.9360610263522885
F1:  0.9360268995563436
[[3254  288]
 [ 173 3495]]


In [None]:
# Linear support vector machine; dual='auto' lets scikit-learn choose the
# formulation and random_state fixes the solver's seed.
svc = LinearSVC(random_state=0, dual='auto').fit(tfidf_train, y_train)
y_pred = svc.predict(tfidf_test)

# Last model for this dataset: add its row and write the table to CSV.
results = calculate_and_save_metrics(
    y_test,
    y_pred,
    model_name='LinearSVC',
    label=label,
    results=results,
    last=True,
    savename='results_welfake.csv',
)

Accuracy:  0.9611650485436893
Precision:  0.9613885747423163
Recall:  0.9611650485436893
F1:  0.9611522961703536
[[3361  181]
 [  99 3569]]


In [None]:
# LIAR again, this time with features from the statement, speaker and party columns.

csv_path_liar_train = os.path.join( './', 'liar_dataset', 'train.tsv')
csv_path_liar_test = os.path.join( './', 'liar_dataset', 'test.tsv')

df_liar_train = pd.read_csv(csv_path_liar_train, sep='\t', header=None)
df_liar_test = pd.read_csv(csv_path_liar_test, sep='\t', header=None)

# The files carry no header row, so the same names are assigned to both frames.
liar_columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']
df_liar_train.columns = liar_columns
df_liar_test.columns = liar_columns

text_columns = ['statement', 'speaker', 'party']

# .copy() makes both frames independent of the full tables, so later label
# assignments do not raise SettingWithCopyWarning.
df_liar_train = df_liar_train[['label'] + text_columns].dropna().copy()
df_liar_test = df_liar_test[['label'] + text_columns].copy()
# Incomplete training rows are dropped, but every test row is kept. Missing
# test text becomes '' because TfidfVectorizer refuses NaN documents, which
# would otherwise abort transform() on the first incomplete test row.
df_liar_test[text_columns] = df_liar_test[text_columns].fillna('')

# One vectorizer per column, so no fitted vocabulary is overwritten by the next fit.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
tfidf_train = tfidf_vectorizer.fit_transform(df_liar_train['statement'])
tfidf_test = tfidf_vectorizer.transform(df_liar_test['statement'])

speaker_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
tfidf_train_speaker = speaker_vectorizer.fit_transform(df_liar_train['speaker'])
tfidf_test_speaker = speaker_vectorizer.transform(df_liar_test['speaker'])

party_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
tfidf_train_party = party_vectorizer.fit_transform(df_liar_train['party'])
tfidf_test_party = party_vectorizer.transform(df_liar_test['party'])

# Place the three feature blocks side by side (statement | speaker | party).
# NOTE(review): .toarray() makes the matrices dense; scipy.sparse.hstack would
# keep them sparse if memory becomes a problem.
tfidf_train = np.concatenate((tfidf_train.toarray(), tfidf_train_speaker.toarray(), tfidf_train_party.toarray()), axis=1)
tfidf_test = np.concatenate((tfidf_test.toarray(), tfidf_test_speaker.toarray(), tfidf_test_party.toarray()), axis=1)

# Class order used for the rows/columns of the printed confusion matrices.
label = ['true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire']

In [None]:
# PassiveAggressiveClassifier shuffles the training data between epochs, so a
# fixed random_state is needed for the scores to be reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0)
pac.fit(tfidf_train, df_liar_train['label'])

# Predict the LIAR test file.
y_pred = pac.predict(tfidf_test)

# first=True starts a new results table for this experiment.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='PassiveAggressiveClassifier',
    label=label,
    first=True,
)

Accuracy:  0.22888713496448304
Precision:  0.23023408526590852
Recall:  0.22888713496448304
F1:  0.22897047161020762
[[43 50 48 18 40  9]
 [52 52 63 32 38  4]
 [56 56 66 33 37 17]
 [29 36 47 41 41 18]
 [42 41 50 36 64 16]
 [ 8  8 16 12 24 24]]


In [None]:
# Logistic regression, allowed up to 1000 solver iterations.
# fit() returns the estimator, so `logreg` is the fitted model.
logreg = LogisticRegression(max_iter=1000).fit(tfidf_train, df_liar_train['label'])
y_pred = logreg.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LogisticRegression',
    label=label,
    results=results,
)

Accuracy:  0.2557221783741121
Precision:  0.2622683610451152
Recall:  0.2557221783741121
F1:  0.2548912547110119
[[60 50 43 15 37  3]
 [53 64 70 21 31  2]
 [40 67 68 46 43  1]
 [25 32 63 35 51  6]
 [43 37 48 32 77 12]
 [ 5  9 14 13 31 20]]


In [None]:
# Multinomial naive Bayes with default settings.
# fit() returns the estimator, so `nb` is the fitted model.
nb = MultinomialNB().fit(tfidf_train, df_liar_train['label'])
y_pred = nb.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='MultinomialNB',
    label=label,
    results=results,
)

Accuracy:  0.2549329123914759
Precision:  0.26958362898191
Recall:  0.2549329123914759
F1:  0.24034321415470822
[[ 26  70  60   8  42   2]
 [ 17  81 100  10  32   1]
 [ 19  67 102  28  49   0]
 [  9  40  79  20  60   4]
 [ 19  45  80  19  81   5]
 [  2  12  18  11  36  13]]


In [None]:
# k-nearest neighbours with default settings.
# fit() returns the estimator, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(tfidf_train, df_liar_train['label'])
y_pred = knn.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='KNeighborsClassifier',
    label=label,
    results=results,
)

Accuracy:  0.22257300710339384
Precision:  0.2314443848997401
Recall:  0.22257300710339384
F1:  0.22221364309019398
[[39 54 38 42 32  3]
 [31 50 66 50 43  1]
 [32 53 68 62 46  4]
 [12 30 35 61 64 10]
 [27 36 46 78 49 13]
 [ 3 10 13 22 29 15]]


In [None]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed to make the reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train, df_liar_train['label'])
y_pred = rf.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='RandomForestClassifier',
    label=label,
    results=results,
)

Accuracy:  0.2462509865824783
Precision:  0.25598791463936366
Recall:  0.2462509865824783
F1:  0.23947975224915122
[[34 68 43  9 50  4]
 [36 70 58 16 59  2]
 [31 83 63 23 64  1]
 [15 50 45 31 67  4]
 [23 42 52 23 98 11]
 [ 6  9 25  9 27 16]]


In [None]:
# Linear support vector machine; dual='auto' lets scikit-learn choose the
# formulation and random_state fixes the solver's seed.
svc = LinearSVC(random_state=0, dual='auto').fit(tfidf_train, df_liar_train['label'])
y_pred = svc.predict(tfidf_test)

# Last model for this experiment: add its row and write the table to CSV.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LinearSVC',
    label=label,
    results=results,
    last=True,
    savename='results_liar_speaker_party.csv',
)

Accuracy:  0.23993685872138912
Precision:  0.24102368113167535
Recall:  0.23993685872138912
F1:  0.2402004306580156
[[49 52 43 21 37  6]
 [56 57 58 34 30  6]
 [42 62 63 51 34 13]
 [31 33 49 41 42 16]
 [38 37 40 44 74 16]
 [ 7 17 15 12 21 20]]


In [None]:
# Binary LIAR using statement, speaker and party.
# Label mapping:
#   'true', 'mostly-true'                -> 'true'
#   'false', 'barely-true', 'pants-fire' -> 'false'
# 'half-true' belongs to neither group, so those rows are removed. Leaving them
# in (as before) kept a third class in the target: it counted towards accuracy,
# precision, recall and F1 but was silently left out of the 2x2 confusion matrix.
# NOTE(review): mapping 'half-true' to 'true' instead is a common alternative.
binary_label_map = {'mostly-true': 'true', 'barely-true': 'false', 'pants-fire': 'false'}

# .copy() gives independent frames, so the label assignment below is safe.
df_liar_train = df_liar_train[df_liar_train['label'] != 'half-true'].copy()
df_liar_test = df_liar_test[df_liar_test['label'] != 'half-true'].copy()

df_liar_train['label'] = df_liar_train['label'].replace(binary_label_map)
df_liar_test['label'] = df_liar_test['label'].replace(binary_label_map)

# The same vectorizer object is refit for each column; this is safe only because
# every fit is immediately followed by the matching transform of the test column.

# vectorize the statement column
tfidf_train = tfidf_vectorizer.fit_transform(df_liar_train['statement'])
tfidf_test = tfidf_vectorizer.transform(df_liar_test['statement'])

# vectorize the speaker column
tfidf_train_speaker = tfidf_vectorizer.fit_transform(df_liar_train['speaker'])
tfidf_test_speaker = tfidf_vectorizer.transform(df_liar_test['speaker'])

# vectorize the party column
tfidf_train_party = tfidf_vectorizer.fit_transform(df_liar_train['party'])
tfidf_test_party = tfidf_vectorizer.transform(df_liar_test['party'])

# Place the three feature blocks side by side (statement | speaker | party);
# the rows now match the filtered frames.
tfidf_train = np.concatenate((tfidf_train.toarray(), tfidf_train_speaker.toarray(), tfidf_train_party.toarray()), axis=1)
tfidf_test = np.concatenate((tfidf_test.toarray(), tfidf_test_speaker.toarray(), tfidf_test_party.toarray()), axis=1)

# Class order used for the rows/columns of the printed confusion matrices.
label = ['true', 'false']

In [None]:
# PassiveAggressiveClassifier shuffles the training data between epochs, so a
# fixed random_state is needed for the scores to be reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0)
pac.fit(tfidf_train, df_liar_train['label'])

# Predict the LIAR test file.
y_pred = pac.predict(tfidf_test)

# first=True starts a new results table for this experiment.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='PassiveAggressiveClassifier',
    label=label,
    first=True,
)

Accuracy:  0.4262036306235201
Precision:  0.43373585872217285
Recall:  0.4262036306235201
F1:  0.4295301446861111
[[195 142]
 [161 280]]


In [None]:
# Logistic regression, allowed up to 1000 solver iterations.
# fit() returns the estimator, so `logreg` is the fitted model.
logreg = LogisticRegression(max_iter=1000).fit(tfidf_train, df_liar_train['label'])
y_pred = logreg.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LogisticRegression',
    label=label,
    results=results,
)

Accuracy:  0.49171270718232046
Precision:  0.4555053197588091
Recall:  0.49171270718232046
F1:  0.4641482017531922
[[238 171]
 [155 361]]


In [None]:
# Multinomial naive Bayes with default settings.
# fit() returns the estimator, so `nb` is the fitted model.
nb = MultinomialNB().fit(tfidf_train, df_liar_train['label'])
y_pred = nb.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='MultinomialNB',
    label=label,
    results=results,
)

Accuracy:  0.5193370165745856
Precision:  0.4799922721754087
Recall:  0.5193370165745856
F1:  0.46245516911100526
[[249 196]
 [144 405]]


In [None]:
# k-nearest neighbours with default settings.
# fit() returns the estimator, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(tfidf_train, df_liar_train['label'])
y_pred = knn.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='KNeighborsClassifier',
    label=label,
    results=results,
)

Accuracy:  0.48697711128650356
Precision:  0.46545565923182264
Recall:  0.48697711128650356
F1:  0.47018379312967296
[[188 187]
 [112 381]]


In [None]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed to make the reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train, df_liar_train['label'])
y_pred = rf.predict(tfidf_test)

# Add this model's scores as a new row of the running results table.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='RandomForestClassifier',
    label=label,
    results=results,
)

Accuracy:  0.5035516969218626
Precision:  0.4672927839945014
Recall:  0.5035516969218626
F1:  0.44938154763518495
[[236 207]
 [152 397]]


In [None]:
# Linear support vector machine; dual='auto' lets scikit-learn choose the
# formulation and random_state fixes the solver's seed.
svc = LinearSVC(random_state=0, dual='auto').fit(tfidf_train, df_liar_train['label'])
y_pred = svc.predict(tfidf_test)

# Last model for this experiment: add its row and write the table to CSV.
results = calculate_and_save_metrics(
    df_liar_test['label'],
    y_pred,
    model_name='LinearSVC',
    label=label,
    results=results,
    last=True,
    savename='results_liar_speaker_party_binary.csv',
)

Accuracy:  0.4585635359116022
Precision:  0.4426506927816209
Recall:  0.4585635359116022
F1:  0.4486165963853383
[[223 156]
 [165 316]]
