# Evaluation of the different approaches to Naive Bayes

## Imports

In [2]:
# read files
import pickle

# preprocessing, math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn.model_selection import train_test_split

# evaluation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight

## Load results of the different approaches

In [5]:
# Directory that holds the result files read below.
pathToDataFiles = './datafiles/'
# Suffix of the result files to evaluate; the author notes 'Fake.csv' as the alternative.
dataset = 'True.csv' # Fake.csv
# N-gram size; it is part of the N-gram result file name and of the printed labels.
n = 2

# Results of the Naive Bayes approach based on word occurrence counts (tab-separated).
df_results_occ = pd.read_csv(
    f'{pathToDataFiles}naive-bayes-occurences-results-{dataset}',
    sep='\t',
)
# Results of the Naive Bayes approach based on N-grams of size n (tab-separated).
df_results_ng = pd.read_csv(
    f'{pathToDataFiles}naive-bayes-N-Grams-{n}-results-{dataset}',
    sep='\t',
)

## Evaluate the predictions

Plain accuracy: the share of correctly classified rows, (TP + TN) / all rows.

In [7]:
# Baseline: always predict the most frequent label. Its accuracy is that label's
# share of all rows, so the majority class is found from the data instead of
# assuming it is encoded as 0.
label_shares = df_results_occ.label.value_counts(normalize=True)
acc = label_shares.max()
print('baseline accuracy of always predicting the most common class:', acc)

# (TP + TN) / ALL: share of rows whose TPTN flag equals 1.
# NOTE(review): assumes TPTN is 1/True for a correctly classified row -- confirm
# against the notebook that writes the result files.
# Comparing with .eq(1) replaces indexing value_counts() with [1]: that lookup
# raises a KeyError when the value never occurs, and an integer key on a
# non-integer index is resolved differently across pandas versions.
# dropna() keeps missing flags out of the denominator, as value_counts() did.
acc = df_results_occ.TPTN.dropna().eq(1).mean()
print('accuracy for NB with occurences count:', acc)

acc = df_results_ng.TPTN.dropna().eq(1).mean()
print('accuracy for NB with N-Grams of ' + str(n) + ' count:', acc)

baseline accuracy of always predicting the most common class: 0.5092038145930361
accuracy for NB with occurences count: 0.8298957640275005
accuracy for NB with N-Grams of 2 count: 0.8582834331337326


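Weighted (balanced) accuracy: the mean of the per-class recall, so every class counts equally regardless of how many rows it has.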

In [10]:
# NOTE(review): consider moving this import up into the notebook's import cell.
from sklearn.metrics import balanced_accuracy_score

# Baseline: predict one single class for every row. The most frequent label is
# used instead of a hard-coded 0, so the baseline stays meaningful if the labels
# are encoded differently. Balanced accuracy averages the recall of each class,
# so a constant prediction scores 1 / number_of_classes.
majority_label = df_results_occ.label.value_counts().idxmax()
baseline_prediction = np.full(len(df_results_occ.label), majority_label)
baseline_weighted_accuracy = balanced_accuracy_score(df_results_occ.label, baseline_prediction)

# Naive Bayes with count of word occurrences
occ_weighted_accuracy = balanced_accuracy_score(df_results_occ.label, df_results_occ.prediction)

# Naive Bayes with N-grams
ng_weighted_accuracy = balanced_accuracy_score(df_results_ng.label, df_results_ng.prediction)


print('baseline_weighted_accuracy=%.3f' % baseline_weighted_accuracy)
print('NB with word occurences weighted accuracy=%.3f' % occ_weighted_accuracy)
# One format expression instead of mixing '+' with '%', which only worked
# because '%' binds tighter than '+'; the printed text is unchanged.
print('NB with N-Grams of %s weighted accuracy=%.3f' % (n, ng_weighted_accuracy))

baseline_weighted_accuracy=0.500
NB with word occurences weighted accuracy=0.831
NB with N-Grams of 2 weighted accuracy=0.856
