# Naive Bayes counting word occurrences in documents

## Imports

In [1]:
# read files
import json
import urllib.request 
import re, os
import sys
import pickle

# preprocessing, math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn.model_selection import train_test_split

# multiprocessing
from multiprocessing import Pool
from functools import partial

# helper functions
from helperFunctions import *

# naive bayes implementation
from naiveBayes import *

# evaluation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight

## Load preprocessed dataset

In [2]:
# Location of the data files and the preprocessed dataset to work on.
pathToDataFiles = './datafiles/'
dataset = 'True.csv'  # alternative: 'Fake.csv'

# The preprocessed file is tab-separated.
df = pd.read_csv(f'{pathToDataFiles}dataset_preprocessed-{dataset}', sep='\t')

## Create train, validation, test split

In [3]:
# Target proportions train:val:test = 60:20:20.
# First hold out 20% for testing, then take 25% of the remaining 80%
# (i.e. 20% of the full data) as the validation set.
split_seed = 12345
trainval, test = train_test_split(df, test_size=0.2, random_state=split_seed)
train, val = train_test_split(trainval, test_size=0.25, random_state=split_seed)

# Give the train and validation frames a fresh 0..n-1 index (done in place).
for split in (train, val):
    split.reset_index(drop=True, inplace=True)

train.head()

Unnamed: 0,text,label
0,russian nuclear bombers fly near north korea r...,1
1,japanese man kills wife priestess sister sword...,1
2,tokyo governor quits head conservative opposit...,1
3,top international lawyers say hong kong rule l...,1
4,spain rule exceptional measures catalonia madr...,1


Prepare the validation set

In [4]:
# Keep the true labels aside, then remove them from the validation frame.
y_val = val['label']
val.drop(columns='label', inplace=True, errors='ignore')

# Split every document on single spaces so each row holds a list of tokens.
val['text'] = val['text'].apply(lambda document: document.split(' '))

# Naive Bayes with counting occurrences

Create Frequency Table

In [5]:
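# Building the frequency table has been optimised down to roughly quadratic
# complexity; the call stays disabled because the cached result is loaded from disk below.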
#freq_tb_occ, uniques_occ = frequencyTableOccurences(train.text, train.label)

In [11]:
""" np.save(pathToDataFiles + 'freq_tb-occurences-' + dataset, freq_tb_occ)
with open(pathToDataFiles + 'uniques-occurences-' + dataset + '.pkl', 'wb') as f:
    pickle.dump(uniques_occ, f) """

In [12]:
# File names of the cached artefacts for the selected dataset.
freq_tb_file = f'{pathToDataFiles}freq_tb-occurences-{dataset}.npy'
uniques_file = f'{pathToDataFiles}uniques-occurences-{dataset}.pkl'

# Frequency table, stored in NumPy's .npy format.
with open(freq_tb_file, 'rb') as npy_handle:
    freq_tb_occ = np.load(npy_handle)

# Pickled companion object of the frequency table.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(uniques_file, 'rb') as pkl_handle:
    uniques_occ = pickle.load(pkl_handle)

Create likelihood table

In [14]:
# Derive the likelihood terms from the loaded frequency table.
# The helper's two return values are kept as separate objects; there is no
# need to merge all information into one table.
# NOTE(review): judging by the names these are presumably relative row and
# column sums of the frequency table -- confirm against the helper's definition.
sumRowsRel, sumColsRel = likelihoodTable(freq_tb_occ)  

## Apply Naive Bayes

Run prediction on the validation set

In [19]:
#val.text = val.text.apply(str.split)
# Bind the model tables once so that the pool only has to pass one
# document per call to the prediction function.
predict_single_doc = partial(
    predictDoc,
    uniques=uniques_occ,
    freq_tb=freq_tb_occ,
    sumRowsRel=sumRowsRel,
    sumColsRel=sumColsRel,
    nclasses=2,
)

# Classify the validation documents with three worker processes;
# map() returns the predictions in the same order as val.text.
with Pool(3) as pool:
    y_pred = pool.map(predict_single_doc, val.text)

In [20]:
# Put documents, true labels and predictions side by side.
df_results_occ = pd.DataFrame(val.text).assign(
    label=y_val,
    prediction=y_pred,
)

# TPTN is True where the prediction matches the label.
df_results_occ['TPTN'] = df_results_occ['label'] == df_results_occ['prediction']
df_results_occ.head()

Unnamed: 0,text,label,prediction,TPTN
0,"[trump, says, russia, probe, reveal, collusion...",0,0,True
1,"[trudeau, sees, flood, americans, canada, trum...",0,0,True
2,"[lawsuit, filed, baltimore, primary, election,...",0,1,False
3,"[u.s., quits, talks, global, migration, pact, ...",1,0,False
4,"[senate, 's, cia, torture, report, go, obama, ...",0,0,True


Persist the results

In [21]:
# Write the results as a tab-separated file and read them straight back in.
# NOTE: after the round trip the token lists in 'text' come back as strings.
results_file = f'{pathToDataFiles}naive-bayes-occurences-results-{dataset}'
df_results_occ.to_csv(results_file, sep='\t', index=False)
df_results_occ = pd.read_csv(results_file, sep='\t')