# Naive Bayes counting N-Grams in documents

## Imports

In [1]:
# read files
import json
import urllib.request 
import re, os
import sys
import pickle

# preprocessing, math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn.model_selection import train_test_split

# multiprocessing
from multiprocessing import Pool
from functools import partial

# helper functions
from helperFunctions import *

# naive bayes implementation
from naiveBayes import *

## Load preprocessed dataset

In [2]:
# Folder holding all data artefacts and the name of the corpus file to use.
pathToDataFiles = './datafiles/'
dataset = 'True.csv'  # alternative: Fake.csv

# The preprocessed corpus is stored as a tab-separated file.
df = pd.read_csv(f'{pathToDataFiles}dataset_preprocessed-{dataset}', sep='\t')

# Length of the n-grams used throughout the notebook.
n = 2

## Create train, validation, test split

In [3]:
# 60:20:20 train/validation/test split: first hold out 20% as the test set,
# then take a quarter of the remaining 80% (= 20% overall) for validation.
trainval, test = train_test_split(df, random_state=12345, test_size=0.2)
train, val = train_test_split(trainval, random_state=12345, test_size=0.25)

# Give train and val a fresh 0..len-1 index (the old index is discarded).
for _split in (train, val):
    _split.reset_index(inplace=True, drop=True)
train.head()

Unnamed: 0,text,label
0,russian nuclear bombers fly near north korea r...,1
1,japanese man kills wife priestess sister sword...,1
2,tokyo governor quits head conservative opposit...,1
3,top international lawyers say hong kong rule l...,1
4,spain rule exceptional measures catalonia madr...,1


Prepare the validation set

In [4]:
# Set the validation labels aside, then remove them from the feature frame.
y_val = val['label']
val.drop(columns=['label'], inplace=True, errors='ignore')

# Turn every validation text into its n-grams of length n.
val['ngrams'] = val['text'].apply(createNgrams, args=(n,))

# Naive Bayes with counting N-Grams

Create n-grams

In [5]:
# The first experiments used n = 6, but six words rarely recur in exactly
# the same order, so the n-gram length was reduced step by step.

# Turn every training text into its n-grams of length n.
train['ngrams'] = train['text'].apply(createNgrams, args=(n,))

create frequency table

In [6]:
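# Building the frequency table is expensive (roughly quadratic, after optimisation),
# so the call below is left commented out and the saved result is reloaded from disk instead.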
#freq_tb_ng, uniques_ng = frequencyTableNgrams(train.ngrams, train.label)

In [7]:
# Disabled persistence step, kept as a bare string literal so it does not run.
# When enabled it saves freq_tb_ng with np.save and pickles uniques_ng; the
# next cell reloads both artefacts from disk.
""" np.save(pathToDataFiles + 'freq_tb-N-Grams-' + str(n) + '-' + dataset, freq_tb_ng)
with open(pathToDataFiles + 'uniques-N-Grams-' + str(n) + '-' + dataset + '.pkl', 'wb') as f:
    pickle.dump(uniques_ng, f) """

" np.save(pathToDataFiles + 'freq_tb-N-Grams-' + str(n) + '-' + dataset, freq_tb_ng)\nwith open(pathToDataFiles + 'uniques-N-Grams-' + str(n) + '-' + dataset + '.pkl', 'wb') as f:\n    pickle.dump(uniques_ng, f) "

In [8]:
# Reload the frequency table (NumPy .npy file) saved by an earlier run.
with open(f'{pathToDataFiles}freq_tb-N-Grams-{n}-{dataset}.npy', 'rb') as freqTableFile:
    freq_tb_ng = np.load(freqTableFile)

# Reload the companion uniques object. pickle.load can execute arbitrary code,
# so only load files that were produced by this notebook.
with open(f'{pathToDataFiles}uniques-N-Grams-{n}-{dataset}.pkl', 'rb') as uniquesFile:
    uniques_ng = pickle.load(uniquesFile)

create likelihood table

In [9]:
# no need to merge all information into one table
# likelihoodTable comes from one of the star-imported project modules and
# returns two values; both are passed on to predictDoc in the prediction loop.
# NOTE(review): presumably relative row/column sums of freq_tb_ng — confirm.
sumRowsRel, sumColsRel = likelihoodTable(freq_tb_ng)  

## Apply Naive Bayes

run prediction for the validation set

Predict in batches to reduce RAM usage and to allow the job to be split across several overnight runs.

In [11]:
# Predict the validation set batch by batch. Every batch is written to its own
# file, so a long run can be interrupted and resumed by moving `start` forward.
# The cell that reassembles the predictions reads batch files from index 0, so
# batches below `start` are expected to exist on disk already.
start = 2000
stop = len(val.ngrams)
stepSize = 10
subfolderPredictions = 'predictions/'

# Create the output folder up front so a missing directory cannot make
# to_csv fail after a batch has already been computed.
os.makedirs(pathToDataFiles + subfolderPredictions, exist_ok=True)

# The scoring callable is the same for every batch, so bind its fixed
# arguments once instead of on every iteration.
predictOneDoc = partial(predictDoc, uniques=uniques_ng, freq_tb=freq_tb_ng, sumRowsRel=sumRowsRel, sumColsRel=sumColsRel, nclasses=2)

# got 4 theoretical cores and need to study -> 3
# A single pool serves all batches; re-creating it inside the loop would
# start and tear down three worker processes for every ten documents.
with Pool(3) as p:
    for i in range(start, stop, stepSize):
        # The last batch may be shorter than stepSize, so cap the reported end.
        print('predicting documents from', i, 'to', min(i + stepSize, stop))
        y_pred = p.map(predictOneDoc, val.ngrams[i:i+stepSize])
        print('this iterations predictions:', y_pred)
        # One file per batch, named after the index of its first document.
        dfPartialResults = pd.DataFrame(pd.Series(data=y_pred, name='prediction'))
        dfPartialResults.to_csv(pathToDataFiles + subfolderPredictions + 'naive-bayes-N-Grams-' + str(n) + '-predictions-' + str(i) + '-' + dataset, sep='\t', index=False)
    

predicting documents from 2000 to 2010
this iterations predictions: [0, 0, 1, 0, 0, 0, 0, 0, 1, 1]
predicting documents from 2010 to 2020
this iterations predictions: [0, 0, 0, 0, 1, 0, 1, 1, 1, 0]
predicting documents from 2020 to 2030
this iterations predictions: [1, 0, 0, 0, 0, 1, 1, 0, 1, 0]
predicting documents from 2030 to 2040
this iterations predictions: [1, 0, 0, 0, 1, 1, 0, 1, 1, 0]
predicting documents from 2040 to 2050
this iterations predictions: [0, 0, 1, 1, 0, 1, 0, 1, 0, 0]
predicting documents from 2050 to 2060
this iterations predictions: [1, 0, 0, 1, 1, 0, 1, 0, 0, 1]
predicting documents from 2060 to 2070
this iterations predictions: [1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
predicting documents from 2070 to 2080
this iterations predictions: [1, 0, 0, 1, 0, 0, 0, 1, 1, 0]
predicting documents from 2080 to 2090
this iterations predictions: [0, 1, 0, 0, 1, 0, 0, 1, 1, 0]
predicting documents from 2090 to 2100
this iterations predictions: [1, 0, 1, 0, 1, 1, 1, 0, 1, 1]
predicting

In [13]:
# Reassemble the per-batch prediction files into one frame. The files are read
# into a list and concatenated once; calling pd.concat inside the loop would
# copy the accumulated frame again for every batch.
partialFrames = [
    pd.read_csv(pathToDataFiles + subfolderPredictions + 'naive-bayes-N-Grams-' + str(n) + '-predictions-' + str(i) + '-' + dataset, sep='\t')
    for i in range(0, len(val), stepSize)
]
if partialFrames:
    df_predictions = pd.concat(partialFrames, ignore_index=True)
else:
    # No batches at all (empty validation set): keep an empty frame with the
    # expected 'prediction' column so later cells still find it.
    df_predictions = pd.DataFrame(pd.Series(name='prediction', dtype='int64'))

In [14]:
# Result table: the n-grams of each validation document next to its true
# label and the predicted label (both aligned on the row index).
df_results_ng = val['ngrams'].to_frame().assign(
    label=y_val,
    prediction=df_predictions['prediction'],
)
# TPTN is True where the prediction matches the label.
df_results_ng['TPTN'] = df_results_ng['label'].eq(df_results_ng['prediction'])
df_results_ng.head()

Unnamed: 0,ngrams,label,prediction,TPTN
0,"[(trump, says), (says, russia), (russia, probe...",0,0,True
1,"[(trudeau, sees), (sees, flood), (flood, ameri...",0,0,True
2,"[(lawsuit, filed), (filed, baltimore), (baltim...",0,1,False
3,"[(u.s., quits), (quits, talks), (talks, global...",1,0,False
4,"[(senate, 's), ('s, cia), (cia, torture), (tor...",0,0,True


Persist the results

In [16]:
# Write the result table as a tab-separated file without the row index.
df_results_ng.to_csv(f'{pathToDataFiles}naive-bayes-N-Grams-{n}-results-{dataset}', sep='\t', index=False)