# Naive Bayes with TF-IDF

## Imports

In [1]:
# read files
import json
import urllib.request 
import re, os
import sys
import pickle

# preprocessing, math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# multiprocessing
from multiprocessing import Pool
from functools import partial

# helper functions
from helperFunctions import *

# naive bayes implementation
from naiveBayes import *

# evaluation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight

## Load preprocessed dataset

In [2]:
pathToDataFiles = './datafiles/'
dataset = 'True.csv' # Fake.csv
df = pd.read_csv(pathToDataFiles + 'dataset_preprocessed-' + dataset, sep='\t')

## Create train, validation, test split

In [3]:
# train val test relation -> 60:20:20
trainval, test = train_test_split(df, test_size=0.2, random_state=12345)
train, val = train_test_split(trainval, test_size=0.25, random_state=12345)
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)
train.head()

Unnamed: 0,text,label
0,russian nuclear bombers fly near north korea r...,1
1,japanese man kills wife priestess sister sword...,1
2,tokyo governor quits head conservative opposit...,1
3,top international lawyers say hong kong rule l...,1
4,spain rule exceptional measures catalonia madr...,1


prep validation set

In [4]:
y_val = val.label

val.drop('label', axis=1, inplace=True, errors='ignore')

# expected text format is list of words per document
val.text = val.text.apply(lambda x: x.split(' '))

## Naive Bayes with TF-IDF

In [5]:
""" # feature vector consists of all possible words in all documents, and has values of number of counts in each document

#ctv_total = CountVectorizer()
ctv_c0 = CountVectorizer(analyzer='word')
ctv_c1 = CountVectorizer(analyzer='word')

#feature_vec_total = ctv_total.fit_transform(train.text)
feature_vec_c0 = ctv_c0.fit_transform(train[train.label == 0].text)
feature_vec_c1 = ctv_c1.fit_transform(train[train.label == 1].text) """

Create TF-IDF Table for each class

In [6]:
""" 
transformer_c0 = TfidfTransformer()
transformer_c1 = TfidfTransformer()

tf_c0 = transformer_c0.fit_transform(feature_vec_c0)
tf_c1 = transformer_c1.fit_transform(feature_vec_c1)

tf_dense_c0 = np.array(tf_c0.todense())
tf_dense_c1 = np.array(tf_c1.todense()) """

Frequency Table

In [34]:
""" vocabs = [ctv_c0.vocabulary_, ctv_c1.vocabulary_]
weights = [tf_dense_c0, tf_dense_c1]
freq_tf, uniques_tf = frequencyTableTFIDF(train.text, train.label, vocabs, weights) """

In [35]:
""" np.save(pathToDataFiles + 'freq_tb-TFIDF-' + dataset, freq_tf)
with open(pathToDataFiles + 'uniques-TFIDF-' + dataset + '.pkl', 'wb') as f:
    pickle.dump(uniques_tf, f) """

In [44]:
with open(pathToDataFiles + 'freq_tb-TFIDF-' + dataset + '.npy', 'rb') as f:
    freq_tf = np.load(f)

with open(pathToDataFiles + 'uniques-TFIDF-' + dataset + '.pkl', 'rb') as f:
    uniques_tf = pickle.load(f)

In [36]:
sumRowsRel, sumColsRel = likelihoodTable(freq_tf)

In [43]:
predictDoc(val.text[0], uniques_tf, freq_tf, sumRowsRel, sumColsRel)

TypeError: 'numpy.float64' object is not iterable

In [42]:
with Pool(3) as p:
    y_pred = p.map(partial(predictDoc, uniques=uniques_tf, freq_tb= freq_tf, sumRowsRel=sumRowsRel, sumColsRel=sumColsRel) , val.text[:2])

TypeError: 'numpy.float64' object is not iterable

In [None]:
y_pred

[-1, -1]

In [76]:
# word_count_in_class : sum of(tf-idf_weights of the word for all the documents belonging to that class) //basically replacing the counts with the tfidf weights of the same word calculated for every document within that class.
# -> TfidfTransformer.idf_ * tf of word -> sum for each class

sum_c0 = []
for doc in feature_vec_c0:
    sum_c0.append([a*b for a,b in zip(doc,transformer_c0.idf_)])

sum_c1 = []
for doc in feature_vec_c1:
    sum_c1.append(np.multiply(doc, transformer_c1.idf_))


KeyboardInterrupt: 

In [None]:
sum_c1

AttributeError: 'list' object has no attribute 'todense'

In [58]:
# P(word|class)=(word_count_in_class + 1)/(total_words_in_class+total_unique_words_in_all_classes
# (basically vocabulary of words in the entire training set))

p_w_c0 = (sum_c0 + 1)/(total_words_in_c0 + len_total)

In [59]:
p_w_c0

50.31129282088646

## TODO: Adjust to own version

In [27]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB

pipe = Pipeline([
    ('count', CountVectorizer()), 
    ('tfid', TfidfTransformer()),
    ('dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
    ('svc', CategoricalNB())]).fit(train.text, train.label)




In [77]:
pipe.score(val.text, y_val)



0.49079618540696385

In [19]:
pipe.score(val.text, y_val)

0.9547571523619428

In [23]:
pipe['tfid'].idf_.shape

(55882,)

## Apply Naive Bayes with TF-IDF

In [36]:
from sklearn.naive_bayes import MultinomialNB


model = MultinomialNB().fit(train_tf, train.label)

In [None]:
model.predict(val)

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, log_loss

pipe = Pipeline([
    ('features', text_features),
    ('clf', MultinomialNB()),
])

pipe.fit(train.text, train.label)

txt = [' '.join(w) for w in val.text]

nb_pred = pipe.predict(train_tf)
nb_probs = pipe.predict_proba(train_tf)

print("Accuracy score: " + str(accuracy_score(y_val, nb_pred)))
print("Log loss: " + str(log_loss(y_val, nb_probs)))

AttributeError: lower not found