# Naive Bayes with TF-IDF

## Imports

In [14]:
# read files
import json
import urllib.request 
import re, os
import sys
import pickle

# preprocessing, math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# multiprocessing
from multiprocessing import Pool
from functools import partial

# helper functions
from helperFunctions import *

# naive bayes implementation
from naiveBayes import *

# evaluation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight

## Load preprocessed dataset

In [3]:
pathToDataFiles = './datafiles/'
dataset = 'True.csv' # Fake.csv
df = pd.read_csv(pathToDataFiles + 'dataset_preprocessed-' + dataset, sep='\t')

## Create train, validation, test split

In [4]:
# train val test relation -> 60:20:20
trainval, test = train_test_split(df, test_size=0.2, random_state=12345)
train, val = train_test_split(trainval, test_size=0.25, random_state=12345)
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)
train.head()

Unnamed: 0,text,label
0,russian nuclear bombers fly near north korea r...,1
1,japanese man kills wife priestess sister sword...,1
2,tokyo governor quits head conservative opposit...,1
3,top international lawyers say hong kong rule l...,1
4,spain rule exceptional measures catalonia madr...,1


prep validation set

In [5]:
y_val = val.label
val.drop('label', axis=1, inplace=True, errors='ignore')

# expected text format is list of words per document
val.text = val.text.apply(lambda x: x.split(' '))

## Naive Bayes with TF-IDF

In [6]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 6000)

In [12]:
feature_vec = vectorizer.fit_transform(train.text)

In [13]:
tf_transformer = TfidfTransformer(use_idf=True).fit(feature_vec)
train_tf = tf_transformer.transform(feature_vec)
train_tf.shape

(13526, 6000)

## TODO: Adjust to own version

In [17]:

text_features = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

text_features.fit_transform(train.text)

<13526x55882 sparse matrix of type '<class 'numpy.float64'>'
	with 2228135 stored elements in Compressed Sparse Row format>

## Apply Naive Bayes with TF-IDF

In [36]:
from sklearn.naive_bayes import MultinomialNB


model = MultinomialNB().fit(train_tf, train.label)

In [None]:
model.predict(val)

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, log_loss

pipe = Pipeline([
    ('features', text_features),
    ('clf', MultinomialNB()),
])

pipe.fit(train.text, train.label)

txt = [' '.join(w) for w in val.text]

nb_pred = pipe.predict(train_tf)
nb_probs = pipe.predict_proba(train_tf)

print("Accuracy score: " + str(accuracy_score(y_val, nb_pred)))
print("Log loss: " + str(log_loss(y_val, nb_probs)))

AttributeError: lower not found