# <b><i><center>Document Classification Using Naïve Bayesian Classifier</center></i></b>

## Import necessary libraries 

In [137]:
# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Transform a count matrix to a normalized tf or tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer

# Naive Bayes classifier for multinomial models
from sklearn.naive_bayes import MultinomialNB

# Pipeline of transforms with a final estimator
from sklearn.pipeline import Pipeline

# NumPy is the fundamental package for scientific computing with Python
import numpy as np

# The sklearn.metrics module provides score functions, performance metrics, pairwise metrics, and distance computations
from sklearn import metrics

# Easy-to-use data structures and data analysis tools for the Python programming language.
import pandas as pd

# Show every element when printing NumPy arrays: raising the summarization
# threshold to the largest possible value disables the "..." truncation
import sys

PRINT_THRESHOLD = sys.maxsize
np.set_printoptions(threshold=PRINT_THRESHOLD)

## Prepare data

In [138]:
# Labels assigned to the two columns of the CSV file (text, sentiment label)
columns_names = ["data", "target_names"]

# Load the CSV into a pandas dataframe, naming the columns explicitly
dataset = pd.read_csv(
    "../DATASETS/lab6.csv",
    names=columns_names,
)

# Display the loaded dataframe
dataset

Unnamed: 0,data,target_names
0,love this sandwich,pos
1,This is an amazing place,pos
2,I feel very good about these beers,pos
3,This is my best work,pos
4,What an awesome view,pos
5,I do not like this restaurant,neg
6,I am tired of this stuff,neg
7,I can't deal with this,neg
8,He is my sworn enemy,neg
9,My boss is horrible,neg


In [139]:
# Encode the sentiment labels numerically ('pos' -> 1, 'neg' -> 0) in the
# 'target_names' column and save the result as a new dataframe 'dataset_new';
# 'dataset' itself is left untouched.
#
# Series.map + astype is used instead of DataFrame.replace: replacing strings
# with ints through replace relies on implicit object -> int downcasting, which
# pandas 2.2 deprecates. The explicit astype('int64') also fails loudly if a
# label other than 'pos'/'neg' is present (map turns it into NaN) instead of
# silently leaving a string in a column that is meant to be numeric.
dataset_new = dataset.assign(
    target_names=dataset['target_names'].map({'pos': 1, 'neg': 0}).astype('int64')
)

In [140]:
# Attach the numerically encoded labels from 'dataset_new' to the main dataset
# as a separate 'target' column, keeping the original text labels alongside
dataset["target"] = dataset_new["target_names"]

# Display the dataset with the new column
dataset

Unnamed: 0,data,target_names,target
0,love this sandwich,pos,1
1,This is an amazing place,pos,1
2,I feel very good about these beers,pos,1
3,This is my best work,pos,1
4,What an awesome view,pos,1
5,I do not like this restaurant,neg,0
6,I am tired of this stuff,neg,0
7,I can't deal with this,neg,0
8,He is my sworn enemy,neg,0
9,My boss is horrible,neg,0


## Pre-process data

In [141]:
# Learn the vocabulary of the documents and build the matrix of token counts
# (one row per document, one column per vocabulary term)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset['data'])

# Shape of the token count matrix
print(f'(Documents, Words) => {X_train_counts.shape}')

(Documents, Words) => (18, 56)


In [142]:
# Display the token count matrix as a dense array
# (X_train_counts is converted with toarray() so every entry is shown)
X_train_counts.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [151]:
# Visualizing the output of CountVectorizer // *not required

# toarray() already returns a dense NumPy array, so no extra np.array() copy is needed
cv_matrix = X_train_counts.toarray()

# One row label per document: Document1 ... DocumentN
rows = [f'Document{n}' for n in range(1, cv_matrix.shape[0] + 1)]

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; fall back to it only on older releases
# that do not provide get_feature_names_out().
if hasattr(count_vect, 'get_feature_names_out'):
    column = count_vect.get_feature_names_out()
else:
    column = count_vect.get_feature_names()

# Document-term matrix as a labelled dataframe
df = pd.DataFrame(data=cv_matrix,
                  index=rows,
                  columns=column)
df

Unnamed: 0,about,am,amazing,an,and,awesome,bad,beers,best,boss,...,today,tomorrow,very,view,we,went,what,will,with,work
Document1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Document2,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Document3,1,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
Document4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Document5,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
Document6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Document7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Document8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Document9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Document10,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [154]:
# Explain how to read the document-term matrix, then show one cell as an example.
# The count is looked up with a single .loc[row, column] access.
print('The values of the matrix is the count of any given word in a document:\n')
print('For example in Document 7 "am" is repeated ones\n')
print(f'df["Document7"]["am"] => {df.loc["Document7", "am"]}')

The values of the matrix is the count of any given word in a document:

For example in Document 7 "am" is repeated ones

df["Document7"]["am"] => 1
