# <b><i><center>Document Classification Using Naïve Bayesian Classifier</center></i></b>

## Import necessary libraries 

In [287]:
# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Transform a count matrix to a normalized tf or tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer

# Naive Bayes classifier for multinomial models
from sklearn.naive_bayes import MultinomialNB

# Pipeline of transforms with a final estimator
from sklearn.pipeline import Pipeline

# NumPy is the fundamental package for scientific computing with Python
import numpy as np

#The sklearn.metrics module includes score functions, performance metrics, and pairwise metrics and distance computations
from sklearn import metrics

# Easy-to-use data structures and data analysis tools for the Python programming language.
import pandas as pd

# Print data structures without summarization
import sys
np.set_printoptions(threshold=sys.maxsize)

# floor function
from math import floor

## Prepare data

In [288]:
# Columns names for dataset
columns_names = ['data','target_names']

# Import dataset from a .csv file to pandas dataframe
dataset = pd.read_csv("../DATASETS/lab6.csv", names=columns_names)

# Print dataset
dataset

Unnamed: 0,data,target_names
0,love this sandwich,pos
1,This is an amazing place,pos
2,I feel very good about these beers,pos
3,This is my best work,pos
4,What an awesome view,pos
5,I do not like this restaurant,neg
6,I am tired of this stuff,neg
7,I can't deal with this,neg
8,He is my sworn enemy,neg
9,My boss is horrible,neg


In [289]:
# Replace all the 'pos' values with 1 and 'neg' values with 0 in the 'target_names' column and save it as a new 
# dataframe 'dataset_new'
dataset_new = dataset.replace({
    'target_names' : {'pos':1, 'neg':0}
})

In [290]:
# Add the changed 'target_names' column as 'target' column in the main dataset
dataset['target'] = dataset_new['target_names']

# No.of classes in the dataset
classes = list(set(dataset['target_names']))

# Print dataset
dataset

Unnamed: 0,data,target_names,target
0,love this sandwich,pos,1
1,This is an amazing place,pos,1
2,I feel very good about these beers,pos,1
3,This is my best work,pos,1
4,What an awesome view,pos,1
5,I do not like this restaurant,neg,0
6,I am tired of this stuff,neg,0
7,I can't deal with this,neg,0
8,He is my sworn enemy,neg,0
9,My boss is horrible,neg,0


In [291]:
# Split data into train data and test data
def train_test_split(dataset, train_split):
    dataset_length = len(dataset)
    train_split = train_split/100.0
    split_percentage = floor(dataset_length * train_split)
    return dataset.data[0:split_percentage],dataset.data[split_percentage:],dataset.target[0:split_percentage],dataset.target[split_percentage:]

train_split_percentage = 60
X_train, X_test, y_train, y_test = train_test_split(dataset, train_split_percentage)
print(X_train)
print()
print(y_train)

0                    love this sandwich
1              This is an amazing place
2    I feel very good about these beers
3                  This is my best work
4                  What an awesome view
5         I do not like this restaurant
6              I am tired of this stuff
7                I can't deal with this
8                  He is my sworn enemy
9                   My boss is horrible
Name: data, dtype: object

0    1
1    1
2    1
3    1
4    1
5    0
6    0
7    0
8    0
9    0
Name: target, dtype: int64


## Pre-process data

### CountVectorizer
Convert a collection of text documents to a matrix of token counts


In [292]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Shape of the tocken count matrix
print(f'(Documents, Words) => {X_train_counts.shape}')

(Documents, Words) => (10, 35)


In [293]:
# Print tocken count matrix
X_train_counts.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0

In [294]:
# Visualizing the output of CountVectorizer // *not required

cv_matrix = np.array(X_train_counts.toarray())

rows = [f'Document{n}' for n in range(1, cv_matrix.shape[0] + 1)]
print("Rows:\n",rows)

column = count_vect.get_feature_names()
print("Columns:\n",column)

df = pd.DataFrame(data=cv_matrix,
                  index = rows,
                  columns=column)
df

Rows:
 ['Document1', 'Document2', 'Document3', 'Document4', 'Document5', 'Document6', 'Document7', 'Document8', 'Document9', 'Document10']
Columns:
 ['about', 'am', 'amazing', 'an', 'awesome', 'beers', 'best', 'boss', 'can', 'deal', 'do', 'enemy', 'feel', 'good', 'he', 'horrible', 'is', 'like', 'love', 'my', 'not', 'of', 'place', 'restaurant', 'sandwich', 'stuff', 'sworn', 'these', 'this', 'tired', 'very', 'view', 'what', 'with', 'work']


Unnamed: 0,about,am,amazing,an,awesome,beers,best,boss,can,deal,...,stuff,sworn,these,this,tired,very,view,what,with,work
Document1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Document2,0,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Document3,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
Document4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
Document5,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
Document6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Document7,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
Document8,0,0,0,0,0,0,0,0,1,1,...,0,0,0,1,0,0,0,0,1,0
Document9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Document10,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [295]:
# Visualizing the output of CountVectorizer // *not required
print(f'The values of the matrix is the count of any given word w in any given document n:\n')
print(f'For example in Document 7 the word "am" is repeated ones\n')
print(f'df["Document7"]["am"] => {df.loc["Document7"]["am"]}')

The values of the matrix is the count of any given word w in any given document n:

For example in Document 7 the word "am" is repeated ones

df["Document7"]["am"] => 1


In [296]:
# Visualizing the output of CountVectorizer // *not required
print(f"\nFrequency of the word 'an' in the whole set of documents: {count_vect.vocabulary_.get('an')}")


Frequency of the word 'an' in the whole set of documents: 3


### TfidfTransformer

Typically, the tf-idf weight is composed by two terms:<br>
The first computes the normalized Term Frequency (TF), aka. the number of times a word appears in a document, divided by the total number of words in that document.<br>
The second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.
<br><br>
TF: Term Frequency, which measures how frequently a term occurs in a document.<br>
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
<br><br>
IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance.<br>
Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following:
<br><br>
IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

In [297]:
# Perform TdIdf Transform of the data
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Print TdIdf matrix
X_train_tfidf.toarray()

(10, 35)


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.6610807 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.6610807 ,
        0.        , 0.        , 0.        , 0.35488678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.53853307, 0.45780231, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.35609358, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.53853307, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28909975, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.40824829, 0.        , 0.        , 0.

## Prepare Model

### Multinomial Naïve-Bayesian Model
<a href="https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf" >Detailed explanation on Multinomial Naïve-Bayesian Classifier</a>

In [298]:
model = MultinomialNB()

# Train model with the vectorized matrix and target 
# A single training example would look something like this

#     X = [love this sandwich]
#     X = TfidfTransformer( CountVectorizer( X ) )
    
#     y = ['pos']
#     y = numerically_encode_class( y )
    
#     Training pair => (X, y)
model.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Predicting the Outcome

In [299]:
# Sample documents to test our model
docs_new = ['love this sandwich	', 'I do not like this restauran']

# Perform countvectorization and Tfidf transform on the sample documents
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Feed sample document to model to get prediction
predicted = model.predict(X_new_tfidf)

# Print predictions
for doc, category in zip(docs_new, predicted):
    print(f'{doc} => {classes[category]}')


love this sandwich	 => pos
I do not like this restauran => neg


### compute performance of model with test data

__[Precision, Recall & Accuracy](https://medium.com/@shivangisareen/precision-recall-accuracy-6a214187f059)__<br>
__[Confusion matrix and other metrics in machine learning](https://medium.com/hugo-ferreiras-blog/confusion-matrix-and-other-metrics-in-machine-learning-894688cb1c0a)__

In [300]:
# Perform countvectorization and Tfidf transform on the sample documents
X_test_cv = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_cv)

# Predict outcome
result = model.predict(X_test_tfidf)

In [301]:
# Compute accuracy
accuracy = np.mean(result == np.array(y_test))
print(f'Accuracy: {accuracy*100}%')

Accuracy: 87.5%


In [302]:
# Compute precision and recall values
print(metrics.classification_report(y_test, result,
                                    target_names=classes))

              precision    recall  f1-score   support

         neg       1.00      0.75      0.86         4
         pos       0.80      1.00      0.89         4

    accuracy                           0.88         8
   macro avg       0.90      0.88      0.87         8
weighted avg       0.90      0.88      0.87         8

