## **Load Data**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re 
import nltk 


from sklearn import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report



In [18]:
import time

In [3]:
# Read cleaned data
train=pd.read_csv("/content/drive/MyDrive/ProjectAI/Data/Cleaned/clean_train.csv")
test=pd.read_csv("/content/drive/MyDrive/ProjectAI/Data/Cleaned/clean_test.csv")

In [4]:
train

Unnamed: 0,labels,text
0,1,menyrbie phil_gahan chrisitv
1,2,advice talk neighbours family exchange phone n...
2,2,coronavirus australia woolworths give elderly ...
3,2,food stock one empty please dont panic enough ...
4,0,ready go supermarket covid outbreak im paranoi...
...,...,...
41152,1,airline pilots offering stock supermarket shel...
41153,0,response complaint provided citing covid relat...
41154,2,know itâs getting tough kameronwilds rationing...
41155,1,wrong smell hand sanitizer starting turn coron...


In [5]:
test

Unnamed: 0,labels,text
0,0,trending new yorkers encounter empty supermark...
1,2,couldnt find hand sanitizer fred meyer turned ...
2,2,find protect loved ones coronavirus
3,0,panic buying hits newyork city anxious shopper...
4,1,toiletpaper dunnypaper coronavirus coronavirus...
...,...,...
3793,2,meanwhile supermarket israel people dance sing...
3794,0,panic buy lot nonperishable items echo needs f...
3795,1,asst prof economics cconces nbcphiladelphia ta...
3796,0,gov need somethings instead biar je rakyat ass...


In [6]:
# Replace Nan by ''
train.fillna('', inplace=True)
test.fillna('', inplace=True)

## **Extract Features**

In [8]:
# Extract features using Bags of words
from sklearn.feature_extraction.text import CountVectorizer

# vectorize train set
vectorizer = CountVectorizer()
message = vectorizer.fit_transform(train['text'])
message.shape

(41157, 59600)

In [9]:
X_train = message
y_train = train["labels"]

In [None]:
X_train

<41157x59600 sparse matrix of type '<class 'numpy.int64'>'
	with 709818 stored elements in Compressed Sparse Row format>

In [None]:
y_train

0        1
1        2
2        2
3        2
4        0
        ..
41152    1
41153    0
41154    2
41155    1
41156    0
Name: labels, Length: 41157, dtype: int64

In [10]:
#vectorize test set

message2 = vectorizer.transform(test['text'])
message2.shape

(3798, 59600)

In [11]:
X_test= message2 
y_test = test['labels']

## **Naive Bayes using sklearn library**




In [22]:
# Pick model
model= MultinomialNB()

start = time.time()
# train model
classifier= model.fit(X_train,y_train)

end = time.time()

In [13]:
train.text[0]

'menyrbie phil_gahan chrisitv   '

In [14]:
print(classifier.predict(X_train))
print(y_train.values)

[1 2 2 ... 2 2 0]
[1 2 2 ... 2 1 0]


In [15]:
# Result of train set

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(y_train, pred))
print("Accuracy: \n", accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84     15398
           1       0.91      0.50      0.64      7713
           2       0.80      0.90      0.85     18046

    accuracy                           0.81     41157
   macro avg       0.84      0.76      0.78     41157
weighted avg       0.82      0.81      0.81     41157


Confusion Matrix: 
 [[13374   212  1812]
 [ 1578  3845  2290]
 [ 1541   183 16322]]
Accuracy: 
 0.8149524989673689


In [16]:
print(classifier.predict(X_test))
print(y_test)

[0 2 2 ... 1 0 2]
0       0
1       2
2       2
3       0
4       1
       ..
3793    2
3794    0
3795    1
3796    0
3797    2
Name: labels, Length: 3798, dtype: int64


In [24]:
# Accuracy of test set
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print()
print("Accuracy: \n", accuracy_score(y_test, pred))
print('Time Train', end - start)

              precision    recall  f1-score   support

           0       0.69      0.76      0.73      1633
           1       0.66      0.14      0.23       619
           2       0.65      0.79      0.71      1546

    accuracy                           0.67      3798
   macro avg       0.67      0.56      0.56      3798
weighted avg       0.67      0.67      0.64      3798


Accuracy: 
 0.6700895208004213
Time Train 0.03255772590637207


In [None]:
# Predict a random comment
dict1=[]
dict1.append("I love you")
message3= vectorizer.transform(dict1)
pred= classifier.predict(message3)

if (pred== [1]):
    print ("Neutral")
elif (pred== [0]):
    print ("Negative")
else:
    print ("Positive")

Positive


##**Implement NaiveBayes code**


In [25]:
import warnings

from abc import ABCMeta, abstractmethod


import numpy as np
from scipy.special import logsumexp

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import label_binarize
from sklearn.utils import deprecated
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils.multiclass import _check_partial_fit_first_call
from sklearn.utils.validation import check_is_fitted, check_non_negative
from sklearn.utils.validation import _check_sample_weight





In [None]:
np.zeros((3,59600), dtype=np.float64)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
labelbin = LabelBinarizer()
Y= labelbin.fit_transform(y_train)
Y

array([[0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])

In [None]:
classes_ = labelbin.classes_
classes_

array([0, 1, 2])

In [None]:
X_train

<41157x59600 sparse matrix of type '<class 'numpy.int64'>'
	with 709818 stored elements in Compressed Sparse Row format>

In [None]:
cc = Y.sum(axis=0)
cc

array([15398,  7713, 18046])

In [None]:
fc= safe_sparse_dot(Y.T, X_train)

In [None]:
(fc +1 )/ (fc.sum(axis =1).reshape(-1,1))

array([[2.07139405e-05, 1.38092937e-05, 3.45232341e-06, ...,
        6.90464683e-06, 3.45232341e-06, 3.45232341e-06],
       [4.64106039e-05, 3.71284831e-05, 3.71284831e-05, ...,
        9.28212078e-06, 1.85642416e-05, 1.85642416e-05],
       [5.81500154e-06, 5.81500154e-06, 8.72250231e-06, ...,
        2.90750077e-06, 2.90750077e-06, 2.90750077e-06]])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [26]:
class MultinomialNB1():

    def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None):
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior = class_prior

    def _init_counters(self, n_classes, n_features):
        self.class_count_ = np.zeros(n_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64)
      
    #Count feature occurrences
    def _count(self, X, Y):
        self.feature_count_ += safe_sparse_dot(Y.T, X)
        self.class_count_ += Y.sum(axis=0)

    # Features probability
    def _update_feature_log_prob(self, alpha):
        smoothed_fc = self.feature_count_ + alpha
        smoothed_cc = smoothed_fc.sum(axis=1)
        self.feature_log_prob_ = np.log(smoothed_fc) - np.log(smoothed_cc.reshape(-1, 1) )

    def _joint_log_likelihood(self, X):
        # Calculate the posterior log probability of the samples X
        return safe_sparse_dot(X, self.feature_log_prob_.T) + self.class_log_prior_ 
      
    def predict(self, X):
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    # Classes probability
    def _update_class_log_prior(self, class_prior=None):
  
        n_classes = len(self.classes_) # 3
        '''
        if class_prior is not None:
            self.class_log_prior_ = np.log(class_prior)
        elif self.fit_prior:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                '''
        log_class_count = np.log(self.class_count_)

            # empirical prior, with sample_weight taken into account
        self.class_log_prior_ = log_class_count - np.log(self.class_count_.sum())
        '''
        else:
            
            self.class_log_prior_ = np.full(n_classes, -np.log(n_classes))
        '''
    
    #Fit Naive Bayes classifier according to X, y.
    def fit(self, X, y, sample_weight=None):
        _, n_features = X.shape
        
        # create one hot vector for labels
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_

        if Y.shape[1] == 1:
            if len(self.classes_) == 2:
                Y = np.concatenate((1 - Y, Y), axis=1)
            else:  # degenerate case: just one class
                Y = np.ones_like(Y)
        
        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_classes = Y.shape[1]
        self._init_counters(n_classes, n_features)
        self._count(X, Y)
        alpha = self.alpha
        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        return self

    '''
    @property
    def n_features_(self):
        return self.n_features_in_
  '''

In [30]:
# Train model
models= MultinomialNB1()
start= time.time()
classif= models.fit(X_train,y_train)
end= time.time()

In [None]:
# Result of train set

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred1 = classif.predict(X_train)
print(classification_report(y_train, pred1))
print()
print("Confusion Matrix: \n", confusion_matrix(y_train, pred1))
print("Accuracy: \n", accuracy_score(y_train, pred1))


              precision    recall  f1-score   support

           0       0.81      0.87      0.84     15398
           1       0.91      0.50      0.64      7713
           2       0.80      0.90      0.85     18046

    accuracy                           0.81     41157
   macro avg       0.84      0.76      0.78     41157
weighted avg       0.82      0.81      0.81     41157


Confusion Matrix: 
 [[13374   212  1812]
 [ 1578  3845  2290]
 [ 1541   183 16322]]
Accuracy: 
 0.8149524989673689


In [28]:
pred1= classif.predict(X_test)

In [31]:
# Accuracy of test set

print(classification_report(y_test, pred1))
print()

print("Accuracy: \n", accuracy_score(y_test, pred1))
print('Time Train', end - start)

              precision    recall  f1-score   support

           0       0.69      0.76      0.73      1633
           1       0.66      0.14      0.23       619
           2       0.65      0.79      0.71      1546

    accuracy                           0.67      3798
   macro avg       0.67      0.56      0.56      3798
weighted avg       0.67      0.67      0.64      3798


Accuracy: 
 0.6700895208004213
Time Train 0.0333251953125


In [None]:
# Predict a random comment
dict1=[]
dict1.append("I'm tired ")
message3= vectorizer.transform(dict1)
pred= classifier.predict(message3)

if (pred== [1]):
    print ("Neutral")
elif (pred== [0]):
    print ("Negative")
else:
    print ("Positive")

Negative
