# OMPHEMETSE MANGOPE 
Advanced Machine Learning                     
Text Classification Project                   
Due Date: 27 July 2020


In [162]:
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

data = pd.read_excel('reviews.xlsx')
data.head()

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


# Naive Bayes model building

In [163]:
class NaiveBayes_classifer:
    
    def __init__(self):
        self.N_c = None
        self.π_c = None
        self.Njc = None 
        self.lk_word = None
        self.fitted = False
        
        
    def fit(self, x, y):
        M = np.c_[x, y]
        N = x.shape[0]
        self.N_c = np.array([M[y == c] for c in np.unique(y)])
        self.π_c = np.array([len(Nc_class) / N for Nc_class in self.N_c])
        self.Njc = np.array([njc.sum(axis=0) for njc in ([M[y == c] [:,:x.shape[1]] for c in np.unique(y)])])
        self.lk_word = (self.Njc + 1) / self.Njc.sum(axis=1).reshape(-1, 1)
        
        self.fitted = True
         
        return(self)
    
    def predict(self, x):
        
        assert self.fitted
    
        class_values = np.zeros(shape=(x.shape[0], self.π_c.shape[0]))

        for i,c in enumerate(x):
            
            word_exists = c.astype(bool)
            lk_words_present = self.lk_word[:, word_exists] ** c[word_exists]
            lk_message = (lk_words_present).prod(axis=1)
            class_values[i] = lk_message * self.π_c
        
             
        
        return (class_values.argmax(axis=1) +1)
    
    def score(self, y,y_i):
        n_correct = (y == y_i).sum()
        n_incorrect = y_i.size - n_correct
        incorrect_fraction = n_incorrect /(n_incorrect + n_correct)
        return incorrect_fraction

        #print(f'Number of correctly classified samples: {n_correct}')
        #print(f"Number of incorrectly classified samples: {n_incorrect}")
        #print(f'Classification accuracy is: {np.round(1-incorrect_fraction,2) * 100} %')

# Exploratory Analysis

In [164]:
data.dtypes

Id         int64
Review    object
Label      int64
dtype: object

In [165]:
data.shape

(107018, 3)

In [166]:
# Number of documents per class
data['Label'].value_counts()

5    79173
4    18054
3     5071
1     2469
2     2251
Name: Label, dtype: int64

In [167]:
#Number of missing values
data.isnull().sum()

Id        0
Review    5
Label     0
dtype: int64

The data has 5 missing reviews, therefore we are going to drop them

In [168]:
data=data.dropna() # documents where missing values exist are dropped 

In [169]:
data.isnull().sum() # Now the missing values are absent

Id        0
Review    0
Label     0
dtype: int64

# Text preprocessing 

In [170]:
# Tokenization aims at splitting the text input sequences in tokens, or single words/characters 

# Tokenization: word_tokenize is adopted from nltk lbrary
data['tokens'] = data.apply(lambda row:nltk.word_tokenize(row['Review']), axis=1)

# Converting the a sequence list into string type
data['tokens'] = data['tokens'].astype(str)

data.head(3)

Unnamed: 0,Id,Review,Label,tokens
0,0,good and interesting,5,"['good', 'and', 'interesting']"
1,1,"This class is very helpful to me. Currently, I...",5,"['This', 'class', 'is', 'very', 'helpful', 'to..."
2,2,like!Prof and TAs are helpful and the discussi...,5,"['like', '!', 'Prof', 'and', 'TAs', 'are', 'he..."


In [171]:
# nltk corpus has a list of words called stopwords which aims to remove words with less contribution,
# for making predictions
stop_words = nltk.corpus.stopwords.words("english") + list(string.punctuation)

X = []

for w in data['tokens']:
    if w not in stop_words:
        X.append(w)

In [172]:
# Stemming aims at removing and replacing suffixes of words to get to its root form, this is done 
# using some rules embedded on the stemmer. However, different stemmers exist, PorterStemmer() is one:

ps = nltk.stem.PorterStemmer()

X_ = []

for w in X:
    X_.append(ps.stem(w))

In [173]:
# The computer might differentiate two words with the same meaning due to its lowercase and uppercase,
# words, here the lowercase is chosen as the standard for all words. 

words = []

for w in X_:
    words.append(w.lower())
    
    
data['tokens'] = words

# Data Transformation

In [174]:
data.head()

Unnamed: 0,Id,Review,Label,tokens
0,0,good and interesting,5,"['good', 'and', 'interesting']"
1,1,"This class is very helpful to me. Currently, I...",5,"['this', 'class', 'is', 'very', 'helpful', 'to..."
2,2,like!Prof and TAs are helpful and the discussi...,5,"['like', '!', 'prof', 'and', 'tas', 'are', 'he..."
3,3,Easy to follow and includes a lot basic and im...,5,"['easy', 'to', 'follow', 'and', 'includes', 'a..."
4,4,Really nice teacher!I could got the point eazl...,4,"['really', 'nice', 'teacher', '!', 'i', 'could..."


In [175]:
vec = MultiLabelBinarizer() 
Xd = vec.fit_transform(data["tokens"]) # Transforming features to 0 and 1
Yd = data['Label'] 
M = np.c_[Xd,Yd] # consolidating the 

In [176]:
Xd.shape # shape of reviews/tokens 

(107013, 144)

After utilizing the MultiLabelBinarizer we see that there are 107013 documents (5 removed due to missing reviews) and 144 unique features(unique words)  

In [177]:
vocab = Xd.shape[1] # total number of unique words in the dataset 
vocab

144

# Cross validation

In [178]:
X_train,X_test,y_train,y_test = train_test_split(Xd,Yd, test_size=0.30)

# Model training and prediction

In [179]:
NB = NaiveBayes_classifer()
fit = NB.fit(X_train, y_train)
y_pred = fit.predict(X_test)

# Model evaluation

In [218]:
n_correct = (y_test == y_pred).sum()
n_incorrect = y_test.size - n_correct
incorrect_fraction = n_incorrect /(n_incorrect + n_correct)

print(f'Number of correctly classified samples: {n_correct}')
print(f"Number of incorrectly classified samples: {n_incorrect}")
print(f'Classification accuracy is: {(1-incorrect_fraction) * 100} %')

Number of correctly classified samples: 23310
Number of incorrectly classified samples: 8794
Classification accuracy is: 72.60777473212062 %


# Prediction Cost Estimation

In [181]:
def loss(y,y_i):
    return(np.sum(y - np.log(y_i)))

cost = loss(y_test, y_pred)
print(f'The error rate for misclassification: {cost}')

The error rate for misclassification: 96934.47745455231


# Sklearn MultinomialNB

In [198]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [194]:
Nb = MultinomialNB()
fit = Nb.fit(X_train, y_train)
pre = fit.predict(X_test)

In [217]:
n_correct = (y_test == pre).sum()
n_incorrect = y_test.size - n_correct
incorrect_fraction = n_incorrect /(n_incorrect + n_correct)

print(f'Number of correctly classified samples: {n_correct}')
print(f"Number of incorrectly classified samples: {n_incorrect}")
print(f'Classification accuracy is: {(1-incorrect_fraction) * 100} %')

Number of correctly classified samples: 23333
Number of incorrectly classified samples: 8771
Classification accuracy is: 72.67941689509095 %


In [219]:
Cross_val = cross_val_score(Nb, Xd, Yd, cv=5)
Cross_val.mean()

0.7271453747682972

In [215]:
pd.DataFrame({
    'Folds':['fold 1','fold 2','fold 3','fold 4','fold 5'],
    'Accuracy':Cross_val})

Unnamed: 0,Folds,Accuracy
0,fold 1,0.726092
1,fold 2,0.729758
2,fold 3,0.724665
3,fold 4,0.725586
4,fold 5,0.729626
