# OMPHEMETSE MANGOPE 
Advanced Machine Learning                     
Text Classification Project 



In [157]:
import nltk
import string
import pandas as pd
from itertools import chain
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_excel('reviews.xlsx')
data.head()

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


# Model building

In [158]:
def Naive_bayes(prior,likelihood):
    return(likelihood * prior)

# Exploratory Analysis

In [159]:
data.dtypes

Id         int64
Review    object
Label      int64
dtype: object

In [160]:
data.shape

(107018, 3)

In [161]:
 data['Label'].value_counts()

5    79173
4    18054
3     5071
1     2469
2     2251
Name: Label, dtype: int64

In [162]:
data.isnull().sum()

Id        0
Review    5
Label     0
dtype: int64

In [163]:
data=data.dropna()

In [164]:
data.isnull().sum()

Id        0
Review    0
Label     0
dtype: int64

# Text preprocessing 

In [165]:
# Tokenization aims at splitting the text input sequences in tokens, or single words/characters 

# Tokenization: word_tokenize is adopted from nltk lbrary
data['tokens'] = data.apply(lambda row:nltk.word_tokenize(row['Review']), axis=1)

# Converting the a sequence list into string type
data['tokens'] = data['tokens'].astype(str)

data.head(3)

Unnamed: 0,Id,Review,Label,tokens
0,0,good and interesting,5,"['good', 'and', 'interesting']"
1,1,"This class is very helpful to me. Currently, I...",5,"['This', 'class', 'is', 'very', 'helpful', 'to..."
2,2,like!Prof and TAs are helpful and the discussi...,5,"['like', '!', 'Prof', 'and', 'TAs', 'are', 'he..."


In [166]:
# nltk corpus has a list of words called stopwords which aims to remove words with less contribution,
# for making predictions
stop_words = nltk.corpus.stopwords.words("english") + list(string.punctuation)

X = []

for w in data['tokens']:
    if w not in stop_words:
        X.append(w)

In [167]:
# Stemming aims at removing and replacing suffixes of words to get to its root form, this is done 
# using some rules embedded on the stemmer. However, different stemmers exist, PorterStemmer() is one:

ps = nltk.stem.PorterStemmer()

X_ = []

for w in X:
    X_.append(ps.stem(w))

In [168]:
# The computer might differentiate two words with the same meaning due to its lowercase and uppercase,
# words, here the lowercase is chosen as the standard for all words. 

words = []

for w in X_:
    words.append(w.lower())
    
    
data['tokens'] = words

In [169]:
# The machines understand numerical language than textual language. Therefore, CountVectorizer is used 
# as a tool for turning words as features and then take note whether the word appear or not. If it 
# appears in document i, it gets a 1 and if doesn't appear it get a 0. Luckily,CountVectorizer also 
# has the stop_words, property.

vectorizer = CountVectorizer(stop_words='english')
tokens = vectorizer.fit_transform(data.tokens)

# furthermore, CountVectorizer has takes not of vocabulary, this vocabulary property counts how many 
# times a word appears in the text. 

vocab=len(vectorizer.vocabulary_) # Here we count the total count of unique words in the text

In [170]:
tokens.shape # Here we can see that we have 107013 docs and 33418 unique words which are repeated 
             # multiple times in the text data

(107013, 33418)

# Computing likelihood and prior 

In [186]:
# Word count of words per class
Label_1 = [data['tokens'] for index,data in data.iterrows() if data['Label'] ==1]
word_count_1 = len(Counter(chain(*Label_1))) 

Label_2 = [data['tokens'] for index,data in data.iterrows() if data['Label'] ==2]
word_count_2 = len(Counter(chain(*Label_2))) 

Label_3 = [data['tokens'] for index,data in data.iterrows() if data['Label'] ==3]
word_count_3 = len(Counter(chain(*Label_3))) 

Label_4 = [data['tokens'] for index,data in data.iterrows() if data['Label'] ==4]
word_count_4 = len(Counter(chain(*Label_4))) 

Label_5 = [data['tokens'] for index,data in data.iterrows() if data['Label'] ==5]
word_count_5 = len(Counter(chain(*Label_5))) 

# Number of data points in class C
N_1 = (data['Label']==1).sum()
N_2 = (data['Label']==2).sum()
N_3 = (data['Label']==3).sum()
N_4 = (data['Label']==4).sum()
N_5 = (data['Label']==5).sum()


# Total number of points
N = len(data)

# Pior of each class C
π_1 = N_1/N
π_2 = N_2/N
π_3 = N_3/N
π_4 = N_4/N
π_5 = N_5/N

# Likelihood for each class C
likelihood_5 = (word_count_5 + 1)/ (N_5 + abs(vocab)) 
likelihood_4 = (word_count_4 + 1)/ (N_4 + abs(vocab)) 
likelihood_3 = (word_count_3 + 1)/ (N_3 + abs(vocab)) 
likelihood_2 = (word_count_2 + 1)/ (N_2 + abs(vocab)) 
likelihood_1 = (word_count_1 + 1)/ (N_1 + abs(vocab))

# Posterior quantities for each class label 

In [189]:
Class_Label1 = Naive_bayes(likelihood_1, π_1)
Class_Label2 = Naive_bayes(likelihood_2, π_2)
Class_Label3 = Naive_bayes(likelihood_3, π_3)
Class_Label4 = Naive_bayes(likelihood_4, π_4)
Class_Label5 = Naive_bayes(likelihood_5, π_5)