In [66]:
import numpy as np
import pandas as pd

import re
import string

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [17]:
def data_read():
    
    """function to read data into pandas dataframe,
    further convert sentiment column into numerical values"""
    
    global df
    
    df = pd.read_csv("data/IMDB Dataset.csv")
    df = df.head(int(len(df)/5))
    df['sentiment']= df['sentiment'].apply(lambda x : 1 if x=='positive' else 0)
    
def html_tag_remover():
    
    """function to remove html tags using regex, 
    and store a copy of dataframe in variable"""
    
    global df_removed_tag
    
    df['review'] = df['review'].str.replace(r'<[^<>]*>', '', regex = True)
    df_removed_tag = df
    
def url_remover():
    
    """function to remove url using regex, 
    and store a copy of dataframe in variable"""
    
    global df_removed_url
    
    df['review'] = df['review'].str.replace(r'https ? ://\s+|www\.\s+', '', regex = True)
    df_removed_tag = df
    
def lowercase():
    
    """function to convert review into lowercase, 
    and store a copy of dataframe in variable"""
    
    global df_lower
    
    df['review'] = df['review'].str.lower()
    df_lower = df
    
def punctuation_remover():
    
    """function to remove punctuation using regex, 
    and store a copy of dataframe in variable"""
    
    global df_punc_removed
    
    df['review'] = df['review'].str.replace('[{}]'.format(string.punctuation), '', regex = True)
    df_punc_removed = df
    
def stopword_remover():
    
    """function to remove stopwords, 
    and store a copy of dataframe in variable"""
    
    global df_stopword_removed
    
    df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words('english'))]))
    df_stopword_removed = df
    
def lemmatize_text():
    
    """function to lemmatize reviews, 
    and store a copy of dataframe in variable"""
    
    global df_lemmatized
    
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    df['review'] = df['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)]))
    df_lemmatized =  df

In [18]:
def data_preprocess():
    
    data_read()
    html_tag_remover()
    url_remover()
    lowercase()
    punctuation_remover()
    stopword_remover()
    lemmatize_text()
    
    return

In [19]:
data_preprocess()

"""split the dataset
75% for training
25% for testing"""

train, test = train_test_split(df, test_size = 0.25, random_state = 42)

X_train, y_train = train['review'], train['sentiment']
X_test, y_test = test['review'], test['sentiment']

In [50]:
"""Natural Language Processing technique of text modeling known as Bag of Words model. 
> Whenever we apply any algorithm in NLP, it works on numbers. 
> We cannot directly feed our text into that algorithm. 
> Hence, Bag of Words model is used to preprocess the text by converting it into a bag of words, 
> which keeps a count of the total occurrences of most frequently used words"""

#Convert a collection of text documents to a matrix of token counts

cnt_vec = CountVectorizer(ngram_range = (1, 3), binary = True)
x_train_vector = cnt_vec.fit_transform(X_train)
x_test_vector = cnt_vec.transform(X_test)

#Convert a collection of raw documents to a matrix of TF-IDF features.

tfidf = TfidfVectorizer()
x_train_vector_2 = tfidf.fit_transform(X_train)
x_test_vector_2 = tfidf.transform(X_test)

In [55]:
def result(y_pred):
    
    """This function will show results"""
    
    print("Classification Report: \n\n", classification_report(y_test, y_pred))
    print("Confusion Matrix: \n\n", confusion_matrix(y_test, y_pred))
    print("Accuracy: \n\n", accuracy_score(y_test, y_pred))

In [52]:
def multinomial_NaiveB(train_data_vector, test_data_vector):
    
    """function to produce results
    used Multinomial naive Bayes Model"""
    
    multi_clf = MultinomialNB()
    multi_clf.fit(train_data_vector, y_train.values)

    predict_NB = multi_clf.predict(test_data_vector)

    return result(predict_NB)

In [53]:
multinomial_NaiveB(x_train_vector, x_test_vector)

Classification Report: 

               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1220
           1       0.90      0.82      0.86      1280

    accuracy                           0.86      2500
   macro avg       0.86      0.86      0.86      2500
weighted avg       0.86      0.86      0.86      2500

Confusion Matrix: 

 [[1103  117]
 [ 229 1051]]
Accuracy: 

 0.8616


In [54]:
multinomial_NaiveB(x_train_vector_2, x_test_vector_2)

Classification Report: 

               precision    recall  f1-score   support

           0       0.82      0.89      0.85      1220
           1       0.89      0.81      0.85      1280

    accuracy                           0.85      2500
   macro avg       0.85      0.85      0.85      2500
weighted avg       0.85      0.85      0.85      2500

Confusion Matrix: 

 [[1087  133]
 [ 239 1041]]
Accuracy: 

 0.8512


In [57]:
def Linear_SVC(train_data_vector, test_data_vector):
    
    """function to produce results
    used Linear SVC Kernel Model"""

    linear_svc = LinearSVC(C = 0.5, random_state = 42)
    linear_svc.fit(train_data_vector, y_train.values)

    predict_Lin_Svc = linear_svc.predict(test_data_vector)

    return result(predict_Lin_Svc)

In [56]:
Linear_SVC(x_train_vector, x_test_vector)

Classification Report: 

               precision    recall  f1-score   support

           0       0.88      0.86      0.87      1220
           1       0.87      0.89      0.88      1280

    accuracy                           0.87      2500
   macro avg       0.87      0.87      0.87      2500
weighted avg       0.87      0.87      0.87      2500

Confusion Matrix: 

 [[1049  171]
 [ 143 1137]]
Accuracy: 

 0.8744




In [58]:
Linear_SVC(x_train_vector_2, x_test_vector_2)

Classification Report: 

               precision    recall  f1-score   support

           0       0.88      0.86      0.87      1220
           1       0.87      0.89      0.88      1280

    accuracy                           0.88      2500
   macro avg       0.88      0.87      0.88      2500
weighted avg       0.88      0.88      0.88      2500

Confusion Matrix: 

 [[1045  175]
 [ 137 1143]]
Accuracy: 

 0.8752


In [61]:
def Random_Forest(train_data_vector, test_data_vector):
    
    """function to produce results
    used Random Forest Model"""

    forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    forest.fit(train_data_vector, y_train.values)

    forest_predict = forest.predict(test_data_vector)

    return result(forest_predict)

In [62]:
Random_Forest(x_train_vector, x_test_vector)

Classification Report: 

               precision    recall  f1-score   support

           0       0.75      0.80      0.77      1220
           1       0.79      0.74      0.77      1280

    accuracy                           0.77      2500
   macro avg       0.77      0.77      0.77      2500
weighted avg       0.77      0.77      0.77      2500

Confusion Matrix: 

 [[974 246]
 [329 951]]
Accuracy: 

 0.77


In [63]:
Random_Forest(x_train_vector_2, x_test_vector_2)

Classification Report: 

               precision    recall  f1-score   support

           0       0.72      0.82      0.76      1220
           1       0.80      0.69      0.74      1280

    accuracy                           0.75      2500
   macro avg       0.76      0.75      0.75      2500
weighted avg       0.76      0.75      0.75      2500

Confusion Matrix: 

 [[1002  218]
 [ 399  881]]
Accuracy: 

 0.7532


In [67]:
def Logistic_regression(train_data_vector, test_data_vector):
    
    """function to produce results
    used Logistic Regression Model"""

    l_regression = LogisticRegression(random_state = 0)
    l_regression.fit(train_data_vector, y_train.values)

    regression_predict = l_regression.predict(test_data_vector)

    return result(regression_predict)

In [68]:
Logistic_regression(x_train_vector, x_test_vector)

Classification Report: 

               precision    recall  f1-score   support

           0       0.88      0.86      0.87      1220
           1       0.87      0.89      0.88      1280

    accuracy                           0.87      2500
   macro avg       0.87      0.87      0.87      2500
weighted avg       0.87      0.87      0.87      2500

Confusion Matrix: 

 [[1047  173]
 [ 144 1136]]
Accuracy: 

 0.8732


In [69]:
Logistic_regression(x_train_vector_2, x_test_vector_2)

Classification Report: 

               precision    recall  f1-score   support

           0       0.87      0.85      0.86      1220
           1       0.86      0.88      0.87      1280

    accuracy                           0.87      2500
   macro avg       0.87      0.87      0.87      2500
weighted avg       0.87      0.87      0.87      2500

Confusion Matrix: 

 [[1038  182]
 [ 149 1131]]
Accuracy: 

 0.8676
