# Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
import string

import nltk
from nltk.corpus import stopwords
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Data Preprocessing

> **About Dataset**

> IMDB dataset having 50K movie reviews for natural language processing or Text analytics.

> This is a dataset for binary sentiment classification

> a set of 25,000 highly polar movie reviews for training and 25,000 for testing is provided

> For more dataset information, please go through the following link :
http://ai.stanford.edu/~amaas/data/sentiment/

In [2]:
def data_read():
    
    """function to read data into pandas dataframe,
    further convert sentiment column into numerical values"""
    
    global df
    
    df = df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
    df['sentiment']= df['sentiment'].apply(lambda x : 1 if x=='positive' else 0)
    
def html_tag_remover():
    
    """function to remove html tags using regex, 
    and store a copy of dataframe in variable"""
    
    global df_removed_tag
    
    df['review'] = df['review'].str.replace(r'<[^<>]*>', '', regex = True)
    df_removed_tag = df
    
def url_remover():
    
    """function to remove url using regex, 
    and store a copy of dataframe in variable"""
    
    global df_removed_url
    
    df['review'] = df['review'].str.replace(r'https ? ://\s+|www\.\s+', '', regex = True)
    df_removed_tag = df
    
def lowercase():
    
    """function to convert review into lowercase, 
    and store a copy of dataframe in variable"""
    
    global df_lower
    
    df['review'] = df['review'].str.lower()
    df_lower = df
    
def punctuation_remover():
    
    """function to remove punctuation using regex, 
    and store a copy of dataframe in variable"""
    
    global df_punc_removed
    
    df['review'] = df['review'].str.replace('[{}]'.format(string.punctuation), '', regex = True)
    df_punc_removed = df
    
def stopword_remover():
    
    """function to remove stopwords, 
    and store a copy of dataframe in variable"""
    
    global df_stopword_removed
    
    df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words('english'))]))
    df_stopword_removed = df
    
def lemmatize_text():
    
    """function to lemmatize reviews, 
    and store a copy of dataframe in variable"""
    
    global df_lemmatized
    
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    df['review'] = df['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)]))
    df_lemmatized =  df

In [None]:
def data_preprocess():
    
    data_read()
    html_tag_remover()
    url_remover()
    lowercase()
    punctuation_remover()
    stopword_remover()
    lemmatize_text()
    
    return

# classifier

In [10]:
data_preprocess()

"""split the dataset
75% for training
25% for testing"""

train, test = train_test_split(df, test_size = 0.25, random_state = 42)

X_train, y_train = train['review'], train['sentiment']
X_test, y_test = test['review'], test['sentiment']

In [12]:
"""Natural Language Processing technique of text modeling known as Bag of Words model. 
> Whenever we apply any algorithm in NLP, it works on numbers. 
> We cannot directly feed our text into that algorithm. 
> Hence, Bag of Words model is used to preprocess the text by converting it into a bag of words, 
> which keeps a count of the total occurrences of most frequently used words"""

tfidf = TfidfVectorizer()
x_train_vector = tfidf.fit_transform(X_train)
x_train_vector.toarray()

x_test_vector = tfidf.transform(X_test)

In [16]:
multi_clf = MultinomialNB()
multi_clf.fit(x_train_vector, y_train.values)

predict_NB = multi_clf.predict(x_test_vector)

In [18]:
print("Classification Report: \n\n", classification_report(y_test, predict_NB))
print("Confusion Matrix: \n\n", confusion_matrix(y_test, predict_NB))
print("Accuracy: \n\n", accuracy_score(y_test, predict_NB))

Classification Report: 

               precision    recall  f1-score   support

           0       0.85      0.89      0.87      6157
           1       0.88      0.84      0.86      6343

    accuracy                           0.87     12500
   macro avg       0.87      0.87      0.87     12500
weighted avg       0.87      0.87      0.87     12500

Confusion Matrix: 

 [[5459  698]
 [ 989 5354]]
Accuracy: 

 0.86504


# result

* creating a dataframe from test data 
* review column contains reviews
* sentiment column contains actual sentiments
* predicted_sentiment column contains predicted sentiments

In [28]:
dataset_predict = X_test.copy()
dataset_predict = pd.DataFrame(dataset_predict)
dataset_predict.columns = ['review']
dataset_predict = dataset_predict.reset_index()
dataset_predict = dataset_predict.drop(['index'], axis = 1)

test_actual_label = (y_test.values).copy()
test_actual_label = pd.DataFrame(test_actual_label)
test_actual_label.columns = ['sentiment']
test_actual_label['sentiment'] = test_actual_label['sentiment'].replace({1: 'positive', 0: 'negative'})

test_predicted_label = predict_NB.copy()
test_predicted_label = pd.DataFrame(test_predicted_label)
test_predicted_label.columns = ['predicted_sentiment']
test_predicted_label['predicted_sentiment'] = test_predicted_label['predicted_sentiment'].replace({1: 'positive', 0: 'negative'})

test_result = pd.concat([dataset_predict, test_actual_label, test_predicted_label], axis=1)
test_result

Unnamed: 0,review,sentiment,predicted_sentiment
0,really liked summerslam due look arena curtain...,positive,positive
1,many television show appeal quite many differe...,positive,positive
2,film quickly get major chase scene ever increa...,negative,negative
3,jane austen would definitely approve onegwynet...,positive,positive
4,expectation somewhat high went see movie thoug...,negative,negative
...,...,...,...
12495,first separate story film story second continu...,negative,negative
12496,obvious flawhorrible horrible script movie pot...,negative,negative
12497,brilliance movie even competent dentist pretty...,positive,negative
12498,yaitate japan really fun show really like show...,positive,positive
