IMDB MOVIE REVIEW

**Task:** *The goal of this project is to predict whether each IMDB movie review is positive or negative, using classification models.*

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from warnings import filterwarnings
filterwarnings(action='ignore')

In [None]:
# Read the review dataset from the Excel workbook into `df`.
DATASET_PATH = "IMDB_dataset.xlsx"
df = pd.read_excel(DATASET_PATH)

# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,positive
1,"Probably my all-time favorite movie, a story o...",positive
2,I sure would like to see a resurrection of a u...,positive
3,"This show was an amazing, fresh & innovative i...",negative
4,Encouraged by the positive comments about this...,negative


In [None]:
# Summarize the columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [None]:
! pip install nltk



### Removing punctuation

In [None]:
from nltk.tokenize import RegexpTokenizer
import re
import string
import math

# Lower-case every review so later token comparisons are case-insensitive.
df["review"] = df["review"].str.lower()


# Characters stripped from every review (ASCII punctuation).
PUNCT_TO_REMOVE = string.punctuation


def punctuation_removal(text):
    """Return ``text`` with every character in ``PUNCT_TO_REMOVE`` deleted.

    Characters are removed outright (not replaced by a space), so
    "all-time" becomes "alltime".
    """
    # Mapping each punctuation character to None tells translate() to drop it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

# Strip punctuation from every review, then preview the result.
# The preview uses `df`: the name `imdb` is never defined in this notebook,
# so referencing it raises NameError on a fresh kernel.
df["review"] = df["review"].apply(punctuation_removal)
df.head()

Unnamed: 0,review,sentiment
0,i thought this was a wonderful way to spend ti...,positive
1,probably my alltime favorite movie a story of ...,positive
2,i sure would like to see a resurrection of a u...,positive
3,this show was an amazing fresh innovative ide...,negative
4,encouraged by the positive comments about this...,negative


### Performing Tokenization

In [None]:
from nltk.tokenize import RegexpTokenizer

def tokenizer_fun(txt):
    """Split ``txt`` into a list of tokens on runs of whitespace.

    Parameters
    ----------
    txt : str
        The text to tokenize.

    Returns
    -------
    list
        The result of ``RegexpTokenizer.tokenize`` with the pattern used as
        the separator (``gaps=True``).
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    regexp_tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    return regexp_tokenizer.tokenize(txt)

# Tokenize every review in `df` (the frame loaded at the top of the notebook).
# `imdb` is never defined here, so using it fails on Restart & Run All.
df['review'] = df['review'].apply(tokenizer_fun)
df.head()

Unnamed: 0,review,sentiment
0,"[i, thought, this, was, a, wonderful, way, to,...",positive
1,"[probably, my, alltime, favorite, movie, a, st...",positive
2,"[i, sure, would, like, to, see, a, resurrectio...",positive
3,"[this, show, was, an, amazing, fresh, innovati...",negative
4,"[encouraged, by, the, positive, comments, abou...",negative


### Removing stopwords

In [None]:
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is distributed separately from the nltk package.
# Download it only when it is missing, so a fresh environment does not fail
# with LookupError and repeated runs do not hit the network.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# English stop words plus two extra tokens that should also be dropped.
stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')

def stopwords_removal(text):
    """Drop every token found in ``stop_words`` and return the rest as one string.

    Parameters
    ----------
    text : str or list/tuple of str
        Either raw text (split on whitespace) or an already tokenized review.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if isinstance(text, (list, tuple)):
        # Already tokenized: use the tokens directly. Calling str() on a list
        # yields fragments such as "['i'," that never match a stop word.
        tokens = text
    else:
        tokens = str(text).split()
    return " ".join(word for word in tokens if word not in stop_words)

# Remove stop words from every review.
df['review'] = df['review'].apply(stopwords_removal)

In [None]:
# Preview the reviews after stop-word removal.
df.head()

Unnamed: 0,review,sentiment
0,"['i', 'thought', 'this', 'was', 'a', 'wonderfu...",positive
1,"['probably', 'my', 'alltime', 'favorite', 'mov...",positive
2,"['i', 'sure', 'would', 'like', 'to', 'see', 'a...",positive
3,"['this', 'show', 'was', 'an', 'amazing', 'fres...",negative
4,"['encouraged', 'by', 'the', 'positive', 'comme...",negative


### Lemmatize/Stem

In [None]:
from nltk.stem.porter import PorterStemmer

# Single shared Porter stemmer instance, used by stemming() below.
stemmer = PorterStemmer()
def stemming(text):
    """Stem each whitespace-separated word of ``text`` with the shared ``stemmer``.

    Returns the stemmed words joined by single spaces.
    """
    stemmed_words = map(stemmer.stem, text.split())
    return " ".join(stemmed_words)

# Stem every review, then preview the result.
df["review"] = df["review"].apply(stemming)
df.head()

Unnamed: 0,review,sentiment
0,"['i', 'thought', 'this', 'was', 'a', 'wonderfu...",positive
1,"['probably', 'my', 'alltime', 'favorite', 'mov...",positive
2,"['i', 'sure', 'would', 'like', 'to', 'see', 'a...",positive
3,"['this', 'show', 'was', 'an', 'amazing', 'fres...",negative
4,"['encouraged', 'by', 'the', 'positive', 'comme...",negative


In [None]:
# Encode the text labels as integers: negative -> 0, positive -> 1.
# NOTE(review): this cell is not idempotent - running it a second time maps
# the already-encoded 0/1 values through the same dict again.
mapping = dict(negative=0, positive=1)
df['sentiment'] = df['sentiment'].map(mapping)

df.head()

Unnamed: 0,review,sentiment
0,"['i', 'thought', 'this', 'was', 'a', 'wonderfu...",1
1,"['probably', 'my', 'alltime', 'favorite', 'mov...",1
2,"['i', 'sure', 'would', 'like', 'to', 'see', 'a...",1
3,"['this', 'show', 'was', 'an', 'amazing', 'fres...",0
4,"['encouraged', 'by', 'the', 'positive', 'comme...",0


In [None]:
from sklearn.model_selection import train_test_split

# Draw a class-balanced subsample: 1000 positive and 1000 negative reviews.
# A fixed random_state makes the draw, and therefore every score computed
# downstream, repeatable across kernel restarts.
positive = df[df['sentiment'] == 1].sample(1000, random_state=12)
negative = df[df['sentiment'] == 0].sample(1000, random_state=12)

bal_data = pd.concat([positive, negative], ignore_index=True)

# Split the data into features (X) and target (y)
X = bal_data['review']
y = bal_data['sentiment']

In [None]:
# Model inputs (review text) and targets (encoded sentiment) taken from the
# balanced sample.
feature, label = bal_data['review'], bal_data['sentiment']

In [None]:
from sklearn.model_selection import cross_val_score

# Hold out 40% of the balanced sample for testing; the fixed seed keeps the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    label,
    test_size=0.4,
    shuffle=True,
    random_state=12,
)

### Perform TFIDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the training split only, then reuse the fitted vectorizer to
# transform the test split so no test data is used during fitting.
vactorize_x_train = tfidf.fit_transform(X_train)
vactorize_x_test = tfidf.transform(X_test)

### Random forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def print_result(outputs):
    """Print the best parameters and the per-candidate scores of a fitted search.

    ``outputs`` must expose ``best_params_`` and a ``cv_results_`` mapping with
    the keys 'mean_test_score', 'std_test_score' and 'params'. One line is
    printed per candidate: mean score, twice the standard deviation, and the
    candidate's parameters (numbers rounded to 3 decimals).
    """
    print('Parameter: {}\n'.format(outputs.best_params_))

    results = outputs.cv_results_
    rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for avg_score, score_std, candidate in rows:
        print('{} (+/-{}) for {}'.format(round(avg_score, 3), round(score_std * 2, 3), candidate))

# Seed the forest so the grid-search scores and the selected parameters are
# repeatable; an unseeded forest gives different results on every run.
rf = RandomForestClassifier(random_state=12)
params = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

# 5-fold cross-validated search over every parameter combination.
cv = GridSearchCV(rf, params, cv=5)
cv.fit(vactorize_x_train, y_train)

print_result(cv)

In [None]:
# Keep the best random-forest estimator now: `cv` is reassigned by the
# XGBoost grid search further down.
best_RF_model=cv.best_estimator_

### XGBoost

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

def print_result(outputs):
    """Print the best parameters and the per-candidate scores of a fitted search.

    Duplicate of the helper already defined in the Random forest section; kept
    so this cell still runs on its own.

    ``outputs`` must expose ``best_params_`` and a ``cv_results_`` mapping with
    the keys 'mean_test_score', 'std_test_score' and 'params'. One line is
    printed per candidate: mean score, twice the standard deviation, and the
    candidate's parameters (numbers rounded to 3 decimals).
    """
    print('Parameter: {}\n'.format(outputs.best_params_))

    results = outputs.cv_results_
    rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for avg_score, score_std, candidate in rows:
        print('{} (+/-{}) for {}'.format(round(avg_score, 3), round(score_std * 2, 3), candidate))

# Grid-search an XGBoost classifier over tree count and learning rate.
# Note: the variable name `xgboost` matches the package name; only the
# XGBClassifier class is imported here, so nothing is shadowed in this cell.
xgboost = XGBClassifier()
params = {
    'n_estimators': [5, 50, 250],
    'learning_rate': [0.01, 0.1, 0.15],
}

# 5-fold cross-validated search; this reassigns `cv` from the previous section.
cv = GridSearchCV(estimator=xgboost, param_grid=params, cv=5)
cv.fit(vactorize_x_train, y_train)

print_result(cv)

Parameter: {'learning_rate': 0.1, 'n_estimators': 50}

0.668 (+/-0.077) for {'learning_rate': 0.01, 'n_estimators': 5}
0.688 (+/-0.103) for {'learning_rate': 0.01, 'n_estimators': 50}
0.744 (+/-0.113) for {'learning_rate': 0.01, 'n_estimators': 250}
0.682 (+/-0.082) for {'learning_rate': 0.1, 'n_estimators': 5}
0.771 (+/-0.093) for {'learning_rate': 0.1, 'n_estimators': 50}
0.769 (+/-0.09) for {'learning_rate': 0.1, 'n_estimators': 250}
0.691 (+/-0.059) for {'learning_rate': 0.15, 'n_estimators': 5}
0.758 (+/-0.059) for {'learning_rate': 0.15, 'n_estimators': 50}
0.77 (+/-0.083) for {'learning_rate': 0.15, 'n_estimators': 250}


In [None]:
# Keep the best XGBoost estimator found by the grid search above.
best_XGBoost_model=cv.best_estimator_

### Final evaluation of models

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def model_evoluting(name, model, features, labels):
    """Print accuracy, weighted precision and weighted recall for ``model``.

    Parameters
    ----------
    name : label printed in front of the scores.
    model : object with a ``predict(features)`` method.
    features : inputs passed to ``model.predict``.
    labels : true targets the predictions are compared against.

    All three scores are rounded to 3 decimals. Nothing is returned.
    """
    predicted = model.predict(features)
    acc = round(accuracy_score(labels, predicted), 3)
    prec = round(precision_score(labels, predicted, average='weighted'), 3)
    rec = round(recall_score(labels, predicted, average='weighted'), 3)
    report = '{} -- Accuracy: {} / Precision: {} / Recall: {}'.format(name, acc, prec, rec)
    print(report)

In [None]:
# Score the tuned random forest on the held-out test split.
model_evoluting("Random Forest", best_RF_model, vactorize_x_test, y_test)

Random Forest -- Accuracy: 0.788 / Precision: 0.788 / Recall: 0.788


In [None]:
# Score the tuned XGBoost model on the same held-out test split.
model_evoluting("XGBoost", best_XGBoost_model, vactorize_x_test, y_test)

XGBoost -- Accuracy: 0.766 / Precision: 0.767 / Recall: 0.766


### Report the best performing Model

As we can see, the Random Forest with parameters {'max_depth': 16, 'n_estimators': 250} performs somewhat better than XGBoost on the test set (accuracy 0.788 vs. 0.766).