# Reading Data

In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [166]:
#Load english tokenizer,tagger,parser and nera(named entity recognizer)

import spacy
nlp = spacy.load('en_core_web_sm')

In [167]:
data = pd.read_csv('/content/drive/MyDrive/TXTA PROJ/IMDb-sample.csv',header=0)

In [168]:
data.head(5)

Unnamed: 0,Index,URL,Text,Sentiment
0,3617,http://www.imdb.com/title/tt0210075/usercomments,Girlfight follows a project dwelling New York ...,POS
1,3671,http://www.imdb.com/title/tt0337640/usercomments,Hollywood North is an euphemism from the movie...,POS
2,3157,http://www.imdb.com/title/tt0303549/usercomments,That '70s Show is definitely the funniest show...,POS
3,660,http://www.imdb.com/title/tt0716825/usercomments,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,265,http://www.imdb.com/title/tt0182225/usercomments,"A series of random, seemingly insignificant th...",POS


In [169]:
data.drop(columns=['Index','URL'],axis=1,inplace=True)

In [170]:
data.head(5)

Unnamed: 0,Text,Sentiment
0,Girlfight follows a project dwelling New York ...,POS
1,Hollywood North is an euphemism from the movie...,POS
2,That '70s Show is definitely the funniest show...,POS
3,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,"A series of random, seemingly insignificant th...",POS


In [171]:
# Assign column names
columan_name = ['Review', 'Sentiment']
data.columns = columan_name

In [172]:
data.head()

Unnamed: 0,Review,Sentiment
0,Girlfight follows a project dwelling New York ...,POS
1,Hollywood North is an euphemism from the movie...,POS
2,That '70s Show is definitely the funniest show...,POS
3,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,"A series of random, seemingly insignificant th...",POS


In [173]:
data.shape
# 2000 rows (reviews), 2 columns (Sentiments)

(2000, 2)

In [174]:
data['Sentiment'].value_counts()

# 1346 positive reviews
# 1362 Negative reviews

POS    1000
NEG    1000
Name: Sentiment, dtype: int64

In [175]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     2000 non-null   object
 1   Sentiment  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [176]:
data['Sentiment']=data['Sentiment'].apply(lambda x: 1 if x == 'POS' else 0)


In [177]:
data.head(5)

Unnamed: 0,Review,Sentiment
0,Girlfight follows a project dwelling New York ...,1
1,Hollywood North is an euphemism from the movie...,1
2,That '70s Show is definitely the funniest show...,1
3,"9/10- 30 minutes of pure holiday terror. Okay,...",1
4,"A series of random, seemingly insignificant th...",1


In [178]:
# check for null values
data.isnull().sum()

# no null values in the data

Review       0
Sentiment    0
dtype: int64

# Cleaning Data

In [179]:
import string

In [180]:
punct = string.punctuation

In [181]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [182]:
from spacy.lang.en.stop_words import STOP_WORDS

In [183]:
stopwords = list(STOP_WORDS) # list of stopwords

In [184]:
# creating a function for data cleaning

In [185]:
def text_data_cleaning(sentence):
  doc = nlp(sentence)

  tokens = [] # list of tokens
  for token in doc:
    if token.lemma_ != "-PRON-":         # if not proper noun then lemma exist , so first lemma then lower case
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_               # if proper noun then no lemma exist , so only lower case
    tokens.append(temp)
 
  cleaned_tokens = []
  for token in tokens:
    if token not in stopwords and token not in punct:
      cleaned_tokens.append(token)
  return cleaned_tokens

In [186]:
# if root form of that word is not pronoun then it is going to convert that into lower form
# and if that word is a proper noun, then we are directly taking lower form, because there is no lemma for proper noun

In [187]:
#checking this functiom on some sample sentence
text_data_cleaning("Hello all, It's a beautiful day outside there!")
# stopwords and punctuations removed

['hello', 'beautiful', 'day', 'outside']

# Vectorization Feature Engineering (TF-IDF)

In [208]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [189]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
# tokenizer=text_data_cleaning, tokenization will be done according to this function

#  Train the model

In [204]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [205]:
x_train.shape, x_test.shape
# 1600 samples in training dataset and 400 in test dataset

((1600,), (400,))

In [206]:
x_train.head()

582     I'm sick of people whining about Ewoks! True, ...
159     The first step to getting off of that road tha...
1827    The Pallbearer is a disappointment and at time...
318     I first saw a track from this DVD at a hifi sh...
708     All in all a good film and better for the fact...
Name: Review, dtype: object

# Fit the x_train and y_train

In [211]:
from sklearn.model_selection import GridSearchCV

In [210]:
model = XGBClassifier()

In [212]:
# Hypertuning

param_grid = {
    'model__n_estimators': [10, 100, 500]}

In [213]:
# Grid Search CV

grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='roc_auc')

In [214]:
#Pipeline

Classifier = Pipeline([('tfidf',tfidf), ('clf',grid)])
# it will first do vectorization and then it will do classification

In [215]:
Classifier.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                                                      n_estimators=100,
                                                      n_jobs=1, nthread=None,
                                                      objective='binary:logistic',
                                                      random_state=0,

In [216]:
mean_score = grid.cv_results_["mean_test_score"][grid.best_index_]
std_score = grid.cv_results_["std_test_score"][grid.best_index_]

grid.best_params_, mean_score, std_score

print(f"Best parameters: {grid.best_params_}")
print(f"Mean CV score: {mean_score: .6f}")
print(f"Standard deviation of CV score: {std_score: .6f}")

Best parameters: {'model__n_estimators': 10}
Mean CV score:  0.943289
Standard deviation of CV score:  0.004326


In [197]:
# in this we don't need to prepare the dataset for testing(x_test)

# Predict the Test set results

In [217]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [218]:
y_pred = Classifier.predict(x_test)

In [219]:
# confusion_matrix
confusion_matrix(y_test, y_pred)

array([[154,  46],
       [ 18, 182]])

In [220]:
# classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.77      0.83       200
           1       0.80      0.91      0.85       200

    accuracy                           0.84       400
   macro avg       0.85      0.84      0.84       400
weighted avg       0.85      0.84      0.84       400



In [222]:
accuracy_score(y_test, y_pred)
# 84% accuracy

0.84