In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read Dataset
data = pd.read_csv("Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# No null value
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
# Dataset is balanced
data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
# maping the sentiment positive as 1 and negative as 0
data['sentiment'] = data['sentiment'].map({'positive' : 1, 'negative' : 0})
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [6]:
#  Data Cleaning
#  1. convert lower case
#  2. Remove HTML tags
#  3. Remove Punctuation
#  4. Remove stopwords
#  5. convert digit to word
#  6. stemming

In [7]:
# Convert everything into lowercase
def to_lower(text):
    return text.lower()

In [8]:
# Remove HTML tags
import re
def clean(text):
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned,'',text)

In [9]:
# Remove Punctuation
import string
def remove_puntuation(text):
    text_nopunct = ''.join([i for i in text if i not in string.punctuation])
    return text_nopunct

In [10]:
# Remove stopwords
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Biman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# convert digit to number
import inflect 
q = inflect.engine()
def convert_number(text):
    new_str = []
    for word in text:
        if word.isdigit(): 
            temp = q.number_to_words(word) 
            new_str.append(temp)
        else: 
            new_str.append(word)
    return new_str

In [12]:
# Stemming
from nltk.stem import PorterStemmer
def stem_txt(text):
    ps = PorterStemmer()
    return " ".join([ps.stem(w) for w in text])

In [13]:
data['review'] = data['review'].apply(to_lower)
data['review'] = data['review'].apply(clean)
data['review'] = data['review'].apply(lambda x: remove_puntuation(x))
data['review'] = data['review'].apply(remove_stopwords)
data['review'] = data['review'].apply(convert_number)
data['review'] = data['review'].apply(stem_txt)

In [14]:
data['review'][0]

'one review mention watch one oz episod youll hook right exactli happen meth first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom

In [15]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch one oz episod youll h...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


#### create TF-IDF

In [17]:
X = data.iloc[:, 0].values
y = data.iloc[:, 1:].values

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(strip_accents=None, lowercase=False, use_idf=True, norm='l2',smooth_idf=True)

In [19]:
tfidf

TfidfVectorizer(lowercase=False)

In [20]:
X = tfidf.fit_transform(X)

In [21]:
X.shape, y.shape

((50000, 180412), (50000, 1))

##### train test split

In [23]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2,random_state=56)

#### create model

In [25]:
from sklearn.linear_model import LogisticRegressionCV
clf=LogisticRegressionCV(cv=5,
                        scoring='accuracy',
                        random_state=0,
                        n_jobs=-1,
                        verbose=3,
                        max_iter=300).fit(train_X,train_y)

  return f(**kwargs)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.7min remaining:  5.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.8min finished


In [26]:
y_predict = clf.predict(test_X)

#### Model Evaluate

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [28]:
cm=confusion_matrix(test_y,y_predict)
cm

array([[4362,  595],
       [ 470, 4573]], dtype=int64)

In [29]:
accuracy_score(test_y,y_predict)

0.8935

In [30]:
print(classification_report(test_y,y_predict))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4957
           1       0.88      0.91      0.90      5043

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



## Random Forest

#### Model Creation

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rand_clf = RandomForestClassifier(random_state=6)
rand_clf.fit(train_X,train_y)

  rand_clf.fit(train_X,train_y)


RandomForestClassifier(random_state=6)

In [35]:
pred_y = rand_clf.predict(test_X)

#### Model Evaluate

In [36]:
cm=confusion_matrix(test_y,pred_y)
cm

array([[4197,  760],
       [ 777, 4266]], dtype=int64)

In [37]:
accuracy_score(test_y,pred_y)

0.8463

In [38]:
print(classification_report(test_y,pred_y))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      4957
           1       0.85      0.85      0.85      5043

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



#### Parameter Tunning

In [39]:
# we are tuning three hyperparameters right now, we are passing the different values for both parameters
grid_param = {
    "n_estimators" : [90,100,115,130],
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features' : ['auto','log2']
}

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
grid_search = GridSearchCV(estimator=rand_clf,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [None]:
grid_search.fit(train_X,train_y)

Fitting 5 folds for each of 20736 candidates, totalling 103680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 25.3min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 38.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 48.3min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed: 59.7min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 69.7min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 79.0min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 96.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

In [None]:
#let's see the best parameters as per our grid search
grid_search.best_params_