In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
# Let's do some basics import and CountVectorizer so to call the transform() function on one or more documents as needed to encode each as a vector
from os import path
from pandas import DataFrame
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import re

In [3]:
# Let's import some NLP modules such as PorterStemmer, SnowballStemmer, WordNetLemmetizer
# download vader_lexicon, and stopwords

import nltk
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer    # Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Let's import some visualization modules

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
import matplotlib.colors

In [5]:
import wordcloud   # Sentiment-based Word Clouds
from wordcloud import WordCloud, STOPWORDS 
from PIL import Image

In [6]:
# Change and set directory to kaggle/input

os.chdir('/kaggle/input')
os.getcwd()

'/kaggle/input'

In [7]:
# Let's read IMDB Dataset and store it into a dataframe "df"

df=pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',header=0,error_bad_lines=True,encoding='utf8')

df.dtypes

review       object
sentiment    object
dtype: object

In [8]:
# Let's look at our table
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# Let's define a function "sc" to run sentimental analysis on the text "review" and return the compound value (-1 to +1)
def sc(x):
    score=SentimentIntensityAnalyzer().polarity_scores(x)
    return score['compound']

In [10]:
## Let's apply the compound score of our sentimental analysis to "review" storing the results in a new column "SentScore" through 
# map function

df["SentScore"]=df["review"].map(sc)

In [11]:
# Let's look at our updated table 
df.head()

Unnamed: 0,review,sentiment,SentScore
0,One of the other reviewers has mentioned that ...,positive,-0.9951
1,A wonderful little production. <br /><br />The...,positive,0.9641
2,I thought this was a wonderful way to spend ti...,positive,0.9605
3,Basically there's a family where a little boy ...,negative,-0.9213
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,0.9744


In [12]:
# Let's define a function "sc" to run sentimental analysis on the text "review" and return the compound value (-1 to +1)


def sca(lb):
    if lb >= .6:
        return "Very Good"
    elif (lb > .2) and (lb < .6):
        return "Good"
    elif (lb > -.2) and (lb < .2):
        return "Average"
    elif (lb > -.6) and (lb < -.2):
        return "Disappointing"
     
    else:
        return "Regrettable"

In [13]:
# Now we insert a column to indicate the class of the review ("Very Good" , "Good", "Average", "Disappointing", "Regrettable")

df["SentClass"]=df["SentScore"].map(sca)

In [14]:
# Let's check our updated table

df.head(15)

Unnamed: 0,review,sentiment,SentScore,SentClass
0,One of the other reviewers has mentioned that ...,positive,-0.9951,Regrettable
1,A wonderful little production. <br /><br />The...,positive,0.9641,Very Good
2,I thought this was a wonderful way to spend ti...,positive,0.9605,Very Good
3,Basically there's a family where a little boy ...,negative,-0.9213,Regrettable
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,0.9744,Very Good
5,"Probably my all-time favorite movie, a story o...",positive,0.9828,Very Good
6,I sure would like to see a resurrection of a u...,positive,0.9022,Very Good
7,"This show was an amazing, fresh & innovative i...",negative,0.8596,Very Good
8,Encouraged by the positive comments about this...,negative,0.2362,Good
9,If you like original gut wrenching laughter yo...,positive,0.9149,Very Good


In [15]:
# We define a function for which relatively to the "sentiment" column, positive=1 | negative=0

def num(lb):
    if lb == 'positive':
        return 1   
    else:
        return 0

In [16]:
# let's create a new column "sentiment_bin" applying the function above using .map

df["sentiment_bin"]=df["sentiment"].map(num)

In [17]:
# Let's check the updated table

df.head(15)

Unnamed: 0,review,sentiment,SentScore,SentClass,sentiment_bin
0,One of the other reviewers has mentioned that ...,positive,-0.9951,Regrettable,1
1,A wonderful little production. <br /><br />The...,positive,0.9641,Very Good,1
2,I thought this was a wonderful way to spend ti...,positive,0.9605,Very Good,1
3,Basically there's a family where a little boy ...,negative,-0.9213,Regrettable,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,0.9744,Very Good,1
5,"Probably my all-time favorite movie, a story o...",positive,0.9828,Very Good,1
6,I sure would like to see a resurrection of a u...,positive,0.9022,Very Good,1
7,"This show was an amazing, fresh & innovative i...",negative,0.8596,Very Good,0
8,Encouraged by the positive comments about this...,negative,0.2362,Good,0
9,If you like original gut wrenching laughter yo...,positive,0.9149,Very Good,1


In [18]:
# Similarly to what we did above, for the SentScore results (-1 to +1) we define a function for which a value >= 0 equals 1(positive), else 0(negative)

def numscore(lb):
    if lb >= 0:
        return 1     
    else:
        return 0

In [19]:
# let's create a new column "SentScore_bin" applying the function above using .map

df["SentScore_bin"]=df["SentScore"].map(numscore)

In [20]:
# Let's check the updated table

df.head(15)

Unnamed: 0,review,sentiment,SentScore,SentClass,sentiment_bin,SentScore_bin
0,One of the other reviewers has mentioned that ...,positive,-0.9951,Regrettable,1,0
1,A wonderful little production. <br /><br />The...,positive,0.9641,Very Good,1,1
2,I thought this was a wonderful way to spend ti...,positive,0.9605,Very Good,1,1
3,Basically there's a family where a little boy ...,negative,-0.9213,Regrettable,0,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,0.9744,Very Good,1,1
5,"Probably my all-time favorite movie, a story o...",positive,0.9828,Very Good,1,1
6,I sure would like to see a resurrection of a u...,positive,0.9022,Very Good,1,1
7,"This show was an amazing, fresh & innovative i...",negative,0.8596,Very Good,0,1
8,Encouraged by the positive comments about this...,negative,0.2362,Good,0,1
9,If you like original gut wrenching laughter yo...,positive,0.9149,Very Good,1,1


In [None]:
# Let's do now some TEXT ADJUSTMENTS / CLEANING

In [21]:
# Make text lower case
df["review"]  = df["review"].str.lower()

In [22]:
# Remove digits from text
def Remove_digit(text):
    result = re.sub(r"\d", "", text)
    return result

In [23]:
# Remove HTML from text
def remove_html(text):
    result = re.sub(r'<.*?>','',text) # Find out anything that is in between < & > symbol 
    return result

In [24]:
# Remove special text characters
def remove_spl(text):
    result = re.sub(r'\W',' ',text) 
    return result

In [25]:
# Link words with similar meaning to one word (in context)
def lem_word(text):
    result= WordNetLemmatizer().lemmatize(text)
    return result

In [26]:
# Let's apply all of the above functions to the text column "review"

df["review"]  = df["review"].apply(Remove_digit)
df["review"]  = df["review"].apply(remove_html)
df["review"]  = df["review"].apply(remove_spl)
df["review"]  = df["review"].apply(lem_word)

In [27]:
# Let's check the updated table

df.head()

Unnamed: 0,review,sentiment,SentScore,SentClass,sentiment_bin,SentScore_bin
0,one of the other reviewers has mentioned that ...,positive,-0.9951,Regrettable,1,0
1,a wonderful little production the filming tec...,positive,0.9641,Very Good,1,1
2,i thought this was a wonderful way to spend ti...,positive,0.9605,Very Good,1,1
3,basically there s a family where a little boy ...,negative,-0.9213,Regrettable,0,0
4,petter mattei s love in the time of money is...,positive,0.9744,Very Good,1,1


In [28]:
# Let's store the adjusted text to the object 'corpus1' and transform it into a List
corpus1=df['review'].tolist()

In [89]:
# Let's create an object "corpus" that includes the first 1000 values of the list 'corpus1', otherwise the machine could take too long to run the command

corpus=corpus1[ :1000]

In [30]:
# Count Vectorisation
# I have defined ngram range to be unigrams and bigrams (it starts with one word and goes up to two when vectorizing)

from sklearn.feature_extraction import text

cv = text.CountVectorizer(input=corpus,ngram_range=(1,2),stop_words='english')
matrix = cv.fit_transform(corpus)

# I am converting the matrix_cv into a dataframe 
corpus2 = pd.DataFrame(matrix.toarray(), columns=cv.get_feature_names())

In [78]:
# Let's take a snapshot at the data
corpus2.head()

Unnamed: 0,_fargo_,_fargo_ just,_inspire_,_inspire_ audience,aaargh,aaargh wes,aaliyah,aaliyah soul,aamir,aamir khan,...,zulu,zulu simply,zwick,zwick shame,zzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzz imdb,élan,élan unique,ísnt,ísnt entertaining
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# One thing to notice here is the dimension of this data
# We have 1000 documents (rows) which is consistent with the selected amount of rows of our list corpus 
# and 110012 columns which is humangous. We have a created a giant matrix

# It is noticeable that many features contain 0, since not all words willbe present across documents of the corpus(2)

corpus2.shape

(1000, 110012)

In [32]:
# TF-IDF, Term Frequency and Inverse Document Freq
# We run a TF-IDF representation on the same corpus, same like before and also this time removing the english stop_words


tf = text.TfidfVectorizer(input=corpus, ngram_range=(1,2),stop_words='english')

matrix1 = tf.fit_transform(corpus)

# I am converting the matrix1 into a dataframe X
X = pd.DataFrame(matrix1.toarray(), columns=tf.get_feature_names())

In [33]:
# Let's take a look at our matrix X

X.head()

Unnamed: 0,_fargo_,_fargo_ just,_inspire_,_inspire_ audience,aaargh,aaargh wes,aaliyah,aaliyah soul,aamir,aamir khan,...,zulu,zulu simply,zwick,zwick shame,zzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzz imdb,élan,élan unique,ísnt,ísnt entertaining
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Let's set our y to be the first 1000 values of the column SentScore_bin (based on our sentiment analysis)

y = df['SentScore_bin'][:1000].values

In [35]:
# Let's take a look at the array 'y'
print(y)

[0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 1 1 0 0
 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1
 1 1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 1 1 1 1 0 0 1 1 0 1 1 0
 0 1 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 0 1 0 1
 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1 0 0 1 0 1 0 1 0 1 0 1
 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1
 1 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1
 1 1 1 0 1 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1 0
 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1
 1 0 0 1 1 0 0 0 1 1 0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 0 0 1
 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 0
 0 1 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 1 1 0
 0 1 1 1 0 0 1 0 0 1 1 1 

In [36]:
# We are going to try and run the RandomForest Classifier on X= vectorized matrix and y= SentScore_bin
# using the fit transformation of the tf - idf matrix to array =X

# Let's split X and y in training and testing data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=23)

In [37]:
# Let's set the RandomForestClassifier and set the parameters
# Let's fit the model on X and y training data

from sklearn.ensemble import RandomForestClassifier
text_classifier=RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.3, min_samples_leaf=4, min_samples_split=9, n_estimators=100)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
# Let's run the prediction on the X test data and store them into the object 'predictions'

predictions = text_classifier.predict(X_test)

In [39]:
# We can see that running the RANDOM FOREST CLASSIFIER we get an accuracy score of 68%. NOT amazing!

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 
print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test,predictions))  
print(accuracy_score(y_test, predictions))

[[ 32  45]
 [ 19 104]]
              precision    recall  f1-score   support

           0       0.63      0.42      0.50        77
           1       0.70      0.85      0.76       123

    accuracy                           0.68       200
   macro avg       0.66      0.63      0.63       200
weighted avg       0.67      0.68      0.66       200

0.68


In [None]:
# Let's try LOGISTIC REGRESSION

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
#training the model
lr=LogisticRegression(C=1.0,class_weight=None,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=100,
multi_class='auto',n_jobs=None,penalty='l2',random_state=23,solver='lbfgs',tol=0.0001,verbose=0,warm_start=False)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(X_train,y_train)
print(lr_tfidf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=23, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [42]:
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(X_test)
print(lr_tfidf_predict)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [43]:
# Accuracy score running a LOGISTIC REGRESSION is pretty low.....only 61.5%!

#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.615


In [44]:
#Classification report for tfidf features
lr_tfidf_report=classification_report(y_test,lr_tfidf_predict,target_names=['0','1'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        77
           1       0.61      1.00      0.76       123

    accuracy                           0.61       200
   macro avg       0.31      0.50      0.38       200
weighted avg       0.38      0.61      0.47       200



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# GRADIENT BOOSTING CLASSIFIER

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
clf=GradientBoostingClassifier(n_estimators=80,random_state=23)

In [47]:
clf.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=80,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [48]:
clf.score(X_test,y_test)

0.725

In [49]:
from sklearn.model_selection import GridSearchCV
mod=GridSearchCV(clf,param_grid={'n_estimators': [80,100,120,140,160]})

In [50]:
mod.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=80,
                                                  n_iter_no_change=None,
       

In [51]:
mod.best_estimator_

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=80,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [52]:
clf=GradientBoostingClassifier(n_estimators=100,random_state=23)
clf.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [53]:
clf.score(X_test,y_test)

0.715

In [54]:
clf.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [55]:
feature_imp=pd.Series(clf.feature_importances_)
feature_imp.sort_values(ascending=False)

6343      0.067244
41101     0.061226
108616    0.056553
8108      0.043326
65082     0.035758
            ...   
73273     0.000000
73274     0.000000
73275     0.000000
73276     0.000000
0         0.000000
Length: 110012, dtype: float64

In [None]:
# Let's now repeat the operations setting the first 1000 values of our column 'sentiment_bin' as our "y"

In [67]:
y = df['sentiment_bin'][:1000].values

In [68]:
# We split again in training and test data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=23)

In [59]:
# RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
text_classifier=RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.3, min_samples_leaf=4, min_samples_split=9, n_estimators=100)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [60]:
predictions = text_classifier.predict(X_test)

In [61]:
# It seems to be more accurate with a score of 78%!

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 
print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test,predictions))  
print(accuracy_score(y_test, predictions))

[[76 27]
 [18 79]]
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       103
           1       0.75      0.81      0.78        97

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.77       200
weighted avg       0.78      0.78      0.77       200

0.775


In [62]:
# Let's try again LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
#training the model
lr=LogisticRegression(C=1.0,class_weight=None,dual=False,fit_intercept=True,intercept_scaling=1,l1_ratio=None,max_iter=100,
multi_class='auto',n_jobs=None,penalty='l2',random_state=23,solver='lbfgs',tol=0.0001,verbose=0,warm_start=False)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(X_train,y_train)
print(lr_tfidf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=23, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [63]:
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(X_test)
print(lr_tfidf_predict)

[0 1 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1
 0 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 1 0 0 0
 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0
 1 0 0 1 0 1 0 0 1 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1
 1 1 1 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0
 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0]


In [64]:
# Logistic regression with y=sentiment_bin gives us the highest accuracy----81%! Not bad

#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.805


In [65]:
#Classification report for tfidf features
lr_tfidf_report=classification_report(y_test,lr_tfidf_predict,target_names=['0','1'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

           0       0.87      0.73      0.79       103
           1       0.75      0.89      0.82        97

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.80       200
weighted avg       0.82      0.81      0.80       200



In [69]:
# Let's try with the GRADIENT BOOSTING CLASSIFIER

from sklearn.ensemble import GradientBoostingClassifier
clf=GradientBoostingClassifier(n_estimators=80,random_state=23)
clf.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=80,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [70]:
clf.score(X_test,y_test)

0.77

In [72]:
from sklearn.model_selection import GridSearchCV
mod=GridSearchCV(clf,param_grid={'n_estimators': [80,100]})

In [73]:
mod.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=80,
                                                  n_iter_no_change=None,
       

In [74]:
mod.best_estimator_

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [76]:
clf=GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
clf.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=23, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [77]:
clf.score(X_test,y_test)

0.765