# Sentiment Analysis of IMDB Reviews

Using scikit-learn to predict the sentiment (positive or negative) of IMDB movie reviews.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Read the raw IMDB reviews CSV and print a structural summary
# (row count, column names, dtypes, non-null counts).
csv_path = './IMDB Dataset.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# Capitalise the column names by mapping old name -> new name rather than by
# position, so a change in the CSV's column order cannot silently swap the
# review and sentiment labels. Re-running the cell is a no-op because
# `rename` ignores keys that are no longer present.
df = df.rename(columns={'review': 'Review', 'sentiment': 'Sentiment'})

In [7]:
# Preview the first rows to confirm the renamed columns and the label values.
df.head()

Unnamed: 0,Review,Sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Prep Data

In [8]:
# Features are the raw review text; the target is the sentiment label.
X, y = df['Review'], df['Sentiment']

In [9]:
# Hold out 33% of the reviews for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Inspect the training reviews (pandas truncates the display of the Series).
X_train

23990    Randolph Scott is heading into Albuquerque to ...
8729     I like this movie cause it has a good approach...
3451     Well don't expect anything deep an meaningful....
2628     This really should deserve a "O" rating, or ev...
38352    Dwight Frye steals the show in this one as a f...
                               ...                        
11284    `Shadow Magic' recaptures the joy and amazemen...
44732    I found this movie to be quite enjoyable and f...
38158    Avoid this one! It is a terrible movie. So wha...
860      This production was quite a surprise for me. I...
15795    This is a decent movie. Although little bit sh...
Name: Review, Length: 33500, dtype: object

In [11]:
# Bag-of-words features: the vocabulary is learned from the training reviews
# only (fit_transform), and the test reviews are then encoded with that same
# fitted vectorizer (transform) so no test-set information reaches the fit.
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)

test_x_vectors = vectorizer.transform(X_test)

In [12]:
# Print the first training row of the vectorised matrix; each output line is
# a (row, column) index pair followed by the stored count.
print(X_train_vectors[0])

  (0, 61653)	2
  (0, 67061)	12
  (0, 39762)	10
  (0, 34841)	1
  (0, 39385)	2
  (0, 2912)	1
  (0, 77357)	13
  (0, 75305)	1
  (0, 40721)	2
  (0, 84561)	4
  (0, 35829)	4
  (0, 79753)	7
  (0, 36711)	3
  (0, 54198)	2
  (0, 76482)	27
  (0, 83352)	2
  (0, 76590)	4
  (0, 72266)	1
  (0, 35137)	1
  (0, 81042)	1
  (0, 26142)	1
  (0, 76792)	2
  (0, 76634)	1
  (0, 4748)	2
  (0, 53207)	3
  :	:
  (0, 70451)	1
  (0, 28167)	1
  (0, 71058)	1
  (0, 52476)	1
  (0, 66456)	1
  (0, 7636)	1
  (0, 21159)	1
  (0, 2136)	1
  (0, 36698)	1
  (0, 17439)	1
  (0, 60517)	1
  (0, 6046)	1
  (0, 32520)	1
  (0, 11502)	1
  (0, 42050)	1
  (0, 67438)	1
  (0, 31444)	1
  (0, 45141)	1
  (0, 65406)	1
  (0, 12595)	1
  (0, 66664)	1
  (0, 83925)	1
  (0, 53303)	1
  (0, 37895)	1
  (0, 27699)	1


## Classification

Comparing different classifiers trained on the bag-of-words vectors.

#### Linear SVM

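-> `X_train_vectors` is large, so `SVC(kernel='linear')` takes too long to train; the cells below are left commented out. (`sklearn.svm.LinearSVC` is the variant designed to scale to large sparse inputs.)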

In [13]:
#from sklearn import svm

#df_svm = svm.SVC(kernel='linear')

#df_svm.fit(X_train_vectors, y_train)


In [14]:

#df_svm.predict(test_x_vectors)

#### Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree fitting permutes features at each split, so without a fixed
# seed the fitted tree (and every score reported for it below) can change
# from run to run. Use the same seed as the train/test split so that
# Restart & Run All reproduces the results.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(X_train_vectors, y_train)

In [16]:
# Sentiment labels predicted by the decision tree for the held-out reviews.
clf_dec.predict(test_x_vectors)

array(['positive', 'positive', 'negative', ..., 'positive', 'positive',
       'negative'], dtype=object)

#### Naive Bayes

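-> `X_train_vectors` is large, and `GaussianNB` needs it converted to a dense array (`.toarray()`), which uses too much memory; the cells below are left commented out. (`sklearn.naive_bayes.MultinomialNB` accepts the sparse count matrix directly.)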

In [17]:
#from sklearn.naive_bayes import GaussianNB

#clf_gnb = GaussianNB()
#clf_gnb.fit(X_train_vectors.toarray(), y_train)

In [18]:
#clf_gnb.predict(test_x_vectors)

#### Logistic Regression with a Standard Scaler

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale the count features, then fit a logistic regression on them.
# with_mean=False skips centering, which is required for sparse input.
clf_log = make_pipeline(
    StandardScaler(with_mean=False),
    LogisticRegression(),
)
clf_log.fit(X=X_train_vectors, y=y_train)

In [20]:
# Sentiment labels predicted by the logistic-regression pipeline for the held-out reviews.
clf_log.predict(test_x_vectors)

array(['negative', 'positive', 'negative', ..., 'positive', 'positive',
       'negative'], dtype=object)

## Evaluation

In [21]:
# Mean accuracy on the held-out split, printed in order:
# decision tree first, then the logistic-regression pipeline.
for fitted_model in (clf_dec, clf_log):
    print(fitted_model.score(test_x_vectors, y_test))


0.7266666666666667
0.8635151515151516


In [25]:
# Per-class F1 for the decision tree; average=None returns one score per
# label, in the order given by `labels` (positive, then negative).
from sklearn.metrics import f1_score

f1_score(
    y_true=y_test,
    y_pred=clf_dec.predict(test_x_vectors),
    labels=['positive', 'negative'],
    average=None,
)


array([0.72984306, 0.72341469])

In [26]:
# Per-class F1 for the logistic-regression pipeline, in the same label order
# (positive, then negative).
f1_score(
    y_true=y_test,
    y_pred=clf_log.predict(test_x_vectors),
    labels=['positive', 'negative'],
    average=None,
)

array([0.86508507, 0.86190827])

We'll use the logistic regression model with a standard scaler, since it produced better accuracy and F1 scores on the held-out test set than the decision tree.

In [28]:
# Hand-written reviews used as a quick sanity check of the chosen model.
test_set = [
    'I felt like this was worth the wait',
    'I genuinely wasted my time watching this',
    'Why did he take that shot. Is that a shot he normally practices',
]
# Encode with the already-fitted vectorizer so the columns match the training features.
new_test = vectorizer.transform(test_set)

clf_log.predict(new_test)

array(['positive', 'negative', 'negative'], dtype=object)