# Movie Review Classification

Goal it to check whether movie review is positive/negative using dataset.

In [100]:
# Import libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix,classification_report


In [101]:
# Load the dataset

data = pd.read_csv("/content/moviereviews.tsv",sep="\t")
data.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [103]:
data['review'][1]


'some talented actresses are blessed with a demonstrated wide acting range while others , almost as gifted , have more limited types of parts for which they are suitable . \r\nas was amply evident after basic instinct , sharon stone can play sensual roles with great abandon . \r\nrejecting her natural abilities , she has spent the rest of her entire career trying with little success to play against type . \r\ngloria is her latest disaster . \r\nbabe ruth didn\'t quit baseball after one season to play football in a quixotic quest to prove his athletic dexterity , and neither should stone reject what she does best . \r\njaneane garofalo , for example , is no less wonderful an actress because she could have never pulled off stone\'s part in basic instinct ; neither is stone any less talented because she couldn\'t do garofalo\'s comedic roles . \r\ngloria , directed by respected director sidney lumet and adapted by steve antin from the 1980 screenplay by john cassavetes , was not screened 

In [104]:
# Check for missing values

data.isnull().sum()

Unnamed: 0,0
label,0
review,35


In [105]:
# remove missing values

data.dropna(inplace=True)
# remove permenant


In [106]:

data.isnull().sum()

Unnamed: 0,0
label,0
review,0


In [107]:
# Example to check empty sting

empty = '  '
empty.isspace()

True

In [108]:
blanks = []

#(index,label,review text)

for i,lb,rb in data.itertuples():

    if rb.isspace():
        blanks.append(i)

print(len(blanks))

# 27 places where reviews are empty string



27


In [109]:
# drop empty reviews.

data.drop(blanks,inplace=True)


In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1938 non-null   object
 1   review  1938 non-null   object
dtypes: object(2)
memory usage: 45.4+ KB


In [111]:
X = data['review']
y = data['label']

In [112]:
# divide data in train and test

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,
                                                 random_state=42)



In [113]:
X_train.shape

(1356,)

In [114]:
X_test.shape

(582,)

In [115]:
y_train.shape

(1356,)

In [116]:
y_test.shape

(582,)

In [117]:
# build a pipeline

pipeline= Pipeline([("tfidf",TfidfVectorizer()),
                    ("model",LinearSVC())])
pipeline.fit(X_train,y_train)

In [118]:
pred = pipeline.predict(X_test)
pred

array(['neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg',
       'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg',
       'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos',
       'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'neg',
       'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos',
       'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg',
       'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos',
       'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos',
       'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos',
       'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos',
       'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos',
       'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos',
       'pos', 'pos',

In [119]:
# confusion matrix

cm = confusion_matrix(y_test,pred)
cm

array([[235,  47],
       [ 41, 259]])

In [120]:
report= classification_report(y_test,pred,output_dict=True)
report

{'neg': {'precision': 0.8514492753623188,
  'recall': 0.8333333333333334,
  'f1-score': 0.8422939068100358,
  'support': 282.0},
 'pos': {'precision': 0.8464052287581699,
  'recall': 0.8633333333333333,
  'f1-score': 0.8547854785478548,
  'support': 300.0},
 'accuracy': 0.8487972508591065,
 'macro avg': {'precision': 0.8489272520602444,
  'recall': 0.8483333333333334,
  'f1-score': 0.8485396926789452,
  'support': 582.0},
 'weighted avg': {'precision': 0.8488492513395617,
  'recall': 0.8487972508591065,
  'f1-score': 0.8487328613140662,
  'support': 582.0}}

In [121]:
report= pd.DataFrame(report).transpose()
report

Unnamed: 0,precision,recall,f1-score,support
neg,0.851449,0.833333,0.842294,282.0
pos,0.846405,0.863333,0.854785,300.0
accuracy,0.848797,0.848797,0.848797,0.848797
macro avg,0.848927,0.848333,0.84854,582.0
weighted avg,0.848849,0.848797,0.848733,582.0


In [122]:
# LinearSVC model gives 84% accuracy

review="Singham Again is a must-watch for fans of the action genre and the cop universe that Rohit Shetty has so meticulously built. It’s unapologetically loud, patriotic, and larger-than-life, offering fans everything they expect and more. Despite a few predictable moments, the film’s sheer energy and visual spectacle make up for it, delivering a thoroughly entertaining experience."
review

'Singham Again is a must-watch for fans of the action genre and the cop universe that Rohit Shetty has so meticulously built. It’s unapologetically loud, patriotic, and larger-than-life, offering fans everything they expect and more. Despite a few predictable moments, the film’s sheer energy and visual spectacle make up for it, delivering a thoroughly entertaining experience.'

In [123]:
pipeline.predict([review])

array(['pos'], dtype=object)

In [124]:
review="Last year I loved the delightful indie zombie Christmas musical Anna and the Apocalypse. It was creative, funny and sweet. It even made Best Films of 2018. So when I saw a trailer for The Dead Don’t Die I got very excited. The trailer was hilarious, and I love the cast. My hope was it was going to be a Wes Anderson meets Zombieland film. Unfortunately I was very disappointed. The Dead Don’t Die was an unfunny, self-indulgent, frustrating experience. The cast is woefully wasted, and they strain for the few laughs the script offers. The metaphors are also rammed in to the ground by narrators and characters breaking the 4th wall. It is my first film from director Jim Jarmusch but the critics at Cannes and other places don’t seem to be enjoying it either, so it appears to be a just a big miss. It’s frustrating because it had so much potential and it all falls flat."
review

'Last year I loved the delightful indie zombie Christmas musical Anna and the Apocalypse. It was creative, funny and sweet. It even made Best Films of 2018. So when I saw a trailer for The Dead Don’t Die I got very excited. The trailer was hilarious, and I love the cast. My hope was it was going to be a Wes Anderson meets Zombieland film. Unfortunately I was very disappointed. The Dead Don’t Die was an unfunny, self-indulgent, frustrating experience. The cast is woefully wasted, and they strain for the few laughs the script offers. The metaphors are also rammed in to the ground by narrators and characters breaking the 4th wall. It is my first film from director Jim Jarmusch but the critics at Cannes and other places don’t seem to be enjoying it either, so it appears to be a just a big miss. It’s frustrating because it had so much potential and it all falls flat.'

In [125]:
pipeline.predict([review])

array(['neg'], dtype=object)