In [41]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon package into the local nltk_data directory.
nltk.download("vader_lexicon")



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [42]:
# Read the tab-separated review file into a DataFrame and preview the first rows.
# NOTE(review): the absolute /content path presumably points at a Colab upload
# location; adjust it when running elsewhere.

data = pd.read_csv(
    "/content/moviereviews.tsv",
    sep="\t",
)
data.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [43]:
# Summarise the dtype and non-null count of every column to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [44]:
# Count the missing values in each column.
data.isna().sum()

Unnamed: 0,0
label,0
review,35


In [45]:
# Count how many reviews carry each label to check the class balance.
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
neg,1000
pos,1000


In [46]:
# Remove, in place, every row that has a missing value in any column.

data.dropna(axis="index", how="any", inplace=True)

In [47]:
# Collect the index labels of reviews that contain no visible text
# (empty strings or strings made up only of whitespace).
# Only the `review` column is iterated, so this cell keeps working no matter
# how many columns `data` has -- unpacking data.itertuples() into exactly three
# names fails as soon as another column (e.g. `scores`) has been added.
blanks = [
    idx
    for idx, rv in data["review"].items()
    # Non-string entries are skipped rather than treated as blank.
    if isinstance(rv, str) and not rv.strip()
]

len(blanks)


27

In [48]:
# Drop the blank reviews whose index labels were collected in `blanks`.
# `data` is rebound instead of mutated in place, and errors="ignore" skips
# labels that are no longer present, so re-running this cell is a no-op
# instead of raising KeyError.

data = data.drop(index=blanks, errors="ignore")


In [49]:
# Count the reviews per label again, now on the cleaned frame.
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
neg,969
pos,969


In [50]:
# Create the VADER analyzer that the scoring cells below share.
sid = SentimentIntensityAnalyzer()
sid

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x7855001d8f40>

In [51]:
# Score every review with VADER; each `scores` entry is the dict returned by
# polarity_scores (keys: neg, neu, pos, compound).
data["scores"] = data["review"].apply(sid.polarity_scores)
data.head()

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [52]:
# Pull the `compound` value out of each score dict into its own column.
data["compound"] = [entry["compound"] for entry in data["scores"]]
data.head()

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484


In [53]:
# Show the full score dict for the review with index label 1.
data.loc[1, "scores"]

{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'compound': -0.8618}

In [54]:
# Turn the compound score into a predicted label: "pos" when the score is
# greater than or equal to zero, "neg" otherwise.
non_negative = data["compound"] >= 0
data["comp_score"] = non_negative.map({True: "pos", False: "neg"})
data.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [55]:
from sklearn.metrics import confusion_matrix,classification_report

In [56]:
# Compare the true labels with the VADER-derived labels
# (rows: true label, columns: predicted label).
confusion_matrix(y_true=data["label"], y_pred=data["comp_score"])

array([[427, 542],
       [164, 805]])

In [57]:
# Per-class precision / recall / F1 as a nested dict, so that it can be
# turned into a table afterwards.
report = classification_report(
    y_true=data["label"],
    y_pred=data["comp_score"],
    output_dict=True,
)
report

{'neg': {'precision': 0.7225042301184433,
  'recall': 0.4406604747162023,
  'f1-score': 0.5474358974358975,
  'support': 969.0},
 'pos': {'precision': 0.5976243504083147,
  'recall': 0.8307533539731682,
  'f1-score': 0.6951640759930915,
  'support': 969.0},
 'accuracy': 0.6357069143446853,
 'macro avg': {'precision': 0.6600642902633791,
  'recall': 0.6357069143446852,
  'f1-score': 0.6212999867144945,
  'support': 1938.0},
 'weighted avg': {'precision': 0.660064290263379,
  'recall': 0.6357069143446853,
  'f1-score': 0.6212999867144945,
  'support': 1938.0}}

In [58]:
# Turn the classification report into a table with one row per class/average
# and one column per metric.
# `report` is rebound from dict to DataFrame here, so guard on the type:
# converting a second time would transpose the table back (metrics as rows),
# whereas with the guard a re-run of this cell is a no-op.
if isinstance(report, dict):
    report = pd.DataFrame(report).transpose()
report

Unnamed: 0,precision,recall,f1-score,support
neg,0.722504,0.44066,0.547436,969.0
pos,0.597624,0.830753,0.695164,969.0
accuracy,0.635707,0.635707,0.635707,0.635707
macro avg,0.660064,0.635707,0.6213,1938.0
weighted avg,0.660064,0.635707,0.6213,1938.0


In [59]:
# Recall on negative reviews is poor (0.44): more than half of them are scored as positive.

# Overall accuracy is also only about 64%.