In [1]:
import pandas as pd
import numpy as np

# Read the tab-delimited movie reviews file into a DataFrame and print it.
df = pd.read_csv(
    'moviereviews.tsv',
    delimiter='\t',
)
print(df)

     label                                             review
0      neg  how do films like mouse hunt get into theatres...
1      neg  some talented actresses are blessed with a dem...
2      pos  this has been an extraordinary year for austra...
3      pos  according to hollywood movies made in last few...
4      neg  my first press screening of 1998 and already i...
...    ...                                                ...
1995   pos  i like movies with albert brooks , and i reall...
1996   pos  it might surprise some to know that joel and e...
1997   pos  the verdict : spine-chilling drama from horror...
1998   pos  i want to correct what i wrote in a former ret...
1999   pos  a couple of months ago , when i first download...

[2000 rows x 2 columns]


In [5]:
# Report the frame's dimensions, with a blank line before and after the report.
row_num, col_num = len(df.index), len(df.columns)
print()
print(f"The total number of rows in this movie reviews data frame: {row_num}")
print(f"The total number of columns in this movie reviews data frame: {col_num}")
print()


The total number of rows in this movie reviews data frame: 2000
The total number of columns in this movie reviews data frame: 2



In [6]:
# Boolean mask of the frame: True wherever a cell holds a missing value.
df.isna()

Unnamed: 0,label,review
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
1995,False,False
1996,False,False
1997,False,False
1998,False,False


In [7]:
# Number of missing values in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [23]:
# The earlier df.isnull().sum() output reported 35 missing values in the
# 'review' column; print the per-column missing-value counts.
print(df.isna().sum())

label     0
review    0
dtype: int64


In [24]:
# Remove every row that has a missing value in any column (df is modified
# in place), then print the per-column missing-value counts that remain.
df.dropna(axis=0, how='any', inplace=True)
print(df.isna().sum())

In [25]:
# Collect the index labels of reviews that consist only of whitespace.
#
# Only the 'review' column is iterated, so this cell does not depend on how
# many columns df has. Unpacking df.itertuples() into exactly three names
# raises ValueError once later cells have added columns to df, which made
# the cell fail when re-run.
# isinstance() skips non-string values (e.g. NaN) that have no .isspace().
blanks = [
    index
    for index, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]

print(blanks)

[]


In [26]:
# Drop the rows whose index labels were collected in `blanks` (df is modified in place).
df.drop(index=blanks, inplace=True)

In [27]:
# Print the per-column missing-value counts after the clean-up steps.
print(df.isna().sum())

label     0
review    0
dtype: int64


In [22]:
# df.dropna(inplace=True) removes the rows that contain null (NaN) values, while df.drop(blanks, inplace=True)
# removes the rows whose review is a whitespace-only string; both are applied to the data frame called 'df'.

In [28]:
# How many rows carry each distinct value of the 'label' column.
df.loc[:, 'label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [29]:
# Keep the 'label' column of 'df' as its own Series, then print how many
# rows there are for each label value.
labels = df.loc[:, 'label']
print(labels.value_counts())

neg    969
pos    969
Name: label, dtype: int64


In [31]:
# Count the labels once and read both classes from that single result.
# .get(..., 0) reports 0 when a class has no rows, where indexing the
# counts with ['neg'] / ['pos'] would raise KeyError.
label_counts = labels.value_counts()
neg_count = label_counts.get('neg', 0)
pos_count = label_counts.get('pos', 0)
print("The total number of negative reviews in the movie reviews data frame: " + str(neg_count))
print("The total number of positive reviews in the movie reviews data frame: " + str(pos_count))

The total number of negative reviews in the movie reviews data frame: 969
The total number of positive reviews in the movie reviews data frame: 969


In [34]:
# Sentiment analysis with NLTK's VADER analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One analyzer instance is reused for every review.
sia = SentimentIntensityAnalyzer()

# 'scores' holds the dict returned by polarity_scores() for each review;
# 'compound' pulls the 'compound' entry out of that dict.
df['scores'] = df['review'].apply(sia.polarity_scores)
df['compound'] = df['scores'].apply(lambda polarity: polarity['compound'])

print(df)

     label                                             review  \
0      neg  how do films like mouse hunt get into theatres...   
1      neg  some talented actresses are blessed with a dem...   
2      pos  this has been an extraordinary year for austra...   
3      pos  according to hollywood movies made in last few...   
4      neg  my first press screening of 1998 and already i...   
...    ...                                                ...   
1995   pos  i like movies with albert brooks , and i reall...   
1996   pos  it might surprise some to know that joel and e...   
1997   pos  the verdict : spine-chilling drama from horror...   
1998   pos  i want to correct what i wrote in a former ret...   
1999   pos  a couple of months ago , when i first download...   

                                                 scores  compound  
0     {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...   -0.9125  
1     {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...   -0.8618  
2     {'neg': 0

In [37]:
def _label_from_compound(score):
    """Return "neg" for a compound score below zero, otherwise "pos"."""
    if score < 0:
        return "neg"
    return "pos"


# Turn each compound score into a predicted label.
df['predicted_sentiment_label'] = df['compound'].apply(_label_from_compound)

In [38]:
# Print the first five rows, including the newly added columns.
print(df.head(n=5))

  label                                             review  \
0   neg  how do films like mouse hunt get into theatres...   
1   neg  some talented actresses are blessed with a dem...   
2   pos  this has been an extraordinary year for austra...   
3   pos  according to hollywood movies made in last few...   
4   neg  my first press screening of 1998 and already i...   

                                              scores  compound  \
0  {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...   -0.9125   
1  {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...   -0.8618   
2  {'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...    0.9951   
3  {'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...    0.9972   
4  {'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...   -0.2484   

  predicted_sentiment_label  
0                       neg  
1                       neg  
2                       pos  
3                       pos  
4                       neg  


In [45]:
# Necessary imports for the evaluation metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [46]:
# Performance evaluation on sentiment prediction
#
# Compares the labels from the data set with the labels predicted from the
# compound score and prints accuracy, confusion matrix and classification
# report.

true_values = df['label']
predicted_values = df['predicted_sentiment_label']

# The results get their own names. Assigning them to `confusion_matrix` /
# `classification_report` would overwrite the imported sklearn functions, and
# running this cell a second time would then fail because the names no longer
# refer to callables.
accuracy = accuracy_score(true_values, predicted_values)
# Rows are the true labels and columns the predicted labels (scikit-learn
# convention), in sorted label order.
conf_matrix = confusion_matrix(true_values, predicted_values)
class_report = classification_report(true_values, predicted_values)

print("The overall accuracy is as below: ")
print()
print(accuracy)

print()
print("---------------------------------------------------------")

print("The confusion matrix is as below: ")
print()
print(conf_matrix)

print()
print("---------------------------------------------------------")

print("The classification report is as below: ")
print()
print(class_report)

print()
print("---------------------------------------------------------")

The overall accuracy is as below: 

0.6357069143446853

---------------------------------------------------------
The confusion matrix is as below: 

[[427 542]
 [164 805]]

---------------------------------------------------------
The classification report is as below: 

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938


---------------------------------------------------------
