In [28]:
# necessary imports 
import numpy as np
import pandas as pd

# Load the tab-separated movie review file into a data frame and show it.
movie_reviews_df = pd.read_csv("moviereviews2.tsv", delimiter="\t")
print(movie_reviews_df)

     label                                             review
0      pos  I loved this movie and will watch it again. Or...
1      pos  A warm, touching movie that has a fantasy-like...
2      pos  I was not expecting the powerful filmmaking ex...
3      neg  This so-called "documentary" tries to tell tha...
4      pos  This show has been my escape from reality for ...
...    ...                                                ...
5995   pos  Of the three remakes of this plot, I like them...
5996   neg  Poor Whoopi Goldberg. Imagine her at a friend'...
5997   neg  Honestly before I watched this movie, I had he...
5998   pos  This movie is essentially shot on a hand held ...
5999   pos  It has singing. It has drama. It has comedy. I...

[6000 rows x 2 columns]


In [29]:
# Report the size of the data frame: the row count first, then the full shape.
row_num, column_num = movie_reviews_df.shape
print(len(movie_reviews_df))
print()
print(f"Row number: {row_num}, Column number: {column_num}")
print(f"There are {row_num} movies in the movie reviews data frame.")

6000

Row number: 6000, Column number: 2
There are 6000 movies in the movie reviews data frame.


In [30]:
# Display a boolean mask marking which cells are missing (isna is the alias of isnull).
movie_reviews_df.isna()

Unnamed: 0,label,review
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
5995,False,False
5996,False,False
5997,False,False
5998,False,False


In [31]:
# Same missing-value mask as a plain-text printout.
print(movie_reviews_df.isna())

      label  review
0     False   False
1     False   False
2     False   False
3     False   False
4     False   False
...     ...     ...
5995  False   False
5996  False   False
5997  False   False
5998  False   False
5999  False   False

[6000 rows x 2 columns]


In [32]:
# Count the missing (NaN) entries in each column.
movie_reviews_df.isna().sum()

label      0
review    20
dtype: int64

In [33]:
# Remove every row that contains a NaN value; the cleaned frame is bound back to
# the same name so the following cells keep working unchanged.
movie_reviews_df = movie_reviews_df.dropna()

In [34]:
# Re-count the missing entries per column to confirm the rows with NaN were dropped.
movie_reviews_df.isna().sum()

label     0
review    0
dtype: int64

In [35]:
# Check both the label and the review of every row for whitespace-only strings
# and collect the index of each offending row. (Previously only the review was
# tested even though the check is meant to cover both columns.)
# itertuples() yields (index, label, review) for this two-column frame.
blanks = [
    index
    for index, label, review in movie_reviews_df.itertuples()
    if label.isspace() or review.isspace()
]

print(blanks)
print("There are "+str(len(blanks))+" whitespace strings in the movie reviews data frame.")

[]
There are 0 whitespace strings in the movie reviews data frame.


In [36]:
# Tally how many reviews carry each label.
movie_labels = movie_reviews_df["label"]
label_counts = movie_reviews_df["label"].value_counts()
print(label_counts)

pos    2990
neg    2990
Name: label, dtype: int64


In [37]:
# Print one summary line per label, pairing each label key with its readable name.
for sentiment_key, sentiment_word in (("pos", "positive"), ("neg", "negative")):
    print(f"There are {label_counts[sentiment_key]} {sentiment_word} movies in the movie reviews data frame.")

There are 2990 positive movies in the movie reviews data frame.
There are 2990 negative movies in the movie reviews data frame.


In [38]:
from sklearn.model_selection import train_test_split

# Features are the review texts; targets are their labels.
X = movie_reviews_df['review']
y = movie_reviews_df['label']

# Hold out 33 percent of the rows as the test set and keep the remaining
# 67 percent for training. A fixed 'random_state' makes the split reproducible
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=40)

# Show each of the four resulting pieces under its own header, with a
# separator line before every section and after the last one.
separator = "-------------------------------------------------"
named_splits = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for split_name, split_data in named_splits:
    print(separator)
    print(f"The {split_name} data is as follows: ")
    print()
    print(split_data)
print(separator)

-------------------------------------------------
The X_train data is as follows: 

453     Weak Bobby "Pineapple Salsa" Flay and Mario Ba...
5974    While some performances were good-Victoria Row...
5964    THE SECRET OF KELLS may be the most exquisite ...
680     To me movies and acting is all about telling a...
3659    I've seen this movie at theater when it first ...
                              ...                        
5979    The movie seemed to appeal me because of the n...
3350    This is a so called 'feel-good' movies, howeve...
5445    I became more emotionally attached to this mov...
3072    First, before reading further, you must unders...
3409    This is one of the worst films ever. I like ch...
Name: review, Length: 4006, dtype: object
-------------------------------------------------
The X_test data is as follows: 

1391    I was interested to see the move thinking that...
4802    One of the best records of Israel's response t...
5136    This is a well-worn story abo

In [40]:
# Chain TF-IDF vectorisation and a linear support vector classifier into one
# estimator, then train it on the training split.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC()),
]
text_pipeline = Pipeline(steps=pipeline_steps)

# fit() hands back the pipeline, which the notebook shows as the cell output
text_pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svc', LinearSVC())])

In [41]:
# Print the plain-text representation of the fitted pipeline and its steps.
print(text_pipeline)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svc', LinearSVC())])


In [43]:
# Form the predictions: run the fitted pipeline on the held-out test reviews
# and keep the predicted labels for evaluation.
list_of_predictions = text_pipeline.predict(X_test)

In [48]:
# Evaluate the performance of the linear support vector classifier model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# NOTE: each result is stored under its own name (test_*). Assigning it back to
# the name of the metric function would overwrite that function with its
# result, so any later call to the metric in the same kernel would fail with
# "object is not callable".

# displaying the confusion matrix
print("The confusion matrix is as follows: ")
print()
test_confusion_matrix = confusion_matrix(y_test, list_of_predictions)
print(test_confusion_matrix)


print()
print("-----------------------------------------------------------")

# displaying the accuracy score
print("The overall accuracy score is as follows: ")
print()
test_accuracy = accuracy_score(y_test, list_of_predictions)
print(test_accuracy)

print()
print("-----------------------------------------------------------")

# displaying the classification report
print("The classification report is as follows: ")
print()
test_classification_report = classification_report(y_test, list_of_predictions)
print(test_classification_report)

print()
print("------------------------------------------------------------")

The confusion matrix is as follows: 

[[885  73]
 [ 67 949]]

-----------------------------------------------------------
The overall accuracy score is as follows: 

0.9290780141843972

-----------------------------------------------------------
The classification report is as follows: 

              precision    recall  f1-score   support

         neg       0.93      0.92      0.93       958
         pos       0.93      0.93      0.93      1016

    accuracy                           0.93      1974
   macro avg       0.93      0.93      0.93      1974
weighted avg       0.93      0.93      0.93      1974


------------------------------------------------------------
