## Model Prediction and Discussion on Balanced Dataset

The classification algorithms MultinomialNB (Multinomial Naive Bayes Classification) and the Linear Support Vector Classification (LinearSVC) will be used on movies data.

## 1. Libraries and loading preprocessed data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import warnings


import pickle 
#import mglearn
import time


from nltk.tokenize import TweetTokenizer # doesn't split at apostrophes
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neural_network import MLPClassifier



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import cohen_kappa_score

from sklearn.svm import LinearSVC

In [2]:
movies = pd.read_csv('movies_preprocessed.csv', delimiter=',')
# movies.dataframeName = 'wiki_movie_plots_deduped.csv'
movies = movies.drop(columns="id")
nRow, nCol = movies.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 34886 rows and 11 columns


In [3]:
movies.head()

Unnamed: 0,PlotClean,TitleClean,MainGenresCount,action,animation,comedy,crime,drama,musical,romance,thriller
0,a bartender is working at a saloon serving dr...,kansas saloon smashers,0,0,0,0,0,0,0,0,0
1,the moon painted with a smiling face hangs ov...,love by the light of the moon,0,0,0,0,0,0,0,0,0
2,the film just over a minute long is composed...,the martyred presidents,0,0,0,0,0,0,0,0,0
3,lasting just 61 seconds and consisting of two ...,terrible teddy the grizzly king,0,0,0,0,0,0,0,0,0
4,the earliest known adaptation of the classic f...,jack and the beanstalk,0,0,0,0,0,0,0,0,0


## 2. Feature Engineering

**Train and Test split**

In [22]:
# the train and the test data set will be build when there is at least one genre for a movie
MoviesTrain, MoviesTest = train_test_split(movies[movies.MainGenresCount!=0], random_state=42, test_size=0.05, shuffle=True)

In [23]:
MoviesTrain.head()

Unnamed: 0,PlotClean,TitleClean,MainGenresCount,action,animation,comedy,crime,drama,musical,romance,thriller
25422,when the city is rocked by a series of brutal ...,shart,2,0,0,0,0,1,0,0,1
25237,angad kunal kapoor is a confused teenager tr...,vijeta,1,0,0,0,0,1,0,0,0
6693,in march 1957 commander dr eldon galbraithe ...,world without end,1,1,0,0,0,0,0,0,0
22619,the story is set in jin a ducal state under t...,sacrifice,1,0,0,0,0,1,0,0,0
24714,akhtar hussain saves the life of nawab salim a...,mere huzoor,2,0,0,0,0,1,0,1,0


In [24]:
action = MoviesTrain[['PlotClean','action']]
animation = MoviesTrain[['PlotClean','animation']]
comedy = MoviesTrain[['PlotClean','comedy']]
crime = MoviesTrain[['PlotClean','crime']]
drama = MoviesTrain[['PlotClean','drama']]
musical = MoviesTrain[['PlotClean','musical']]
romance = MoviesTrain[['PlotClean','romance']]
thriller = MoviesTrain[['PlotClean','thriller']]

In [25]:
actionp = action[action.action!=0]
actionn = action[action.action==0]
actionp = actionp.sample(frac=1,random_state=1)
actionn = actionn.sample(frac=(len(actionp)/len(actionn)),random_state=1)
final_action = pd.concat([actionp,actionn],axis=0)
final_action = final_action.sample(frac=1).reset_index(drop=True)
final_action

Unnamed: 0,PlotClean,action
0,the movie revolves around two powerful gangs i...,1
1,obrien plays an irish mine worker barney slan...,0
2,american intelligence officials learn that sov...,1
3,during the battle of the bulge in world war ii...,1
4,dana scully a former fbi agent is now a staf...,1
...,...,...
11317,nagarahavu is about a woman transforming into ...,0
11318,the film is an allegorical campaign film desi...,1
11319,dastan a street urchin in persia is adopted ...,1
11320,nandha nandha durairaj is an upright sub-ins...,1


In [26]:
animationp = animation[animation.animation!=0]
animationn = animation[animation.animation==0]
animationp = animationp.sample(frac=1,random_state=1)
animationn = animationn.sample(frac=(len(animationp)/len(animationn)),random_state=1)
final_animation = pd.concat([animationp,animationn],axis=0)
final_animation = final_animation.sample(frac=1).reset_index(drop=True)
comedyp = comedy[comedy.comedy!=0]
comedyn = comedy[comedy.comedy==0]
comedyp = comedyp.sample(frac=1,random_state=1)
comedyn = comedyn.sample(frac=(len(comedyp)/len(comedyn)),random_state=1)
final_comedy = pd.concat([comedyp,comedyn],axis=0)
final_comedy = final_comedy.sample(frac=1).reset_index(drop=True)
crimep = crime[crime.crime!=0]
crimen = crime[crime.crime==0]
crimep = crimep.sample(frac=1,random_state=1)
crimen = crimen.sample(frac=(len(crimep)/len(crimen)),random_state=1)
final_crime = pd.concat([crimep,crimen],axis=0)
final_crime = final_crime.sample(frac=1).reset_index(drop=True)
dramap = drama[drama.drama!=0]
draman = drama[drama.drama==0]
dramap = dramap.sample(frac=1,random_state=1)
draman = draman.sample(frac=(len(dramap)/len(draman)),random_state=1)
final_drama = pd.concat([dramap,draman],axis=0)
final_drama = final_drama.sample(frac=1).reset_index(drop=True)
musicalp = musical[musical.musical!=0]
musicaln = musical[musical.musical==0]
musicalp = musicalp.sample(frac=1,random_state=1)
musicaln = musicaln.sample(frac=(len(musicalp)/len(musicaln)),random_state=1)
final_musical = pd.concat([musicalp,musicaln],axis=0)
final_musical = final_musical.sample(frac=1).reset_index(drop=True)
romancep = romance[romance.romance!=0]
romancen = romance[romance.romance==0]
romancep = romancep.sample(frac=1,random_state=1)
romancen = romancen.sample(frac=(len(romancep)/len(romancen)),random_state=1)
final_romance = pd.concat([romancep,romancen],axis=0)
final_romance = final_romance.sample(frac=1).reset_index(drop=True)
thrillerp = thriller[thriller.thriller!=0]
thrillern = thriller[thriller.thriller==0]
thrillerp = thrillerp.sample(frac=1,random_state=1)
thrillern = thrillern.sample(frac=(len(thrillerp)/len(thrillern)),random_state=1)
final_thriller = pd.concat([thrillerp,thrillern],axis=0)
final_thriller = final_thriller.sample(frac=1).reset_index(drop=True)

In [27]:
print('nrow of the MoviesTrain ={}'. format(len(final_action)))
print('nrow of the MoviesTest ={}'. format(len(MoviesTest)))

nrow of the MoviesTrain =11322
nrow of the MoviesTest =1317


**Building the classes**

In [28]:
# building the classes
y_train_action = final_action['action']
y_test_action = MoviesTest['action']
y_train_animation = final_animation['animation']
y_test_animation = MoviesTest['animation']
y_train_comedy = final_comedy['comedy']
y_test_comedy = MoviesTest['comedy']
y_train_drama = final_drama['drama']
y_test_drama = MoviesTest['drama']
y_train_crime = final_crime['crime']
y_test_crime = MoviesTest['crime']
y_train_musical = final_musical['musical']
y_test_musical = MoviesTest['musical']
y_train_romance = final_romance['romance']
y_test_romance = MoviesTest['romance']
y_train_thriller = final_thriller['thriller']
y_test_thriller = MoviesTest['thriller']

## 3. Genre Classification Result

### 3.1 Multinomial Naive Bayes Classification for Main Genres

In [11]:
accuracy_multinomialNB=pd.DataFrame(columns=['Genre', 'precision_multinomialNB','recall_multinomialNB'])
accuracy_multinomialNB.head()

Unnamed: 0,Genre,precision_multinomialNB,recall_multinomialNB


In [12]:
multinomialNB=OneVsRestClassifier(MultinomialNB(alpha=10,fit_prior=True, class_prior=None))

In [13]:
i = 0
# building the features
# definition the algorithm for feature extraction
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_action = tfidf.fit_transform(final_action.PlotClean)
x_test_action  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_action, y_train_action)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'action'
prediction = multinomialNB.predict(x_test_action)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_action)):
    TP += 1 if y_test_action.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_action.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_action.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [14]:
# animation
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_animation = tfidf.fit_transform(final_animation.PlotClean)
x_test_animation  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_animation, y_train_animation)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'animation'
prediction = multinomialNB.predict(x_test_animation)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_animation)):
    TP += 1 if y_test_animation.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_animation.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_animation.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [15]:
# comedy
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_comedy= tfidf.fit_transform(final_comedy.PlotClean)
x_test_comedy  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_comedy, y_train_comedy)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'comedy'
prediction = multinomialNB.predict(x_test_comedy)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_comedy)):
    TP += 1 if y_test_comedy.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_comedy.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_comedy.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [16]:
# crime
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_crime= tfidf.fit_transform(final_crime.PlotClean)
x_test_crime  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_crime, y_train_crime)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'crime'
prediction = multinomialNB.predict(x_test_crime)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_crime)):
    TP += 1 if y_test_crime.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_crime.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_crime.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [17]:
# drama
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_drama= tfidf.fit_transform(final_drama.PlotClean)
x_test_drama  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_drama, y_train_drama)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'drama'
prediction = multinomialNB.predict(x_test_drama)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_drama)):
    TP += 1 if y_test_drama.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_drama.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_drama.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [18]:
# musical
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_musical= tfidf.fit_transform(final_musical.PlotClean)
x_test_musical  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_musical, y_train_musical)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'musical'
prediction = multinomialNB.predict(x_test_musical)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_musical)):
    TP += 1 if y_test_musical.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_musical.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_musical.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [19]:
# romance
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_romance= tfidf.fit_transform(final_romance.PlotClean)
x_test_romance  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_romance, y_train_romance)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'romance'
prediction = multinomialNB.predict(x_test_romance)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_romance)):
    TP += 1 if y_test_romance.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_romance.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_romance.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [20]:
# thriller
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_thriller= tfidf.fit_transform(final_thriller.PlotClean)
x_test_thriller  = tfidf.transform(MoviesTest.PlotClean)
multinomialNB.fit(x_train_thriller, y_train_thriller)
# compute the testing accuracy for plot
accuracy_multinomialNB.loc[i,'Genre'] = 'thriller'
prediction = multinomialNB.predict(x_test_thriller)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_thriller)):
    TP += 1 if y_test_thriller.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_thriller.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_thriller.iloc[j]==1 and prediction[j]==0 else 0
# print('Precision: {}'.format(TP/(TP+FP)))
# print('Recall: {}'.format(TP/(TP+FN)))
accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
i += 1

In [21]:
accuracy_multinomialNB

Unnamed: 0,Genre,precision_multinomialNB,recall_multinomialNB
0,action,0.793814,0.542254
1,animation,0.94,0.261111
2,comedy,0.757062,0.440789
3,crime,0.837209,0.171021
4,drama,0.778252,0.536765
5,musical,0.871795,0.0817308
6,romance,0.712963,0.229167
7,thriller,0.773196,0.35461


## 3.2 Logistic Regression for Main Genres

In [30]:
import warnings
warnings.filterwarnings('ignore')

In [31]:
accuracy_LR=pd.DataFrame(columns=['Genre', 'precision_LR','recall_LR'])
accuracy_LR.head()

Unnamed: 0,Genre,precision_LR,recall_LR


In [32]:
LR=OneVsRestClassifier(LogisticRegression(max_iter=12,C=0.9,n_jobs=-1), n_jobs=1)

In [33]:
i = 0
# action
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_action = tfidf.fit_transform(final_action.PlotClean)
x_test_action  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_action, y_train_action)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'action'
prediction = LR.predict(x_test_action)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_action)):
    TP += 1 if y_test_action.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_action.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_action.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [34]:
# animation
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_animation = tfidf.fit_transform(final_animation.PlotClean)
x_test_animation  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_animation, y_train_animation)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'animation'
prediction = LR.predict(x_test_animation)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_animation)):
    TP += 1 if y_test_animation.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_animation.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_animation.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [35]:
# comedy
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_comedy= tfidf.fit_transform(final_comedy.PlotClean)
x_test_comedy  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_comedy, y_train_comedy)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'comedy'
prediction = LR.predict(x_test_comedy)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_comedy)):
    TP += 1 if y_test_comedy.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_comedy.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_comedy.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [36]:
# crime
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_crime= tfidf.fit_transform(final_crime.PlotClean)
x_test_crime  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_crime, y_train_crime)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'crime'
prediction = LR.predict(x_test_crime)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_crime)):
    TP += 1 if y_test_crime.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_crime.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_crime.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [37]:
# drama
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_drama= tfidf.fit_transform(final_drama.PlotClean)
x_test_drama  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_drama, y_train_drama)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'drama'
prediction = LR.predict(x_test_drama)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_drama)):
    TP += 1 if y_test_drama.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_drama.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_drama.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [38]:
# musical
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_musical= tfidf.fit_transform(final_musical.PlotClean)
x_test_musical  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_musical, y_train_musical)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'musical'
prediction = LR.predict(x_test_musical)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_musical)):
    TP += 1 if y_test_musical.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_musical.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_musical.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [39]:
# romance
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_romance= tfidf.fit_transform(final_romance.PlotClean)
x_test_romance  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_romance, y_train_romance)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'romance'
prediction = LR.predict(x_test_romance)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_romance)):
    TP += 1 if y_test_romance.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_romance.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_romance.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [40]:
# thriller
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
x_train_thriller= tfidf.fit_transform(final_thriller.PlotClean)
x_test_thriller  = tfidf.transform(MoviesTest.PlotClean)
LR.fit(x_train_thriller, y_train_thriller)
# compute the testing accuracy for plot
accuracy_LR.loc[i,'Genre'] = 'thriller'
prediction = LR.predict(x_test_thriller)
TP = 0
FN = 0
FP = 0
for j in range(len(y_test_thriller)):
    TP += 1 if y_test_thriller.iloc[j]==1 and prediction[j]==1 else 0
    FN += 1 if y_test_thriller.iloc[j]==0 and prediction[j]==1 else 0
    FP += 1 if y_test_thriller.iloc[j]==1 and prediction[j]==0 else 0
accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
i += 1

In [41]:
accuracy_LR

Unnamed: 0,Genre,precision_LR,recall_LR
0,action,0.766323,0.501124
1,animation,0.82,0.297101
2,comedy,0.720339,0.462795
3,crime,0.825581,0.181122
4,drama,0.739872,0.566069
5,musical,0.820513,0.109966
6,romance,0.731481,0.212938
7,thriller,0.78866,0.414634
