## Model Prediction on Unbalanced Dataset

The classification algorithms MultinomialNB (Multinomial Naive Bayes Classification) and the Linear Support Vector Classification (LinearSVC) will be used on movies data.

## 1. Libraries and loading preprocessed data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import warnings


import pickle 
#import mglearn
import time


from nltk.tokenize import TweetTokenizer # doesn't split at apostrophes
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neural_network import MLPClassifier



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import cohen_kappa_score

from sklearn.svm import LinearSVC

In [2]:
movies = pd.read_csv('movies_preprocessed.csv', delimiter=',')
# movies.dataframeName = 'wiki_movie_plots_deduped.csv'
movies = movies.drop(columns="id")
nRow, nCol = movies.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 34886 rows and 11 columns


In [3]:
movies.head()

Unnamed: 0,PlotClean,TitleClean,MainGenresCount,action,animation,comedy,crime,drama,musical,romance,thriller
0,a bartender is working at a saloon serving dr...,kansas saloon smashers,0,0,0,0,0,0,0,0,0
1,the moon painted with a smiling face hangs ov...,love by the light of the moon,0,0,0,0,0,0,0,0,0
2,the film just over a minute long is composed...,the martyred presidents,0,0,0,0,0,0,0,0,0
3,lasting just 61 seconds and consisting of two ...,terrible teddy the grizzly king,0,0,0,0,0,0,0,0,0
4,the earliest known adaptation of the classic f...,jack and the beanstalk,0,0,0,0,0,0,0,0,0


## 2. Feature Engineering

**Train and Test split**

In [4]:
# the train and the test data set will be build when there is at least one genre for a movie
MoviesTrain, MoviesTest = train_test_split(movies[movies.MainGenresCount!=0], random_state=42, test_size=0.20, shuffle=True)

In [5]:
MoviesTrain.head()

Unnamed: 0,PlotClean,TitleClean,MainGenresCount,action,animation,comedy,crime,drama,musical,romance,thriller
30733,the story is told through the protagonist muru...,veyil,1,0,0,0,0,1,0,0,0
1040,eddie haskins lease a wisecracking young ma...,troopers three,1,0,0,1,0,0,0,0,0
16958,five days after the assault on the abnegation ...,the divergent series: insurgent,1,1,0,0,0,0,0,0,0
5844,tom is busy designing a mousetrap in the attic...,designs on jerry,1,0,1,0,0,0,0,0,0
2040,alan colby heir to a vast fortune reappears ...,charlie chan secret,1,0,0,1,0,0,0,0,0


**Features**

In [6]:
# definition the algorithm for feature extraction
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')

In [7]:
# building the features
x_train = tfidf.fit_transform(MoviesTrain.PlotClean) 
x_test  = tfidf.transform(MoviesTest.PlotClean)
### for test data, the feature extraction will be done through the function transform()
### to make sure there is no features dimensionality mismatch

In [8]:
print('nrow of the MoviesTrain ={}'. format(MoviesTrain.shape[0]))
print('nrow of the MoviesTest ={}'. format(MoviesTest.shape[0]))

nrow of the MoviesTrain =21065
nrow of the MoviesTest =5267


**Building the classes**

In [9]:
# building the classes
y_train = MoviesTrain[MoviesTrain.columns[3:]]
y_test = MoviesTest[MoviesTest.columns[3:]]

In [10]:
print('number of y_train classes',len(y_train.columns))
print('number of y_test classes',len(y_test.columns))

number of y_train classes 8
number of y_test classes 8


## 3. Genre Classification Result

### 3.1 Multinomial Naive Bayes Classification for main genres

In [11]:
mainGenres = ['drama','comedy','action','thriller','romance','crime','musical','animation']

In [22]:
accuracy_multinomialNB=pd.DataFrame(columns=['Genre', 'precision_multinomialNB','recall_multinomialNB'])
accuracy_multinomialNB.head()
multinomialNB=OneVsRestClassifier(MultinomialNB(alpha=10,fit_prior=True, class_prior=None))

In [23]:
i=0
for genre in mainGenres:
    multinomialNB.fit(x_train, y_train[genre])
    prediction = multinomialNB.predict(x_test)
    accuracy_multinomialNB.loc[i,'Genre'] = genre
#     accuracy_multinomialNB.loc[i,'accuracy_multinomialNB'] = accuracy_score(y_test[genre], prediction)
    TP = 0
    FN = 0
    FP = 0
    for j in range(len(y_test)):
        TP += 1 if y_test[genre].iloc[j]==1 and prediction[j]==1 else 0
        FN += 1 if y_test[genre].iloc[j]==0 and prediction[j]==1 else 0
        FP += 1 if y_test[genre].iloc[j]==1 and prediction[j]==0 else 0
    accuracy_multinomialNB.loc[i,'precision_multinomialNB'] = TP/(TP+FP)
    accuracy_multinomialNB.loc[i,'recall_multinomialNB'] = TP/(TP+FN)
    i += 1
    

    
accuracy_multinomialNB

Unnamed: 0,Genre,precision_multinomialNB,recall_multinomialNB
0,drama,0.665409,0.558066
1,comedy,0.649966,0.544835
2,action,0.6473,0.628776
3,thriller,0.488435,0.5781
4,romance,0.501027,0.297199
5,crime,0.1875,0.394737
6,musical,0.0735294,0.227273
7,animation,0.546448,0.653595


### 3.2 Logistic Regression for main genres

In [24]:
LR=OneVsRestClassifier(LogisticRegression(max_iter=12,C=0.9,n_jobs=-1), n_jobs=1)
accuracy_LR=pd.DataFrame(columns=['Genre', 'precision_LR','recall_LR'])
accuracy_LR.head()

Unnamed: 0,Genre,precision_LR,recall_LR


In [18]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
i = 0
for genre in mainGenres:
    LR.fit(x_train, y_train[genre])
    prediction = LR.predict(x_test)
    accuracy_LR.loc[i,'Genre'] = genre
#     accuracy_LR.loc[i,'accuracy_LR'] = accuracy_score(y_test[genre], prediction)
    TP = 0
    FN = 0
    FP = 0
    for j in range(len(y_test)):
        TP += 1 if y_test[genre].iloc[j]==1 and prediction[j]==1 else 0
        FN += 1 if y_test[genre].iloc[j]==0 and prediction[j]==1 else 0
        FP += 1 if y_test[genre].iloc[j]==1 and prediction[j]==0 else 0
    accuracy_LR.loc[i,'precision_LR'] = TP/(TP+FP)
    accuracy_LR.loc[i,'recall_LR'] = TP/(TP+FN)
    i += 1
accuracy_LR

Unnamed: 0,Genre,precision_LR,recall_LR
0,drama,0.610453,0.591028
1,comedy,0.548409,0.660147
2,action,0.599836,0.698095
3,thriller,0.472109,0.709611
4,romance,0.351129,0.446475
5,crime,0.2875,0.398268
6,musical,0.294118,0.487805
7,animation,0.453552,0.721739
