### Data loading and preprocessing

In [1]:
import pandas as pd

df = pd.read_csv('movie_plots_yts.csv',index_col=0)
df.shape

(33812, 3)

In [2]:
def prep_genre(genre):
    
    dict_genres = {
        "Musical":"Music",
        "Crime":"Thriller",
        "Film-Noir":"Thriller",
        "Biography":"Documentary",
        "Sci-Fi":"Sci-Fi & Fantasy",
        "Fantasy":"Sci-Fi & Fantasy",
        "Action":"Action & Adventure",
        "Adventure":"Action & Adventure"
    }
        
    genre = genre.replace(" ","")
    
    if len(genre.split(",")) > 0:
        genres = genre.split(",")
        genres = ",".join(list(set([ dict_genres[genre] if genre in dict_genres.keys() else genre for genre in genres ])))
        
    return genres
    

In [3]:
df['Genres'] = df['Genres'].apply(prep_genre)

In [4]:
dummy_genres = df['Genres'].str.join(sep='').str.get_dummies(sep=',')
df = df.join(dummy_genres)

In [5]:
df

Unnamed: 0,Title,Plot,Genres,Action & Adventure,Animation,Comedy,Documentary,Drama,Family,History,Horror,Music,Romance,Sci-Fi & Fantasy,Sport,Thriller,War,Western
0,Who is Amos Otis?,"After assassinating the President, Amos Otis p...",Drama,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,Wheel of Time,Wheel of Time is Werner Herzog's photographed ...,Documentary,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,'B' Girl Rhapsody,Burlesque beauties performing their signature ...,Drama,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,The Brass Bottle,After being released from his bottle by Harold...,"Comedy,Sci-Fi & Fantasy",0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
6,The Morning After,The Morning After is a feature film that consi...,"Drama,Comedy",0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38044,100 Bloody Acres,The use of dead car crash victims in the Morga...,"Action & Adventure,Comedy,Horror",1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
38045,10 Years,"The night of their high school reunion, a grou...","Action & Adventure,Drama,Comedy,Romance",1,0,1,0,1,0,0,0,0,1,0,0,0,0,0
38046,10 Things I Hate About You,"Adapted from William Shakespeare's play ""The T...","Action & Adventure,Drama,Comedy,Romance",1,0,1,0,1,0,0,0,0,1,0,0,0,0,0
38047,+1,Three college friends hit the biggest party of...,"Horror,Sci-Fi & Fantasy,Romance,Action & Adven...",1,0,0,0,0,0,0,1,0,1,1,0,1,0,0


In [6]:
df[df['Action & Adventure'] == 1].shape

(25359, 18)

In [7]:
import re

#plot cleaning
df.Plot = df.Plot.apply(lambda x: re.sub("\\x97.*" , "", x).rstrip())
df.Plot.values[-100]

"Set in Ireland during the War of Independence, two sisters' lives are changed forever as they care for a wounded soldier in their home. What transpires in this historical period drama, is a tragic love story of an Anglo-Irish household and its inhabitants, caught in the crucible of deep dark secrets. Framed against a backdrop of a turbulent war-torn Ireland in the early1920's, May Collingwood is forced to make critical and difficult decisions when she rescues a British soldier and must now protect herself and sister Tilly. They live in fear of the British Black and Tans, the rising IRA, their own entrapment, and ultimately the dark secrets of un-requited love unfolding from within."

### SVC

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

X = df['Plot'].values
y = np.array(df.drop(['Title','Plot','Genres'], axis=1))


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=1)

In [10]:
X_train[0], y_train[0]

("A portrait of singer/songwriter Shawn Mendes' life, chronicling the past few years of his rise and journey.",
 array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64))

In [11]:
y_test[:,0].sum()

5090

In [12]:
import nltk
import spacy

nlp = spacy.load('en_core_web_sm')

doc_counter = 0
def reset_counter():
  global doc_counter
  doc_counter = 0

def increase_counter():
  global doc_counter
  doc_counter += 1
  if doc_counter % 100 == 0:
    print(doc_counter)

def spacy_nlp_tokenizer(text):
    increase_counter()
        
    doc = nlp(text)
    # lemmatized tokens, skipping stopwords
    lemmas = ['LEMMA_'+token.lemma_ for token in doc if not token.is_stop and token.ent_type != "PERSON"]

    lemma_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(lemmas,2)]
    #lemma_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(lemmas,3)]

    all_tokens = list()
    all_tokens.extend(lemmas)
    all_tokens.extend(lemma_bigrams)
    
    return all_tokens

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df=5)

X_train_tok = vect.fit_transform(X_train)

X_test_tok = vect.transform(X_test)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
1850

In [14]:
X_train_tok.shape, X_test_tok.shape, y_train.shape, y_test.shape

((27049, 47095), (6763, 47095), (27049, 15), (6763, 15))

In [33]:
#writing features to pickle

import pickle

with open('x_train_tok_multilabel.pkl','wb') as train_outfile:
    pickle.dump(X_train_tok,train_outfile)
with open('x_test_tok_multilabel.pkl','wb') as test_outfile:
    pickle.dump(X_test_tok,test_outfile)

In [15]:
#reading from pickle
import pickle

with open('x_train_tok.pkl','rb') as train_file:
    X_train_tok = pickle.load(train_file)

with open('x_test_tok.pkl','rb') as test_file:
    X_test_tok = pickle.load(test_file)

In [15]:
from sklearn.preprocessing import MaxAbsScaler #if its a dense matrix else use MaxAbsScaler in case of sparse matrix
scaler = MaxAbsScaler()
X_train_tok = scaler.fit_transform(X_train_tok)
X_test_tok = scaler.transform(X_test_tok)

## Grid search

In [19]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV



opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('clf', MultiOutputClassifier(LinearSVC()))  # learning algorithm
    ])

param_grid = [
    {'sel__k': [1000, 2000, 5000], 'clf__estimator__C': [0.01, 0.1, 1]}, 
 ]

n_jobs = 5

opt_search = GridSearchCV(opt_pipeline, param_grid, scoring="f1_macro", cv=5, n_jobs = n_jobs, verbose=3).fit(X_train_tok,y_train)





Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [20]:
# Get predictions for test data
y_test_pred = opt_search.predict(X_test_tok)

### Evaluation

In [23]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred, average="micro"), f1_score(y_test, y_test_pred, average="macro")

(0.6446650978591517, 0.5056051745552421)

In [24]:
from sklearn.metrics import multilabel_confusion_matrix

multilabel_confusion_matrix(y_test, y_test_pred)

array([[[ 157, 1516],
        [ 354, 4736]],

       [[6406,   47],
        [ 246,   64]],

       [[3982,  638],
        [1149,  994]],

       [[5606,  203],
        [ 469,  485]],

       [[2308, 1105],
        [1085, 2265]],

       [[6057,  109],
        [ 451,  146]],

       [[6395,   47],
        [ 271,   50]],

       [[5764,  165],
        [ 371,  463]],

       [[6249,   72],
        [ 302,  140]],

       [[5253,  308],
        [ 781,  421]],

       [[5575,  208],
        [ 609,  371]],

       [[6518,   36],
        [ 145,   64]],

       [[4541,  468],
        [ 917,  837]],

       [[6447,   59],
        [ 145,  112]],

       [[6573,   11],
        [  95,   84]]], dtype=int64)

### Feature inspection

In [25]:
tokenizer = vect
selector = opt_search.best_estimator_.named_steps['sel']
classifier = opt_search.best_estimator_.named_steps['clf']
feature_names = tokenizer.get_feature_names()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(selector.get_support(),selector.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score)
print("BEST FEATURES")
feats_w_score[:-100:-1]

BEST FEATURES


[(911.8139825376867, True, 'BI_LEMMA_II_LEMMA_,'),
 (851.894017040664, True, 'LEMMA_outlaw'),
 (830.4592473744627, True, 'LEMMA_War'),
 (786.951028254656, True, 'LEMMA_documentary'),
 (766.7892626607419, True, 'BI_LEMMA_World_LEMMA_War'),
 (692.590595287279, True, 'LEMMA_music'),
 (653.7717133469964, True, 'LEMMA_sport'),
 (629.434668444883, True, 'BI_LEMMA_War_LEMMA_II'),
 (620.1020672577341, True, 'LEMMA_II'),
 (613.8879388886535, True, 'LEMMA_war'),
 (602.9455347191704, True, 'BI_LEMMA_Civil_LEMMA_War'),
 (543.2924678376328, True, 'LEMMA_Civil'),
 (540.2601895300696, True, 'LEMMA_gunslinger'),
 (527.367756669383, True, 'LEMMA_soldier'),
 (510.06896315154165, True, 'LEMMA_coach'),
 (503.04867356199264, True, 'LEMMA_champion'),
 (502.96154045456916, True, 'LEMMA_singer'),
 (493.9701894715876, True, 'LEMMA_musical'),
 (488.33093874918995, True, 'BI_LEMMA_II_LEMMA_.'),
 (486.3447301437959, True, 'LEMMA_cowboy'),
 (477.6325290159718, True, 'LEMMA_love'),
 (473.63738802990486, True, 'LEMM