In [102]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [103]:
meta = pd.read_csv("movieset20k.csv", sep = ',', names=['titles', 'summary', 'genre1', 'genre2', 'genre3'])
meta.head()

Unnamed: 0,titles,summary,genre1,genre2,genre3
0,Doctor Strange in the Multiverse of Madness,Dr. Stephen Strange casts a forbidden spell th...,Action,Adventure,Fantasy
1,The Batman,When a sadistic serial killer begins murdering...,Action,Crime,Drama
2,The Northman,From visionary director Robert Eggers comes Th...,Action,Adventure,Drama
3,Everything Everywhere All at Once,An aging Chinese immigrant is swept up in an i...,Action,Adventure,Comedy
4,Uncharted,Street-smart Nathan Drake is recruited by seas...,Action,Adventure,


In [105]:
new_genres = []
for row in meta.iterrows():
    new_genres.append([])
    if row[1]["genre1"] != "":
        new_genres[-1].append(row[1]["genre1"] )
    if row[1]["genre2"] != "":
        new_genres[-1].append(row[1]["genre2"])
    if row[1]["genre3"] != "":
        new_genres[-1].append(row[1]["genre3"])
meta["genres"] = new_genres
meta.head()

Unnamed: 0,titles,summary,genre1,genre2,genre3,genres
0,Doctor Strange in the Multiverse of Madness,Dr. Stephen Strange casts a forbidden spell th...,Action,Adventure,Fantasy,"[Action, Adventure, Fantasy]"
1,The Batman,When a sadistic serial killer begins murdering...,Action,Crime,Drama,"[Action, Crime, Drama]"
2,The Northman,From visionary director Robert Eggers comes Th...,Action,Adventure,Drama,"[Action, Adventure, Drama]"
3,Everything Everywhere All at Once,An aging Chinese immigrant is swept up in an i...,Action,Adventure,Comedy,"[Action, Adventure, Comedy]"
4,Uncharted,Street-smart Nathan Drake is recruited by seas...,Action,Adventure,,"[Action, Adventure, ]"


In [109]:
def clean_text(text):
    text = re.sub("\'", "", text) 
    text = re.sub("[^a-zA-Z]"," ",text) 
    text = ' '.join(text.split()) 
    text = text.lower() 
    
    return text

In [110]:
meta['clean_summary'] = meta['summary'].apply(lambda x: clean_text(x))


In [111]:
meta

Unnamed: 0,titles,summary,genre1,genre2,genre3,genres,clean_summary
0,Doctor Strange in the Multiverse of Madness,Dr. Stephen Strange casts a forbidden spell th...,Action,Adventure,Fantasy,"[Action, Adventure, Fantasy]",dr stephen strange casts a forbidden spell tha...
1,The Batman,When a sadistic serial killer begins murdering...,Action,Crime,Drama,"[Action, Crime, Drama]",when a sadistic serial killer begins murdering...
2,The Northman,From visionary director Robert Eggers comes Th...,Action,Adventure,Drama,"[Action, Adventure, Drama]",from visionary director robert eggers comes th...
3,Everything Everywhere All at Once,An aging Chinese immigrant is swept up in an i...,Action,Adventure,Comedy,"[Action, Adventure, Comedy]",an aging chinese immigrant is swept up in an i...
4,Uncharted,Street-smart Nathan Drake is recruited by seas...,Action,Adventure,,"[Action, Adventure, ]",street smart nathan drake is recruited by seas...
...,...,...,...,...,...,...,...
20021,American Folk,"Two strangers, both folk musicians stranded in...",Drama,Musical,,"[Drama, Musical, ]",two strangers both folk musicians stranded in ...
20022,Chori Mera Kaam,Bhola Nath is a young petty criminal/robber/pi...,Comedy,Musical,Romance,"[Comedy, Musical, Romance]",bhola nath is a young petty criminal robber pi...
20023,Divine Will,"Life in Punkyville, Kentucky has always been a...",Comedy,Fantasy,Musical,"[Comedy, Fantasy, Musical]",life in punkyville kentucky has always been a ...
20024,Seven Days' Leave,Soldier Johnny Grey is engaged to marry singer...,Comedy,Musical,Romance,"[Comedy, Musical, Romance]",soldier johnny grey is engaged to marry singer...


In [112]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

meta['clean_summary'] = meta['clean_summary'].apply(lambda x: remove_stopwords(x))

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
meta

Unnamed: 0,titles,summary,genre1,genre2,genre3,genres,clean_summary
0,Doctor Strange in the Multiverse of Madness,Dr. Stephen Strange casts a forbidden spell th...,Action,Adventure,Fantasy,"[Action, Adventure, Fantasy]",dr stephen strange casts forbidden spell opens...
1,The Batman,When a sadistic serial killer begins murdering...,Action,Crime,Drama,"[Action, Crime, Drama]",sadistic serial killer begins murdering key po...
2,The Northman,From visionary director Robert Eggers comes Th...,Action,Adventure,Drama,"[Action, Adventure, Drama]",visionary director robert eggers comes northma...
3,Everything Everywhere All at Once,An aging Chinese immigrant is swept up in an i...,Action,Adventure,Comedy,"[Action, Adventure, Comedy]",aging chinese immigrant swept insane adventure...
4,Uncharted,Street-smart Nathan Drake is recruited by seas...,Action,Adventure,,"[Action, Adventure, ]",street smart nathan drake recruited seasoned t...
...,...,...,...,...,...,...,...
20021,American Folk,"Two strangers, both folk musicians stranded in...",Drama,Musical,,"[Drama, Musical, ]",two strangers folk musicians stranded californ...
20022,Chori Mera Kaam,Bhola Nath is a young petty criminal/robber/pi...,Comedy,Musical,Romance,"[Comedy, Musical, Romance]",bhola nath young petty criminal robber pickpoc...
20023,Divine Will,"Life in Punkyville, Kentucky has always been a...",Comedy,Fantasy,Musical,"[Comedy, Fantasy, Musical]",life punkyville kentucky always bit unconventi...
20024,Seven Days' Leave,Soldier Johnny Grey is engaged to marry singer...,Comedy,Musical,Romance,"[Comedy, Musical, Romance]",soldier johnny grey engaged marry singer mapy ...


In [123]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(meta['genres'])

# transform target variable
y = multilabel_binarizer.transform(meta['genres'])

y

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [124]:
xtrain, xval, ytrain, yval = train_test_split(meta['clean_summary'], y, test_size=0.2, random_state=9)

In [125]:
all_words = []
for value in meta['clean_summary']:
    all_words.extend(value.split())

all_words = set(all_words)
all_words_dic = {}
for i, word in enumerate(all_words):
    all_words_dic[word] = i
    

In [126]:
def vectorize(sent):
    ''' This function takes list of words in a sentence as input 
    and returns a vector of size of filtered_vocab.It puts 0 if the 
    word is not present in tokens and count of token if present.'''
    tokens = sent.split()
    vector=[0] * len(all_words)
    for token in tokens:
        vector[all_words_dic[token]] += 1 
    return vector

In [127]:
train_vectors = []
for sentence in xtrain:
    train_vectors.append(vectorize(sentence))
    

In [128]:
test_vectors = []
for sentence in xval:
    test_vectors.append(vectorize(sentence))

In [129]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score


lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [131]:
clf.fit(train_vectors, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [133]:
y_pred = clf.predict(test_vectors)

In [135]:
f1_score(yval, y_pred, average="micro")

0.4775438232548491