## importing all the libraries

In [63]:
import nltk
import pandas as pd
import numpy as np
import math
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import LinearSVC 
from sklearn.metrics import f1_score #for measuring performance
from sklearn import preprocessing
from nltk.corpus import stopwords
import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from nltk.corpus import wordnet as wn
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lovey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lovey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lovey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lovey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Reading and Performing preprocessing 

In [64]:
# function for preprocessing the data to make the data clean and to increase efficiency

def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower()
    return text

In [65]:
#reading the testing and training data

data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

data.head()

Unnamed: 0,movie_id,movie_name,plot,genre
0,23890098,Taxi Blues,"Shlykov, a hard-working taxi driver and Lyosha...","['World cinema', 'Drama']"
1,31186339,The Hunger Games,The nation of Panem consists of a wealthy Capi...,"['Action/Adventure', 'Action', 'Science Fictio..."
2,20663735,Narasimham,Poovalli Induchoodan is sentenced for six yea...,"['Musical', 'Action', 'Drama']"
3,2231378,The Lemon Drop Kid,"The Lemon Drop Kid , a New York City swindler,...",['Comedy']
4,595909,A Cry in the Dark,Seventh-day Adventist Church pastor Michael Ch...,"['Crime Fiction', 'World cinema', 'Drama']"


In [66]:
#preprocessing continued cleaning 

data['clean_plot'] = data['plot'].apply(lambda x : clean_text(x))
data_test['clean_plot'] = data_test['plot'].apply(lambda x : clean_text(x))

In [67]:
#comparing the original plot and the clean plot after preprocessing

data.head()

Unnamed: 0,movie_id,movie_name,plot,genre,clean_plot
0,23890098,Taxi Blues,"Shlykov, a hard-working taxi driver and Lyosha...","['World cinema', 'Drama']",shlykov a hard working taxi driver and lyosha ...
1,31186339,The Hunger Games,The nation of Panem consists of a wealthy Capi...,"['Action/Adventure', 'Action', 'Science Fictio...",the nation of panem consists of a wealthy capi...
2,20663735,Narasimham,Poovalli Induchoodan is sentenced for six yea...,"['Musical', 'Action', 'Drama']",poovalli induchoodan is sentenced for six year...
3,2231378,The Lemon Drop Kid,"The Lemon Drop Kid , a New York City swindler,...",['Comedy'],the lemon drop kid a new york city swindler is...
4,595909,A Cry in the Dark,Seventh-day Adventist Church pastor Michael Ch...,"['Crime Fiction', 'World cinema', 'Drama']",seventh day adventist church pastor michael ch...


## Removing stopwords

In [68]:
#removing stopwords from the data 

stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

data['clean_plot'] = data['clean_plot'].apply(lambda x: remove_stopwords(x))
data_test['clean_plot'] = data_test['clean_plot'].apply(lambda x: remove_stopwords(x))

In [69]:
#visualizing the data after removal of stopwords

data.head()
data_test.head()

Unnamed: 0,movie_id,movie_name,plot,genre,clean_plot
0,33628861,Kan Simittum Neram,The film begins with a man ([[Karthik who tri...,['Thriller'],film begins man karthik tried kill lakshmi amb...
1,30413435,A Million,Eight people enter a reality TV show to win 10...,['Thriller'],eight people enter reality tv show win billion...
2,33643382,The Rainbowmaker,Datho has been innocent in prison for many ye...,['Romance Film'],datho innocent prison many years comes home no...
3,3961516,Distant,"Uzak tells the story of Yusuf , a young factor...","['World cinema', 'Drama']",uzak tells story yusuf young factory worker lo...
4,155303,Heavy Metal,The film's title sequence story opens with a...,"['Animation', 'Indie', 'Science Fiction', 'Adv...",films title sequence story opens space shuttle...


## Converting data to features

In [42]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(data['genre'])
multilabel_binarizer.fit(data_test['genre'])

#transforming the target value
y_train = multilabel_binarizer.transform(data['genre'])
y_test = multilabel_binarizer.transform(data_test['genre'])

In [34]:
#verfying that the data is converted into features
y_train[0]

array([1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

## implementing TD-IDF

In [35]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', max_features=10000)

In [36]:
x_train = data['clean_plot']  #as plot is the only thing that decides the genre.
x_test = data_test['clean_plot'] #as plot is the only thing that decides the genre.

In [37]:
#creating TD-IDF features

x_train = tfidf_vectorizer.fit_transform(x_train)
x_test = tfidf_vectorizer.fit_transform(x_test)

## Building Machine learning Model

In [38]:
#building movie genre prediction model

from sklearn.multiclass import OneVsRestClassifier

svc = LinearSVC()
clf = OneVsRestClassifier(svc)

In [39]:
#fitting the model on train data

clf.fit(x_train, y_train)



OneVsRestClassifier(estimator=LinearSVC())

In [40]:
#making prediction for movie genre

y_pred = clf.predict(x_test)

In [41]:
#calculating F1 score

f1_score(y_test, y_pred, average="micro")

0.7030192686997421