In [72]:
import pandas as pd
import matplotlib.pyplot as plt


## Data Exploration

In [2]:
# Load the training set, reading only the genre label and the plot text.
# The placeholder string 'unknown' is parsed as a missing value (NaN).
df = pd.read_csv(
    'datasets/train_wiki.csv',
    usecols=['Genre', 'Plot'],
    na_values='unknown',
)
df.head()

Unnamed: 0,Title,Genre,Plot
0,Kansas Saloon Smashers,,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",,Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,,The earliest known adaptation of the classic f...


In [57]:
# Strip whitespace, remove extraneous genres if more than one per row
def clean_genre(s):
    """Return the first genre named in a raw genre string, stripped of whitespace.

    Everything from the earliest '/', ',', '(' or '[' onward is discarded, so a
    multi-genre entry such as 'comedy, drama' collapses to its first genre.
    A string with none of these delimiters is only stripped.
    """
    # Positions of every delimiter that actually occurs in the string
    hits = [pos for pos in map(s.find, ('/', ',', '(', '[')) if pos != -1]
    # Cut at the earliest one, or keep the whole string when there is none
    cut = min(hits) if hits else len(s)
    return s[:cut].strip()

# Remove rows with no genre. .copy() yields an independent frame so the column
# assignments below do not write into a slice of the previous frame
# (which would raise pandas' SettingWithCopyWarning).
df = df[df['Genre'].notna()].copy()
# Clean Genre column: keep only the first listed genre, stripped of whitespace
df['Genre'] = df['Genre'].map(clean_genre)
# Remove rows where genre is less than 2 letters (e.g. empty after cleaning)
df = df[df['Genre'].str.len() > 1].copy()
# Generate column of integer genre_ids, numbered in order of first appearance
df['genre_id'] = df['Genre'].factorize()[0]
# DF to keep track of each genre and its respective id
genre_id_df = df[['Genre', 'genre_id']].drop_duplicates().sort_values('genre_id')
# dicts for quick lookup in both directions (genre -> id, id -> genre)
genre_to_id = dict(genre_id_df.values)
id_to_category = dict(genre_id_df[['genre_id', 'Genre']].values)

df.head()


Unnamed: 0,Title,Genre,Plot,genre_id
6,The Suburbanite,comedy,The film is about a family who move to the sub...,0
9,Dream of a Rarebit Fiend,short,The Rarebit Fiend gorges on Welsh rarebit at a...,1
10,From Leadville to Aspen: A Hold-Up in the Rockies,short action,The film features a train traveling through th...,2
11,Kathleen Mavourneen,short film,Irish villager Kathleen is a tenant of Captain...,3
12,Daniel Boone,biographical,Boone's daughter befriends an Indian maiden as...,4


## Imbalanced Classes
The number of movies per genre is imbalanced, with a few genres containing most of the movies. Because we want our classifier to give high prediction accuracy over the majority class while maintaining reasonable accuracy for the minority classes, we will leave it as is.

In [89]:
# Q: How is the distribution of movie genres?
# Number of movies (non-null plots) per genre; computed once and reused below
genre_counts = df.groupby('Genre').Plot.count()
plt.hist(genre_counts)
plt.title('Movie genre distribution')
plt.ylabel('Frequency')
plt.xlabel('Number of movies per genre')
plt.show()
# A: Most genres have very few movies

min_movies = 50
# How many genres sit at or below the threshold
unpop_genres = (genre_counts <= min_movies).sum()
num_genres = len(genre_id_df)
print("{:.2%} ({}) of {} genres have {} or fewer movies".format(
    unpop_genres / num_genres, unpop_genres, num_genres, min_movies))
# Genres ordered from most to least movies
genre_popularity = genre_counts.sort_values(ascending=False)
# .iloc makes the positional lookup explicit; plain [0] on a string-labelled
# Series relies on a deprecated integer-position fallback
print("The most popular genre, {}, has {} movies".format(genre_popularity.index[0], genre_popularity.iloc[0]))


94.58% (751) of 794 genres have 50 or fewer movies
The most popular genre, drama, has 6160 movies


'historical fantasy'

In [121]:
# Keep only the most popular genres (genre_popularity is sorted most-popular first)
num_top_genres = 5
# Slicing, unlike indexing position by position, also works when fewer genres exist
popular_genres = set(genre_popularity.index[:num_top_genres])
# Remove entries that do not fall in the top genres
# NOTE: DataFrame.size is rows x columns, not the number of rows
print("DF size before filter: {}".format(df.size))
# .copy() so the genre_id assignment below does not write into a slice
df = df[df['Genre'].isin(popular_genres)].copy()
print("DF size after filter: {}".format(df.size))

# ReGenerate column of genre_ids so ids are contiguous over the remaining genres
df['genre_id'] = df['Genre'].factorize()[0]
# DF to keep track of genre and respective id
genre_id_df = df[['Genre', 'genre_id']].drop_duplicates().sort_values('genre_id')
# dicts for quick genre id lookup
genre_to_id = dict(genre_id_df.values)
id_to_category = dict(genre_id_df[['genre_id', 'Genre']].values)



DF size before filter: 59256
DF size after filter: 59256


## Text Representation

Convert plot text into vectors with fixed size using bag of words model. Order of words is not preserved.

Calculate tf-idf for each plot description.

* `sublinear_tf` is set to `True` to use a logarithmic form for frequency
* `min_df` is the minimum number of documents a word must be present in to be kept
* `norm` is set to `l2` to ensure all our feature vectors have a Euclidean norm of 1
* `ngram_range` is set to `(1, 2)` to indicate that we want to consider both unigrams and bigrams
* `stop_words` is set to `'english'` to remove common English stop words (e.g. "the", "and", "is") and so reduce the number of noisy features

In [122]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(df.Plot).toarray()
labels = df.genre_id
features.shape

CPU times: user 19.5 s, sys: 2.61 s, total: 22.1 s
Wall time: 22.2 s


Now, each of 25912 plot descriptions is represented by 119365 features, representing the tf-idf score for different unigrams and bigrams.

Use `sklearn.feature_selection.chi2` to find the terms that are the most correlated with each of the genres.

In [123]:
from sklearn.feature_selection import chi2
import numpy as np

N = 2  # number of top unigrams / bigrams to print per genre

# The vocabulary is the same for every genre, so build the name array once.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for Genre, genre_id in sorted(genre_to_id.items()):
    # Chi-squared score of every term against the binary target "is this genre"
    features_chi2 = chi2(features, labels == genre_id)
    # Ascending sort: the highest-scoring terms end up last
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Genre))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))


# 'action':
  . Most correlated unigrams:
. kills
. kill
  . Most correlated bigrams:
. prakash raj
. police officer


# 'comedy':
  . Most correlated unigrams:
. death
. killed
  . Most correlated bigrams:
. jerry lewis
. mother riley


# 'drama':
  . Most correlated unigrams:
. vampire
. war
  . Most correlated bigrams:
. year old
. world war


# 'horror':
  . Most correlated unigrams:
. vampire
. creature
  . Most correlated bigrams:
. mrs bud
. van helsing


# 'romance':
  . Most correlated unigrams:
. rahul
. love
  . Most correlated bigrams:
. fall love
. love story


# Multi-Class Classifier: Features and Designs

Now that we have a vector representation of the plot descriptions, we can train supervised classifiers to classify unseen plot descriptions and predict the genre into which they fall.

**Naive Bayes Classifier**: the classifier most suitable for word counts is the multinomial variant