In [1]:
import pandas as pd
import matplotlib.pyplot as plt


## Data Exploration

In [2]:
# Load only the Genre and Plot columns of the training CSV.
# na_values='unknown' makes any cell holding the literal string 'unknown' parse as NaN.
df = pd.read_csv(
    'datasets/train_wiki.csv',
    usecols=['Genre', 'Plot'],
    na_values='unknown',
)
df.head()

Unnamed: 0,Genre,Plot
0,,"A bartender is working at a saloon, serving dr..."
1,,"The moon, painted with a smiling face hangs ov..."
2,,"The film, just over a minute long, is composed..."
3,,Lasting just 61 seconds and consisting of two ...
4,,The earliest known adaptation of the classic f...


In [3]:
# Strip whitespace, remove extraneous genres if more than one per row
def clean_genre(s):
    """Return the leading genre of ``s`` with surrounding whitespace removed.

    Everything from the first '/', ',', '(' or '[' onward is discarded, so a
    multi-genre or annotated label collapses to its first entry.
    """
    for separator in '/,([':
        # partition() gives the text before the first separator,
        # or the whole string when the separator is absent.
        s = s.partition(separator)[0]
    return s.strip()

# Remove rows with no genre. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
df = df[df['Genre'].notna()].copy()
# Clean Genre column
df['Genre'] = df['Genre'].map(clean_genre)
# Remove rows where genre is less than 2 letters
df = df[df['Genre'].str.len() > 1].copy()
# Generate column of genre_ids (integer codes, numbered in order of first appearance)
df['genre_id'] = df['Genre'].factorize()[0]
# DF to keep track of genre and respective id
genre_id_df = df[['Genre', 'genre_id']].drop_duplicates().sort_values('genre_id')
# dicts for quick genre id lookup
genre_to_id = dict(genre_id_df.values)
id_to_category = dict(genre_id_df[['genre_id', 'Genre']].values)

df.head()


Unnamed: 0,Genre,Plot,genre_id
6,comedy,The film is about a family who move to the sub...,0
9,short,The Rarebit Fiend gorges on Welsh rarebit at a...,1
10,short action,The film features a train traveling through th...,2
11,short film,Irish villager Kathleen is a tenant of Captain...,3
12,biographical,Boone's daughter befriends an Indian maiden as...,4


## Imbalanced Classes
The number of movies per genre is imbalanced, with a few genres containing most of the movies. Because we want our classifier to give high prediction accuracy over the majority class while maintaining reasonable accuracy for the minority classes, we will leave it as is.

In [4]:
# Q: How is the distribution of movie genres?
# Number of (non-null) plots per genre; computed once and reused below.
genre_counts = df.groupby('Genre').Plot.count()
plt.hist(genre_counts)
plt.title('Movie genre distribution')
plt.ylabel('Frequency')
plt.xlabel('Number of movies per genre')
plt.show()
# A: Most genres have very few movies

min_movies = 50
unpop_genres = (genre_counts <= min_movies).sum()
num_genres = len(genre_id_df)
print("{:.2%} ({}) of {} genres have {} or fewer movies".format(
    unpop_genres / num_genres, unpop_genres, num_genres, min_movies))
genre_popularity = genre_counts.sort_values(ascending=False)
# Use .iloc for positional access: plain [0] on a Series with a string index
# relies on a positional fallback that pandas has deprecated.
print("The most popular genre, {}, has {} movies".format(
    genre_popularity.index[0], genre_popularity.iloc[0]))


94.58% (751) of 794 genres have 50 or fewer movies
The most popular genre, drama, has 6160 movies


In [5]:
# Keep only the most popular genres (the first `num_top_genres` entries of
# genre_popularity, which is sorted by movie count in descending order).
num_top_genres = 5
popular_genres = set(genre_popularity.index[:num_top_genres])
# Remove entries that do not fall in the top genres.
# Note: DataFrame.size counts cells (rows x columns), not rows.
print("DF size before filter: {}".format(df.size))
# .copy() gives an independent frame so the genre_id assignment below does not
# raise SettingWithCopyWarning.
df = df[df['Genre'].isin(popular_genres)].copy()
print("DF size after filter: {}".format(df.size))

# ReGenerate column of genre_ids so the ids are contiguous over the remaining genres
df['genre_id'] = df['Genre'].factorize()[0]
# DF to keep track of genre and respective id
genre_id_df = df[['Genre', 'genre_id']].drop_duplicates().sort_values('genre_id')
# dicts for quick genre id lookup
genre_to_id = dict(genre_id_df.values)
id_to_category = dict(genre_id_df[['genre_id', 'Genre']].values)



DF size before filter: 77736
DF size after filter: 44442


## Text Representation

Convert plot text into vectors with fixed size using bag of words model. Order of words is not preserved.

Calculate tf-idf for each plot description.

* `sublinear_tf` is set to `True` to use a logarithmic form for frequency
* `min_df` is the minimum number of documents a word must be present in to be kept
* `norm` is set to `l2` to ensure all our feature vectors have a Euclidean norm of 1
* `ngram_range` is set to `(1, 2)` to indicate that we want to consider both unigrams and bigrams
* `stop_words` is set to `'english'` to remove common English stop words (e.g. "the", "and", "is") and so reduce the number of noisy features

In [6]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer for unigrams and bigrams with sublinear tf scaling and l2-normalised rows.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit on every plot and densify the resulting document-term matrix.
features = tfidf.fit_transform(df['Plot']).toarray()
labels = df['genre_id']
features.shape

CPU times: user 18.5 s, sys: 1.59 s, total: 20.1 s
Wall time: 20.4 s


Now, each of 25912 plot descriptions is represented by 119365 features, representing the tf-idf score for different unigrams and bigrams.

Use `sklearn.feature_selection.chi2` to find the terms that are the most correlated with each of the genres.

In [7]:
from sklearn.feature_selection import chi2
import numpy as np

# Number of top-ranked unigrams / bigrams to print per genre.
N = 2

# The vocabulary is the same for every genre, so look it up once outside the loop.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for Genre, genre_id in sorted(genre_to_id.items()):
    # One-vs-rest chi2 test: this genre against all the others.
    features_chi2 = chi2(features, labels == genre_id)
    # argsort is ascending, so the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Genre))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))


# 'action':
  . Most correlated unigrams:
. kills
. kill
  . Most correlated bigrams:
. prakash raj
. police officer


# 'comedy':
  . Most correlated unigrams:
. death
. killed
  . Most correlated bigrams:
. jerry lewis
. mother riley


# 'drama':
  . Most correlated unigrams:
. vampire
. war
  . Most correlated bigrams:
. year old
. world war


# 'horror':
  . Most correlated unigrams:
. vampire
. creature
  . Most correlated bigrams:
. mrs bud
. van helsing


# 'romance':
  . Most correlated unigrams:
. rahul
. love
  . Most correlated bigrams:
. fall love
. love story


# Multi-Class Classifier: Features and Designs

Now that we have a vector representation of the plot descriptions, we can train supervised classifiers to classify unseen plot descriptions and predict the genre into which they fall.

**Naive Bayes Classifier**: the classifier most suitable for word counts is the multinomial variant

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out a test split of the raw plots and their genre labels.
X_train, X_test, y_train, y_test = train_test_split(
    df['Plot'], df['Genre'], random_state=0
)

# Raw term counts first, then tf-idf weighting on top of those counts.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Fit the multinomial Naive Bayes classifier on the tf-idf features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

After fitting the training set, let's make some predictions

In [9]:
# Hand-written plots used as a quick sanity check of the trained classifier.
sample_plots = [
    "kill kill kill kills",
    "love",
    "The members of a nina empire are in possession of an object of power composed of three individual pieces of a ninja sculpture which, when combined, makes the owner's arms impervious to blades. Other ninjas feel the ninja empire is in need of reform and steal two of the three sculpture pieces. The ninja empire retaliates with threatening messages delivered via tiny robots which demand return of the powerful pieces. Ninja empire minions forge multiple attempts to attack the thieves but are quickly thwarted.",
    # Adjacent string literals are concatenated with no separator, so each
    # first fragment ends with an explicit space.
    "Valentine's day is filled love love with love as lovely blossoms "
    "bloom and increase the delicious bagels of heart and chocolates",
    "James Bond enters the destruction temple of doom and "
    "attempts to fight and kill his way through deadly traps.",
]

for plot in sample_plots:
    # clf was fitted on tf-idf weighted counts, so new text must go through
    # the same count -> tf-idf pipeline before prediction; feeding raw counts
    # gives the model features on a different scale from the ones it was trained on.
    plot_tfidf = tfidf_transformer.transform(count_vect.transform([plot]))
    print(clf.predict(plot_tfidf))

['action']
['drama']
['drama']
['drama']
['drama']


## Model Selection

Benchmark the following four models:
* Logistic regression
* (Multinomial) Naive Bayes
* [Linear Support Vector Machine](https://medium.com/machine-learning-101/chapter-2-svm-support-vector-machine-theory-f0812effc72)
* Random Forest

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import seaborn as sns

# Candidate classifiers to compare with k-fold cross-validation.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0)
]
CV = 5

# Collect one (model, fold, accuracy) record per fold and build the frame
# once after the loop, rather than rebuilding it on every iteration.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Per-model accuracy distribution, with the individual folds overlaid as points.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()






















In [11]:
# Mean accuracy across the cross-validation folds, per model.
cv_df.groupby('model_name')['accuracy'].mean()


model_name
LinearSVC                 0.609076
LogisticRegression        0.616845
MultinomialNB             0.515315
RandomForestClassifier    0.415823
Name: accuracy, dtype: float64