In [1]:
import pandas as pd

## Data Exploration

In [11]:
# Load the training set, keeping only the three columns used below.
# Cells containing the literal string 'unknown' are parsed as missing (NaN).
df = pd.read_csv(
    'datasets/train_wiki.csv',
    usecols=['Title', 'Genre', 'Plot'],
    na_values='unknown',
)
df.head()

Unnamed: 0,Title,Genre,Plot
0,Kansas Saloon Smashers,,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",,Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,,The earliest known adaptation of the classic f...


In [16]:
def clean_genre(s):
    """Return the first genre named in ``s`` with surrounding whitespace removed.

    Everything from the earliest '/', ',' or '(' onward is discarded, so a
    multi-genre label such as 'comedy / drama' becomes 'comedy'. A string
    containing none of these characters is only stripped.
    """
    # Positions of each delimiter that actually occurs in the string.
    cut_points = [pos for pos in map(s.find, '/,(') if pos != -1]
    # Cutting at the earliest delimiter keeps just the leading genre.
    head = s[:min(cut_points)] if cut_points else s
    return head.strip()

# Remove rows with no genre. Take an explicit copy of the filtered selection
# so the column assignments below write to an independent frame rather than
# triggering pandas' SettingWithCopyWarning.
df = df[pd.notnull(df['Genre'])].copy()
# Clean Genre column (clean_genre keeps only the first listed genre, stripped)
df['Genre'] = df['Genre'].map(clean_genre)
# Generate column of genre_ids: factorize() returns (codes, uniques); the
# integer codes are numbered in order of first appearance of each genre
df['genre_id'] = df['Genre'].factorize()[0]
# DF to keep track of genre and respective id (one row per distinct genre)
genre_id_df = df[['Genre', 'genre_id']].drop_duplicates().sort_values('genre_id')
# dicts for quick genre id lookup: each row of .values is a (key, value) pair
genre_to_id = dict(genre_id_df.values)
id_to_category = dict(genre_id_df[['genre_id', 'Genre']].values)

df.head()


Unnamed: 0,Title,Genre,Plot,genre_id
6,The Suburbanite,comedy,The film is about a family who move to the sub...,0
9,Dream of a Rarebit Fiend,short,The Rarebit Fiend gorges on Welsh rarebit at a...,1
10,From Leadville to Aspen: A Hold-Up in the Rockies,short action,The film features a train traveling through th...,2
11,Kathleen Mavourneen,short film,Irish villager Kathleen is a tenant of Captain...,3
12,Daniel Boone,biographical,Boone's daughter befriends an Indian maiden as...,4
