In [1]:
import ast
from datetime import datetime
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostRegressor, Pool, cv

%matplotlib inline

In [2]:
# Notebook-wide display settings: show up to 99 DataFrame columns and use 16x9 figures.
pd.set_option('display.max_columns', 99)
plt.rcParams.update({'figure.figsize': (16, 9)})

# Data Preparation

## Load Data

In [3]:
# Load the train and test splits, using the `id` column as the row index.
df_train = pd.read_csv('data/train.csv', index_col='id')
df_test = pd.read_csv('data/test.csv', index_col='id')
# Print both shapes, then preview the first training rows.
print(df_train.shape, df_test.shape)
df_train.head()

(3000, 22) (4398, 21)


Unnamed: 0_level_0,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,,"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


## Prepare Columns

First, drop columns that won't be considered for this model.

In [4]:
# Columns that are not used by this model; the same set is removed from both splits.
unused_cols = ['imdb_id', 'poster_path']
df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

### Non-Dictionary Columns

#### homepage

In [5]:
# Homepages that occur more than once in the training set (counts computed a single time).
df_train['homepage'].value_counts().loc[lambda counts: counts > 1]

http://www.transformersmovie.com/    4
http://www.lordoftherings.net/       2
http://www.thehobbit.com/            2
Name: homepage, dtype: int64

Very few films share a homepage - values are either unique or missing. Create a binary column that captures whether a film has a homepage.

In [6]:
def prepare_homepage(df):
    """Replace the `homepage` column with a 0/1 `has_homepage` flag.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `homepage` column.

    Returns
    -------
    pandas.DataFrame
        The same frame, with `has_homepage` added (1 where `homepage` is not
        null, 0 otherwise) and `homepage` removed.
    """
    homepage_present = df['homepage'].notnull()
    df['has_homepage'] = homepage_present.astype('int64')
    df.drop(columns='homepage', inplace=True)
    return df

# Apply the homepage flag to both splits (train first, then test).
df_train, df_test = map(prepare_homepage, (df_train, df_test))

#### original_language

For this column, each row only has a single value, so it can be left as a single column and be specified as a categorical variable in the CatBoost model. Given the infrequent occurrence of many of the values, however, only the top 10 original languages will be kept; all others are grouped as 'Other'.

In [7]:
# The ten most frequent languages are taken from the training set only and applied to both splits.
top10_langs = df_train['original_language'].value_counts(dropna=False).index[:10]

for frame in (df_train, df_test):
    langs = frame['original_language']
    # Keep a language if it is in the top ten, otherwise bucket it as 'Other'.
    frame['original_language'] = langs.where(langs.isin(top10_langs), 'Other')

#### release_date

Release dates use two-digit years, so the century (19xx or 20xx) is ambiguous. Expand them to four digits before converting to datetime; otherwise older films can be parsed as dates in the future.

In [8]:
def adjust_dates(x, cutoff=19):
    """Expand the two-digit year of an ``M/D/YY`` date string to four digits.

    Years ``00``..``cutoff`` are mapped to 20xx, all later years to 19xx.

    Parameters
    ----------
    x : str
        Date string whose last ``/``-separated field is the year.
    cutoff : int, default 19
        Largest two-digit year treated as 20xx. The default reproduces the
        original hard-coded rule.

    Returns
    -------
    str
        The date with a four-digit year. Values that are not strings, have no
        ``/`` separator, or whose year field is not a one- or two-digit number
        (for example an already expanded ``2015``) are returned unchanged.
    """
    if not isinstance(x, str):
        return x

    prefix, sep, year = x.rpartition('/')
    # Leave anything that is not a short numeric year alone. Slicing off the
    # last two characters unconditionally would corrupt four-digit years.
    if not sep or len(year) > 2 or not year.isdigit():
        return x

    century = '20' if int(year) <= cutoff else '19'
    # zfill keeps one-digit years well-formed ('5' -> '05').
    return prefix + sep + century + year.zfill(2)

In [9]:
for frame in (df_train, df_test):
    # Missing release dates are filled with the placeholder '01/01/00' before
    # the year is expanded and the column is parsed to datetime.
    date_strings = frame['release_date'].fillna('01/01/00').apply(adjust_dates)
    frame['release_date'] = pd.to_datetime(date_strings)

In [10]:
def prepare_release_dates(df):
    """Split `release_date` into year, month and day-of-week columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime `release_date` column.

    Returns
    -------
    pandas.DataFrame
        The same frame with `release_year` and `release_month` (numerical) and
        `release_dow` (categorical) added, and `release_date` removed.
    """
    date_parts = (
        ('release_year', 'year'),        # Numerical
        ('release_month', 'month'),      # Numerical
        ('release_dow', 'dayofweek'),    # Categorical
    )
    for feature, attr in date_parts:
        # Bind `attr` as a default so each lambda reads its own attribute.
        df[feature] = df['release_date'].apply(lambda ts, attr=attr: getattr(ts, attr))
    df.drop(columns='release_date', inplace=True)
    return df

# Derive the date features for both splits (train first, then test).
df_train, df_test = map(prepare_release_dates, (df_train, df_test))

#### status

In [11]:
# Distribution of `status` in the training set; dropna=False also counts missing values.
df_train['status'].value_counts(dropna=False)

Released    2996
Rumored        4
Name: status, dtype: int64

In [12]:
# Same check on the test set; dropna=False also counts missing values.
df_test['status'].value_counts(dropna=False)

Released           4389
Post Production       5
Rumored               2
NaN                   2
Name: status, dtype: int64

There is very little information contained in the status column, so it can be dropped.

In [13]:
# Remove `status` from both splits, modifying each frame in place.
for frame in (df_train, df_test):
    frame.drop(columns='status', inplace=True)

### Dictionary Columns

Some of the columns contain one or more dictionaries as strings. First, convert them to Python objects.

In [14]:
# Columns whose raw values are string representations of lists of dictionaries.
dict_cols = ['belongs_to_collection', 'genres', 'production_companies', 'production_countries',
             'spoken_languages', 'Keywords', 'cast', 'crew']


def _parse_names(cell):
    """Return the list of `name` values held in one raw cell.

    Accepts the raw string form (parsed with `ast.literal_eval`), an already
    parsed list (so re-running the cell is harmless), or a missing value
    (returns an empty list).
    """
    if isinstance(cell, str):
        items = ast.literal_eval(cell)
    elif isinstance(cell, (list, tuple)):
        items = cell
    else:
        # NaN / None: no entries for this row.
        return []
    # Entries that are already plain names (second run) are kept as they are.
    return [item['name'] if isinstance(item, dict) else item for item in items]


def prepare_dict_cols(df, dict_cols=dict_cols):
    """Convert each dictionary-string column into a list of names.

    The frame is modified in place and also returned. Running the function
    again on a frame it has already processed leaves the columns unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in `dict_cols`.
    dict_cols : list of str
        Columns to convert; defaults to the module-level `dict_cols`.

    Returns
    -------
    pandas.DataFrame
        The same frame, each listed column now holding a list of names
        (empty where the value was missing).

    Raises
    ------
    KeyError
        If a parsed dictionary has no `name` key.
    """
    for col in dict_cols:
        df[col] = df[col].apply(_parse_names)
    return df

# Parse the dictionary-string columns of both splits. The second result is
# bound to `df_test` (it was previously assigned to a stray `df_text` name).
df_train = prepare_dict_cols(df_train)
df_test = prepare_dict_cols(df_test)

In [15]:
# Preview the training frame after the dictionary columns were parsed.
df_train.head()

Unnamed: 0_level_0,belongs_to_collection,budget,genres,original_language,original_title,overview,popularity,production_companies,production_countries,runtime,spoken_languages,tagline,title,Keywords,cast,crew,revenue,has_homepage,release_year,release_month,release_dow
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,[Hot Tub Time Machine Collection],14000000,[Comedy],en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,"[Paramount Pictures, United Artists, Metro-Gol...",[United States of America],93.0,[English],The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[time travel, sequel, hot tub, duringcreditsst...","[Rob Corddry, Craig Robinson, Clark Duke, Adam...","[Kelly Cantley, Steve Pink, Josh Heald, Josh H...",12314651,0,2015,2,4
2,[The Princess Diaries Collection],40000000,"[Comedy, Drama, Family, Romance]",en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,[Walt Disney Pictures],[United States of America],113.0,[English],It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[coronation, duty, marriage, falling in love]","[Anne Hathaway, Julie Andrews, H√©ctor Elizond...","[Garry Marshall, Charles Minsky, John Debney, ...",95149435,0,2004,8,4
3,[],3300000,[Drama],en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,"[Bold Films, Blumhouse Productions, Right of W...",[United States of America],105.0,[English],The road to greatness can take you to the edge.,Whiplash,"[jazz, obsession, conservatory, music teacher,...","[Miles Teller, J.K. Simmons, Melissa Benoist, ...","[Terri Taylor, Richard Henderson, Jeffrey Stot...",13092000,1,2014,10,4
4,[],1200000,"[Thriller, Drama]",hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,[],[India],122.0,"[English, हिन्दी]",,Kahaani,"[mystery, bollywood, police corruption, crime,...","[Vidya Balan, Nawazuddin Siddiqui, Parambrata ...","[Sujoy Ghosh, Sujoy Ghosh, Sujoy Ghosh]",16000000,1,2012,3,4
5,[],0,"[Action, Thriller]",ko,마린보이,Marine Boy is the story of a former national s...,1.14807,[],[South Korea],118.0,[한국어/조선말],,Marine Boy,[],"[Kim Kang-woo, Jo Jae-hyeon, Park Si-yeon, Kim...","[Jong-seok Yoon, Jong-seok Yoon]",3923970,0,2009,2,3


#### belongs_to_collection

Create columns that capture if a film belongs to a collection.

In [16]:
def prepare_belongs_to_collection(df):
    """Add a `has_collection` column from `belongs_to_collection`.

    The value is the number of entries in each row's `belongs_to_collection`
    list, so it is 0 for films without a collection. The frame is modified in
    place and also returned; `belongs_to_collection` itself is kept.
    """
    df['has_collection'] = df['belongs_to_collection'].map(len)
    return df

# Add the collection feature to both splits (train first, then test).
df_train, df_test = map(prepare_belongs_to_collection, (df_train, df_test))

### Multi-Label Target Encoding

Create function to encode the multi-label categorical columns.

In [17]:
# Checkpoint: snapshot both frames before the target-encoding cells below
# overwrite the list-valued columns.
tr = df_train.copy()
te = df_test.copy()

In [18]:
# Restore both frames from the `tr`/`te` snapshots - presumably so that the
# encoding cells below can be re-run from a clean state.
df_train = tr.copy()
df_test = te.copy()

In [19]:
def ml_te(x, mean_revs, total_mean_rev):
    """Encode one row's labels as the average of the labels' mean revenues.

    Parameters
    ----------
    x : list
        Labels of a single row (may be empty, may contain repeats).
    mean_revs : dict
        Mean training revenue per label.
    total_mean_rev : float
        Fallback used for labels missing from `mean_revs` and for empty rows.

    Returns
    -------
    float
        `total_mean_rev` for an empty row, otherwise the mean over the row's
        labels (repeats counted each time).
    """
    if len(x) == 0:
        return total_mean_rev
    return sum(mean_revs.get(item, total_mean_rev) for item in x) / len(x)


def _ml_te_leave_one_out(items, revenue, counts, revs, total_mean_rev):
    """Encode one training row without using that row's own revenue."""
    if len(items) == 0:
        return total_mean_rev

    own = Counter(items)  # a label may be repeated within one row
    total = 0
    for item in items:
        remaining = counts[item] - own[item]
        if remaining > 0:
            total += (revs[item] - own[item] * revenue) / remaining
        else:
            # The label occurs in no other row: fall back to the global mean.
            total += total_mean_rev
    return total / len(items)


def get_multilabel_targetencodings(df_train, df_test, col, leave_one_out=False):
    """Target-encode a list-valued column with per-label mean revenue.

    Both frames are modified in place (`col` is overwritten with floats) and
    also returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame with `col` (lists of labels) and `revenue`.
    df_test : pandas.DataFrame
        Test frame with `col`; encoded with statistics from `df_train` only.
    col : str
        Name of the list-valued column to encode.
    leave_one_out : bool, default False
        With the default, each training row is encoded with label means that
        include the row's own revenue. For labels that occur in few rows the
        feature then largely repeats the target (target leakage). Set to True
        to exclude each training row's own contribution from its encoding.
        The test encoding is the same in both modes.

    Returns
    -------
    tuple of pandas.DataFrame
        `(df_train, df_test)`.
    """
    total_mean_rev = df_train['revenue'].mean()

    # Accumulate occurrence counts and revenue sums per label.
    counts, revs = Counter(), Counter()
    for items, revenue in zip(df_train[col], df_train['revenue']):
        for item in items:
            counts[item] += 1
            revs[item] += revenue

    mean_revs = {label: revs[label] / counts[label] for label in counts}

    if leave_one_out:
        # Computed in full before the column is overwritten.
        train_encoded = [
            _ml_te_leave_one_out(items, revenue, counts, revs, total_mean_rev)
            for items, revenue in zip(df_train[col], df_train['revenue'])
        ]
    else:
        train_encoded = df_train[col].apply(ml_te, args=(mean_revs, total_mean_rev))

    df_test[col] = df_test[col].apply(ml_te, args=(mean_revs, total_mean_rev))
    df_train[col] = train_encoded

    return df_train, df_test

In [None]:
# Target-encode every list-valued column in turn. Each call overwrites that
# column in both frames, so this cell cannot be re-run without restoring them.
for col in dict_cols:
    df_train, df_test = get_multilabel_targetencodings(df_train, df_test, col)

In [None]:
# Preview the training frame after target encoding.
df_train.head()

# CatBoost Model

An initial model can be created, discarding the text columns for now.

In [None]:
# One shared column list keeps the train and test matrices in the same order.
feature_cols = [
    'belongs_to_collection', 'budget', 'genres', 'original_language', 'popularity',
    'production_companies', 'production_countries', 'runtime', 'spoken_languages',
    'Keywords', 'cast', 'crew', 'has_homepage', 'release_year', 'release_month',
    'release_dow', 'has_collection',
]
X_train = df_train[feature_cols]
X_test = df_test[feature_cols]
# Use log of revenue to reflect RMSLE evaluation metric
y_train_log1p = np.log1p(df_train['revenue'])

In [None]:
# Preview the feature matrix.
X_train.head()

In [None]:
# Show the first five rows of the columns at positions 3 and 15, the positions
# passed to CatBoost as categorical features in the next cell.
X_train.iloc[:5, [3, 15]]

In [None]:
# Look the categorical column positions up by name rather than hard-coding
# them, so they stay correct if the feature list is reordered or extended.
categorical_features = ['original_language', 'release_dow']
categorical_features_indices = [X_train.columns.get_loc(name) for name in categorical_features]

In [None]:
import math
from catboost import Pool, CatBoostClassifier


class RMSLE(object):
    """Custom CatBoost objective: squared error on a log1p-transformed target.

    The model in this notebook is trained on `log1p(revenue)`, so squared
    error on that target corresponds to the RMSLE metric on revenue. The
    previous body computed the derivatives of the binary Logloss objective
    (sigmoid of the prediction, target thresholded at 0), which is not a
    regression loss.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return the per-object first and second derivatives.

        Parameters
        ----------
        approxes : indexed container of float
            Current predictions, one per object.
        targets : indexed container of float
            Target values, same length as `approxes`.
        weights : indexed container of float or None
            Optional per-object weights, same length as `approxes`.

        Returns
        -------
        list of (float, float)
            One `(der1, der2)` pair per object, multiplied by the object's
            weight when weights are given.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            # Derivatives of -(target - approx)^2 / 2 with respect to approx.
            der1 = targets[index] - approxes[index]
            der2 = -1

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result

In [None]:
# Regressor with a fixed seed for repeatable runs and training logs silenced.
model = CatBoostRegressor(
    random_seed=42,
    logging_level='Silent'
)

In [None]:
# Cross-validate with the same parameters the model was constructed with.
cv_params = model.get_params()
cv_data = cv(
    # Categorical columns are identified by their position in X_train.
    Pool(X_train, y_train_log1p, cat_features=categorical_features_indices),
    cv_params,
    # NOTE(review): plot=True presumably needs notebook widget support - confirm.
    plot=True
)

In [None]:
# Fit on the full training set against the log1p-transformed revenue.
model.fit(
    X_train, y_train_log1p,
    cat_features=categorical_features_indices,
    plot=True
)

In [None]:
# The model predicts log1p(revenue); expm1 maps the predictions back to revenue.
predictions = np.expm1(model.predict(X_test))

In [None]:
# Timestamp used in the submission file name; only read by the commented-out write below.
name = datetime.now().strftime("%Y%m%d%H%M%S")
# Start the submission from the test index, then attach the predictions by position.
submission = pd.DataFrame(X_test.index)
submission['revenue'] = predictions
# Writing the file is currently disabled.
#submission.to_csv('submissions/' + name + '-CatBoost-logRevenue.csv', index=False)