In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error


In [16]:
# Read the training table and the three pre-computed feature tables. Each
# feature file carries an 'Unnamed: 0' column (presumably a saved index --
# TODO confirm) that is discarded on load.
data = pd.read_csv('../data/train.csv')
flo_genre = pd.read_csv('../data/flo_genre_feature.csv').drop(columns='Unnamed: 0')
flo_age = pd.read_csv('../data/flo_age_feature.csv').drop(columns='Unnamed: 0')
marcel_keywords = pd.read_csv('../data/marcel_features.csv').drop(columns='Unnamed: 0')

# Replace raw popularity with log(1 + popularity).
data['popularity'] = np.log1p(data['popularity'])

# Left-join each feature table onto the training rows via `id`, so every
# training row is kept even when a feature table has no entry for it.
data = (
    data
    .merge(flo_genre, on='id', how='left')
    .merge(marcel_keywords, on='id', how='left')
    .merge(flo_age, on='id', how='left')
)

# Impute missing runtimes with the column mean (NaNs are skipped by the mean).
data['runtime'] = data['runtime'].fillna(data['runtime'].mean())

In [17]:
# Baseline feature set: every column of `flo_genre` except the `id` join key,
# plus the log-transformed popularity, the budget and the top-keyword flag.
features = [col for col in flo_genre.columns if col != 'id']
features += ['popularity', 'budget', 'has_top_keyword']

features_df = data[features]
target = data.revenue

X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.33, random_state=42)

# The model is fitted on log1p(revenue), so predictions come back in log space.
reg = LinearRegression()
reg.fit(X_train, np.log1p(y_train))

# expm1 is the exact inverse of log1p (exp(x) - 1 loses precision near zero).
# A negative log-space prediction maps to a negative revenue, which
# mean_squared_log_error rejects, so predictions are floored at 0.
y_pred = np.clip(np.expm1(reg.predict(X_test)), 0, None)

# Root mean squared log error on the held-out split.
np.sqrt(mean_squared_log_error(y_test, y_pred))

2.209947521255649

In [4]:
# Candidate feature set: the baseline `features` plus the (mean-imputed) runtime.
features2 = features + ['runtime']

features_df = data[features2]
target = data.revenue
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.33, random_state=42)

# The model is fitted on log1p(revenue), so predictions come back in log space.
reg = LinearRegression()
reg.fit(X_train, np.log1p(y_train))

# expm1 is the exact inverse of log1p (exp(x) - 1 loses precision near zero).
# A negative log-space prediction maps to a negative revenue, which
# mean_squared_log_error rejects, so predictions are floored at 0.
y_pred = np.clip(np.expm1(reg.predict(X_test)), 0, None)

# Root mean squared log error on the held-out split.
np.sqrt(mean_squared_log_error(y_test, y_pred))

2.2130484152059466

In [5]:
# Candidate feature set: the baseline `features` plus the `age` column that
# was merged in from the flo_age feature table.
features3 = features + ['age']

features_df = data[features3]
target = data.revenue
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.33, random_state=42)

# The model is fitted on log1p(revenue), so predictions come back in log space.
reg = LinearRegression()
reg.fit(X_train, np.log1p(y_train))

# expm1 is the exact inverse of log1p (exp(x) - 1 loses precision near zero).
# A negative log-space prediction maps to a negative revenue, which
# mean_squared_log_error rejects, so predictions are floored at 0.
y_pred = np.clip(np.expm1(reg.predict(X_test)), 0, None)

# Root mean squared log error on the held-out split.
np.sqrt(mean_squared_log_error(y_test, y_pred))

2.2278545800207255

In [6]:
# Preview the first five rows of the merged table.
data.head(n=5)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,Fantasy,Romance,History,Crime,War,TV Movie,Thriller,Foreign,has_top_keyword,age
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",2.024905,...,False,False,False,False,False,False,False,False,True,4.063014
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,2.224504,...,False,True,False,False,False,False,False,False,False,14.610959
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",4.178992,...,False,False,False,False,False,False,False,False,False,4.427397
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,1.429099,...,False,False,False,False,False,False,True,False,False,7.016438
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,0.76457,...,False,False,False,False,False,False,True,False,False,10.106849


In [10]:
# The preview below shows an `original_title_length` column that no cell in
# the notebook creates, so it is derived here to keep the notebook runnable
# from a fresh kernel. The character count of `original_title` matches the
# displayed values (e.g. 'Whiplash' -> 8, 'Kahaani' -> 7).
data['original_title_length'] = data['original_title'].str.len()

data.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,Romance,History,Crime,War,TV Movie,Thriller,Foreign,has_top_keyword,age,original_title_length
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",2.024905,...,False,False,False,False,False,False,False,True,4.063014,22
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,2.224504,...,True,False,False,False,False,False,False,False,14.610959,40
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",4.178992,...,False,False,False,False,False,False,False,False,4.427397,8
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,1.429099,...,False,False,False,False,False,True,False,False,7.016438,7
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,0.76457,...,False,False,False,False,False,True,False,False,10.106849,4


In [26]:
# `original_title_length` is not created by any visible cell; derive it as the
# character count of `original_title` when it is missing, so that this cell
# does not raise a KeyError on a fresh kernel.
if 'original_title_length' not in data.columns:
    data['original_title_length'] = data['original_title'].str.len()

# Candidate feature set: the baseline `features` plus the title length.
features4 = features + ['original_title_length']

features_df = data[features4]
target = data.revenue
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.33, random_state=42)

# The model is fitted on log1p(revenue), so predictions come back in log space.
reg = LinearRegression()
reg.fit(X_train, np.log1p(y_train))

# expm1 is the exact inverse of log1p (exp(x) - 1 loses precision near zero).
# A negative log-space prediction maps to a negative revenue, which
# mean_squared_log_error rejects, so predictions are floored at 0.
y_pred = np.clip(np.expm1(reg.predict(X_test)), 0, None)

# Root mean squared log error on the held-out split.
np.sqrt(mean_squared_log_error(y_test, y_pred))

2.210585626253336