In [1]:
import pandas as pd
import numpy as np
from dateutil import parser

from common.features import extract_has_top_keyword, has_top_keyword, getTimeFeatures, getBinaryFeatures
import lightgbm as lgb
from sklearn.model_selection import train_test_split

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
data_orig = pd.read_csv('./data/train.csv')
data = data_orig.copy()

In [3]:
# Date parsing
data[['release_date']] = data_orig['release_date'].map(parser.parse)
data['release_year'] = pd.DatetimeIndex(data_orig['release_date']).year
data['release_month'] = pd.DatetimeIndex(data_orig['release_date']).month_name()
data['release_day'] = pd.DatetimeIndex(data_orig['release_date']).day_name()

# date parser interprets 21 to 68 as 2021 to 2068 --> fix
data.loc[data.release_year >= 2021, 'release_year'] = data.loc[data.release_year >= 2021, 'release_year'] - 100

In [4]:
data['log_budget'] = np.log1p(data.budget)
data['log_popularity'] = np.log1p(data.popularity)

has_top_keyword_df, top_keywords = extract_has_top_keyword(data_orig)
data = data.merge(has_top_keyword_df, on=['id'], how='left')

In [5]:
age_df = getTimeFeatures(data_orig)
data = data.merge(age_df, on=['id'], how='left')

bin_features_df = getBinaryFeatures(data_orig)
data = data.merge(bin_features_df, on=['id'], how='left')

  year[year>2020] = year[year>2020]-100
  self._update_inplace(new_data)


In [6]:
features = [
    'log_budget', 'log_popularity', 'runtime', 'has_top_keyword',
    'release_year', 'release_day', 'age',
    'hashomepage', 'isinCollection', 'zeroBudget'
]
target = 'revenue'

In [7]:
data[features].head(3)

Unnamed: 0,log_budget,log_popularity,runtime,has_top_keyword,release_year,release_day,age,hashomepage,isinCollection,zeroBudget
0,16.455,2.025,93.0,True,2015,Friday,2,False,True,False
1,17.504,2.225,113.0,False,2004,Friday,13,False,True,False
2,15.009,4.179,105.0,False,2014,Friday,3,True,False,False


---
## Model Training

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

In [18]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, 
                             objective='regression', 
                             metric='rmse',
                             max_depth = 4,
                             num_leaves=30, 
                             min_child_samples=100,
                             learning_rate=0.01,
                             boosting = 'gbdt',
                             min_data_in_leaf= 10,
                             feature_fraction = 0.9,
                             bagging_freq = 1,
                             bagging_fraction = 0.9,
                             importance_type='gain',
                             lambda_l1 = 0.2,
                             subsample=.8, 
                             colsample_bytree=.9,
                             use_best_model=True)

fit_params={"early_stopping_rounds": 1000, "verbose": 100, "eval_metric": "rmse"}
lgbmodel.fit(X_train, np.log1p(y_train), eval_set=[(X_test, np.log1p(y_test))], **fit_params)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields release_day

------
## Prediction and Submission

In [19]:
test_data = pd.read_csv('./data/test.csv')

In [20]:
test_data['log_budget'] = np.log1p(test_data.budget)
test_data['log_popularity'] = np.log1p(test_data.popularity)
test_data['has_top_keyword'] = test_data["Keywords"].apply(has_top_keyword, args=(top_keywords,))


In [30]:
test_features = test_data[features]
predictions = lgbmodel.predict(test_features)
test_data['revenue'] = np.expm1(predictions)
test_data[['id', 'revenue']].head()

Unnamed: 0,id,revenue
0,3001,490417.243
1,3002,1927747.897
2,3003,2900124.437
3,3004,13400322.229
4,3005,2233082.624
5,3006,3705631.853
6,3007,1491595.232
7,3008,40213281.415
8,3009,28362216.871
9,3010,214772559.578
