In [1]:
import pandas as pd
import numpy as np
from dateutil import parser

from common.features import extract_has_top_keyword, has_top_keyword, getTimeFeatures, getBinaryFeatures
import lightgbm as lgb
from sklearn.model_selection import train_test_split

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
data_orig = pd.read_csv('./data/train.csv')
data = data_orig.copy()

In [4]:
data['log_budget'] = np.log1p(data.budget)
data['log_popularity'] = np.log1p(data.popularity)

has_top_keyword_df, top_keywords = extract_has_top_keyword(data_orig)
data = data.merge(has_top_keyword_df, on=['id'], how='left')

In [5]:
age_df = getTimeFeatures(data_orig)
data = data.merge(age_df, on=['id'], how='left')

bin_features_df = getBinaryFeatures(data_orig)
data = data.merge(bin_features_df, on=['id'], how='left')

  year[year>2020] = year[year>2020]-100
  self._update_inplace(new_data)


In [8]:
features = [
    'log_budget', 'log_popularity', 'runtime', 'has_top_keyword',
    'year', 'day', 'age',
    'hashomepage', 'isinCollection', 'zeroBudget'
]
target = 'revenue'

In [9]:
data[features].head(3)

Unnamed: 0,log_budget,log_popularity,runtime,has_top_keyword,year,day,age,hashomepage,isinCollection,zeroBudget
0,16.455,2.025,93.0,True,2015,4,2,False,True,False
1,17.504,2.225,113.0,False,2004,4,13,False,True,False
2,15.009,4.179,105.0,False,2014,4,3,True,False,False


---
## Model Training

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

In [11]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, 
                             objective='regression', 
                             metric='rmse',
                             max_depth = 4,
                             num_leaves=30, 
                             min_child_samples=100,
                             learning_rate=0.01,
                             boosting = 'gbdt',
                             min_data_in_leaf= 10,
                             feature_fraction = 0.9,
                             bagging_freq = 1,
                             bagging_fraction = 0.9,
                             importance_type='gain',
                             lambda_l1 = 0.2,
                             subsample=.8, 
                             colsample_bytree=.9,
                             use_best_model=True)

fit_params={"early_stopping_rounds": 1000, "verbose": 100, "eval_metric": "rmse"}
lgbmodel.fit(X_train, np.log1p(y_train), eval_set=[(X_test, np.log1p(y_test))], **fit_params)

Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's rmse: 2.27067
[200]	valid_0's rmse: 2.12339
[300]	valid_0's rmse: 2.09024
[400]	valid_0's rmse: 2.07951
[500]	valid_0's rmse: 2.07117
[600]	valid_0's rmse: 2.05989
[700]	valid_0's rmse: 2.05361
[800]	valid_0's rmse: 2.04995
[900]	valid_0's rmse: 2.04996
[1000]	valid_0's rmse: 2.0493
Did not meet early stopping. Best iteration is:
[981]	valid_0's rmse: 2.04854


LGBMRegressor(bagging_fraction=0.9, bagging_freq=1, boosting='gbdt',
       boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       feature_fraction=0.9, importance_type='gain', lambda_l1=0.2,
       learning_rate=0.01, max_depth=4, metric='rmse',
       min_child_samples=100, min_child_weight=0.001, min_data_in_leaf=10,
       min_split_gain=0.0, n_estimators=1000, n_jobs=-1, num_leaves=30,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=0, use_best_model=True)

------
## Prediction and Submission

In [16]:
test_data = pd.read_csv('./data/test.csv')

In [17]:
test_data['log_budget'] = np.log1p(test_data.budget)
test_data['log_popularity'] = np.log1p(test_data.popularity)
test_data['has_top_keyword'] = test_data["Keywords"].apply(has_top_keyword, args=(top_keywords,))

age_df = getTimeFeatures(test_data)
test_data = test_data.merge(age_df, on=['id'], how='left')

bin_features_df = getBinaryFeatures(test_data)
test_data = test_data.merge(bin_features_df, on=['id'], how='left')

  year[year>2020] = year[year>2020]-100


In [18]:
test_features = test_data[features]
predictions = lgbmodel.predict(test_features)
test_data['revenue'] = np.expm1(predictions)
test_data[['id', 'revenue']].head()

Unnamed: 0,id,revenue
0,3001,787916.444
1,3002,4408762.661
2,3003,3538434.073
3,3004,5916512.442
4,3005,2473452.963
