In [29]:
import pandas as pd
import numpy as np
from common.features import extract_has_top_keyword, has_top_keyword
import lightgbm as lgb
from sklearn.model_selection import train_test_split

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [13]:
orig_data = pd.read_csv('./data/train.csv')
data = orig_data.copy()

In [14]:
data['log_budget'] = np.log1p(data.budget)
data['log_popularity'] = np.log1p(data.popularity)

has_top_keyword_df, top_keywords = extract_has_top_keyword(orig_data)
data = data.merge(has_top_keyword_df, on=['id'], how='left')

In [15]:
features = [
    'log_budget', 'log_popularity', 'runtime', 'has_top_keyword'
]
target = 'revenue'

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

In [17]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, 
                             objective='regression', 
                             metric='rmse',
                             max_depth = 4,
                             num_leaves=30, 
                             min_child_samples=100,
                             learning_rate=0.01,
                             boosting = 'gbdt',
                             min_data_in_leaf= 10,
                             feature_fraction = 0.9,
                             bagging_freq = 1,
                             bagging_fraction = 0.9,
                             importance_type='gain',
                             lambda_l1 = 0.2,
                             subsample=.8, 
                             colsample_bytree=.9,
                             use_best_model=True)

fit_params={"early_stopping_rounds": 1000, "verbose": 100, "eval_metric": "rmse"}
lgbmodel.fit(X_train, np.log1p(y_train), eval_set=[(X_test, np.log1p(y_test))], **fit_params)

Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's rmse: 2.3258
[200]	valid_0's rmse: 2.17768
[300]	valid_0's rmse: 2.14791
[400]	valid_0's rmse: 2.13551
[500]	valid_0's rmse: 2.13296
[600]	valid_0's rmse: 2.13401
[700]	valid_0's rmse: 2.13487
[800]	valid_0's rmse: 2.135
[900]	valid_0's rmse: 2.13572
[1000]	valid_0's rmse: 2.13697
Did not meet early stopping. Best iteration is:
[420]	valid_0's rmse: 2.13234


LGBMRegressor(bagging_fraction=0.9, bagging_freq=1, boosting='gbdt',
       boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       feature_fraction=0.9, importance_type='gain', lambda_l1=0.2,
       learning_rate=0.01, max_depth=4, metric='rmse',
       min_child_samples=100, min_child_weight=0.001, min_data_in_leaf=10,
       min_split_gain=0.0, n_estimators=1000, n_jobs=-1, num_leaves=30,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=0, use_best_model=True)

------

In [19]:
test_data = pd.read_csv('./data/test.csv')

In [20]:
test_data['log_budget'] = np.log1p(test_data.budget)
test_data['log_popularity'] = np.log1p(test_data.popularity)
test_data['has_top_keyword'] = test_data["Keywords"].apply(has_top_keyword, args=(top_keywords,))


In [30]:
test_features = test_data[features]
predictions = lgbmodel.predict(test_features)
test_data['revenue'] = np.expm1(predictions)
test_data[['id', 'revenue']]

Unnamed: 0,id,revenue
0,3001,490417.243
1,3002,1927747.897
2,3003,2900124.437
3,3004,13400322.229
4,3005,2233082.624
5,3006,3705631.853
6,3007,1491595.232
7,3008,40213281.415
8,3009,28362216.871
9,3010,214772559.578
