In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import time
from datetime import timedelta, date
from fastai.tabular import *

%matplotlib inline

In [2]:
train_df = pd.read_csv('train.csv')
test_df =  pd.read_csv('test.csv')

In [3]:
train = pd.DataFrame(train_df.groupby(['application_date', 'segment'])['case_count'].sum())
train.reset_index(inplace=True)

In [4]:
add_datepart(train, 'application_date', drop = False)
add_cyclic_datepart(train, 'application_date')

add_datepart(test_df, 'application_date', drop = False)
add_cyclic_datepart(test_df, 'application_date')

Unnamed: 0,id,segment,application_Year,application_Month,application_Week,application_Day,application_Dayofweek,application_Dayofyear,application_Is_month_end,application_Is_month_start,...,application_Is_year_start,application_Elapsed,application_weekday_cos,application_weekday_sin,application_day_month_cos,application_day_month_sin,application_month_year_cos,application_month_year_sin,application_day_year_cos,application_day_year_sin
0,1,1,2019,7,27,6,5,187,False,False,...,False,1562371200,-0.222521,-0.974928,0.528964,0.848644,-1.000000e+00,1.224647e-16,-0.998186,-0.060213
1,2,1,2019,7,27,7,6,188,False,False,...,False,1562457600,0.623490,-0.781831,0.347305,0.937752,-1.000000e+00,1.224647e-16,-0.997001,-0.077386
2,3,1,2019,7,28,8,0,189,False,False,...,False,1562544000,1.000000,0.000000,0.151428,0.988468,-1.000000e+00,1.224647e-16,-0.995521,-0.094537
3,4,1,2019,7,28,9,1,190,False,False,...,False,1562630400,0.623490,0.781831,-0.050649,0.998717,-1.000000e+00,1.224647e-16,-0.993747,-0.111659
4,5,1,2019,7,28,10,2,191,False,False,...,False,1562716800,-0.222521,0.974928,-0.250653,0.968077,-1.000000e+00,1.224647e-16,-0.991677,-0.128748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,176,2,2019,10,42,20,6,293,False,False,...,False,1571529600,0.623490,-0.781831,-0.758758,-0.651372,-1.836970e-16,-1.000000e+00,0.309017,-0.951057
176,177,2,2019,10,43,21,0,294,False,False,...,False,1571616000,1.000000,0.000000,-0.612106,-0.790776,-1.836970e-16,-1.000000e+00,0.325342,-0.945596
177,178,2,2019,10,43,22,1,295,False,False,...,False,1571702400,0.623490,0.781831,-0.440394,-0.897805,-1.836970e-16,-1.000000e+00,0.341571,-0.939856
178,179,2,2019,10,43,23,2,296,False,False,...,False,1571788800,-0.222521,0.974928,-0.250653,-0.968077,-1.836970e-16,-1.000000e+00,0.357698,-0.933837


In [5]:
train_x = train.drop(['case_count'],axis = 1)
train_y = train['case_count']

In [6]:
from sklearn.preprocessing import LabelEncoder
cat_cols = ['application_Year', 'application_Month', 'application_Week',
       'application_Day', 'application_Dayofweek', 'application_Dayofyear',
       'application_Is_month_end', 'application_Is_month_start',
       'application_Is_quarter_end', 'application_Is_quarter_start',
       'application_Is_year_end', 'application_Is_year_start']
le = LabelEncoder()
for col in cat_cols:
    train_x[col] = train_x[col].astype(str)
    test_df[col] = test_df[col].astype(str)
    le.fit(train_x[col])
    train_x[col] = le.transform(train_x[col])
    test_df[col] = le.transform(test_df[col])
    

In [7]:
train_x_early, valid_x_early, train_y_early, valid_y_early = train_test_split(train_x,train_y,test_size = 0.2, 
                                                                              random_state = 3911, 
                                                                             stratify = train_x['segment'])

In [9]:
from lightgbm import LGBMRegressor

gbm=LGBMRegressor(
        n_estimators=50,
        learning_rate=0.08,
        num_leaves=21,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=20,
        categorical_features = cat_cols
)

gbm.fit(train_x_early, train_y_early)

LGBMRegressor(boosting_type='gbdt',
              categorical_features=['application_Year', 'application_Month',
                                    'application_Week', 'application_Day',
                                    'application_Dayofweek',
                                    'application_Dayofyear',
                                    'application_Is_month_end',
                                    'application_Is_month_start',
                                    'application_Is_quarter_end',
                                    'application_Is_quarter_start',
                                    'application_Is_year_end',
                                    'application_Is_year_start'],
              class_weight=None, colsample_bytree=0.9497036,
              importance_type='split', learning_rate=0.08, max_depth=8,
              min_child_samples=20, min_child_weight=20,
              min_split_gain=0.0222415, n_estimators=50, n_jobs=-1,
              num_leaves=21, objective=

In [10]:
train_pred_early = gbm.predict(train_x_early)
train_pred_valid = gbm.predict(valid_x_early)

In [11]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print('Train mape: {}'.format(mean_absolute_percentage_error(train_y_early,train_pred_early)))
print('Valid mape: {}'.format(mean_absolute_percentage_error(valid_y_early,train_pred_valid)))

Train mape: 333.4102640486241
Valid mape: 750.8713870561056


In [13]:
test = test_df.drop(['id'], axis = 1)
test_pred = gbm.predict(test)

In [14]:
sub = pd.read_csv('sample_submission.csv')
sub['case_count'] = test_pred
sub.to_csv('gbm1.csv',index = False)

In [15]:
gbm.feature_importances_

array([ 45,  16,  35,  46,  65, 100,  61,  35,   0,   0,   0,   0,   0, 226,  26,  16, 102, 112,   4,  10,  32,  68])

In [None]:
train_x.columns