In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import lightgbm as lgb
import pickle

In [50]:
train_df = pd.read_csv('./Train.csv')
test_df = pd.read_csv('./Test.csv')

In [15]:
if 'Place_ID' in train_df.columns and 'Date' in train_df.columns:
    train_df = train_df.drop(['Place_ID', 'Date'], axis=1)

In [16]:
if 'Place_ID X Date' in train_df.columns:
    train_df[['Place_ID', 'Date']] = train_df['Place_ID X Date'].str.split(' X ', expand=True)
    idx = train_df.columns.get_loc('Place_ID X Date')
    train_df = train_df.drop('Place_ID X Date', axis=1)
    train_df.insert(idx, 'Date', train_df.pop('Date'))
    train_df.insert(idx, 'Place_ID', train_df.pop('Place_ID'))
    train_df['Date'] = pd.to_datetime(train_df['Date'])
    train_df['Date'] = train_df['Date'].dt.strftime('%m/%d/%y')

In [17]:
place_groups = train_df.groupby('Place_ID')
numeric_cols = train_df.select_dtypes(include=np.number).columns
train_df[numeric_cols] = place_groups[numeric_cols].transform(lambda x: x.fillna(x.median()))
train_df = train_df.fillna(method='ffill')

In [18]:
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['year'] = train_df['Date'].dt.year
train_df['month'] = train_df['Date'].dt.month
train_df['week'] = train_df['Date'].dt.week
train_df['day'] = train_df['Date'].dt.day

  train_df['week'] = train_df['Date'].dt.week


In [19]:
train_df['total_days_month'] = train_df['month'].apply(lambda x: 31 if x==1 else (28+31 if x==2 else (28+31+31 if x==1 else 28+30+31+31))) 
train_df['total_days'] = train_df['total_days_month'] + train_df['day']

In [20]:
X_train = train_df.drop(['Place_ID', 'Date', 'target', 'target_min', 'target_max', 'target_variance', 'target_count'], axis=1)
y_train = train_df['target']

In [61]:
X_trains, X_tests, y_trains, y_tests = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [99]:
params = {
    'objective' :'regression',
    'learning_rate' : 0.03,
    'num_iterations': 50000,
    'max_bins': 50, 
    'max_depth' :7 ,
    'num_leaves' : 70,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'gbdt',
    'metric': 'rmse' ,
     'min_data_in_leaf':5,
    'reg_lambda' :100
}



In [100]:
# lgb_trains = lgb.Dataset(X_trains, y_trains, silent=False,categorical_feature=['year','month','day','week'])
# lgb_evals = lgb.Dataset(X_tests, y_tests, silent=False,categorical_feature=['year','month','day','week'])
lgb_trains = lgb.Dataset(X_train, y_train, silent=False,categorical_feature=['year','month','day','week'])
lgb_evals = lgb.Dataset(X_train, y_train, silent=False,categorical_feature=['year','month','day','week'])
model = lgb.train(params, train_set = lgb_trains, num_boost_round=10000,verbose_eval=500, valid_sets=[lgb_trains,lgb_evals])




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3806
[LightGBM] [Info] Number of data points in the train set: 30557, number of used features: 79
[LightGBM] [Info] Start training from score 61.148045




[500]	training's rmse: 18.4875	valid_1's rmse: 18.4875
[1000]	training's rmse: 13.5458	valid_1's rmse: 13.5458
[1500]	training's rmse: 10.409	valid_1's rmse: 10.409
[2000]	training's rmse: 8.19657	valid_1's rmse: 8.19657
[2500]	training's rmse: 6.57529	valid_1's rmse: 6.57529
[3000]	training's rmse: 5.34267	valid_1's rmse: 5.34267
[3500]	training's rmse: 4.36959	valid_1's rmse: 4.36959
[4000]	training's rmse: 3.59885	valid_1's rmse: 3.59885
[4500]	training's rmse: 2.98342	valid_1's rmse: 2.98342
[5000]	training's rmse: 2.49131	valid_1's rmse: 2.49131
[5500]	training's rmse: 2.08712	valid_1's rmse: 2.08712
[6000]	training's rmse: 1.7552	valid_1's rmse: 1.7552
[6500]	training's rmse: 1.47973	valid_1's rmse: 1.47973
[7000]	training's rmse: 1.25305	valid_1's rmse: 1.25305
[7500]	training's rmse: 1.0639	valid_1's rmse: 1.0639
[8000]	training's rmse: 0.906527	valid_1's rmse: 0.906527
[8500]	training's rmse: 0.774012	valid_1's rmse: 0.774012
[9000]	training's rmse: 0.662363	valid_1's rmse: 0.

In [101]:
place_groups = test_df.groupby('Place_ID')
numeric_cols = test_df.select_dtypes(include=np.number).columns
test_df[numeric_cols] = place_groups[numeric_cols].transform(lambda x: x.fillna(x.median()))
test_df = test_df.fillna(method='ffill')

In [102]:
if 'Place_ID' in test_df.columns and 'Date' in test_df.columns:
    test_df = test_df.drop(['Place_ID', 'Date'], axis=1)
if 'Place_ID X Date' in test_df.columns:
    test_df[['Place_ID', 'Date']] = test_df['Place_ID X Date'].str.split(' X ', expand=True)
    idx = test_df.columns.get_loc('Place_ID X Date')
    test_df.insert(idx, 'Date', test_df.pop('Date'))
    test_df.insert(idx, 'Place_ID', test_df.pop('Place_ID'))
    test_df['Date'] = pd.to_datetime(test_df['Date'])
    test_df['Date'] = test_df['Date'].dt.strftime('%m/%d/%y')

In [103]:
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df['year'] = test_df['Date'].dt.year
test_df['month'] = test_df['Date'].dt.month
test_df['week'] = test_df['Date'].dt.week
test_df['day'] = test_df['Date'].dt.day

  test_df['week'] = test_df['Date'].dt.week


In [104]:
test_df['total_days_month'] = test_df['month'].apply(lambda x: 31 if x==1 else (28+31 if x==2 else (28+31+31 if x==1 else 28+30+31+31))) 
test_df['total_days'] = test_df['total_days_month'] + test_df['day']

In [105]:
X_test = test_df.drop(['Place_ID X Date','Place_ID', 'Date'], axis=1)

In [106]:
test_pred = model.predict(X_test)

In [107]:
output_df = pd.DataFrame({
    'Place_ID X Date': test_df['Place_ID X Date'],
    'target': test_pred
})
output_df.to_csv('predictions.csv', index=False)