In [1]:
## Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import warnings

In [2]:
sns.set(style = 'darkgrid', context = 'notebook', palette = 'deep')
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
## Read Data
# Both CSVs are indexed by `id`. The target column `output_gen` is split off
# the training frame so that X_train_full holds features only.
X_train_full = pd.read_csv('train.csv', index_col='id')
y_train_full = X_train_full.pop('output_gen')

X_test = pd.read_csv('test.csv', index_col='id')
test_ids = X_test.index  # kept for building the submission file later

X_train_full.head()

Unnamed: 0_level_0,obs_day,obs_hour,obs_minute,C_motion,fw_motion,faucet_hole,vap_pressure,vap_enth,vap_pressure_div,vap_motion,fw_enth,vap_temp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
a563699ca2a601c6ac64aa29986a00a90fb42b48741695b0526a286d504d17ca,Saturday,5.0,46.0,361.486145,2542.801078,0.666879,24.138243,2788.168701,25.448248,2543.177002,1270.06604,603.834106
91ab3eb3bcf6c8c1c5fe2da9ba671aa5a48c7369d9a50f32e1ddd735472b4b3c,Saturday,18.0,,197.064667,1454.458144,,12.702796,,13.405114,1454.421021,1095.314453,500.178772
7128c51c554735d6c81862684ad6005ae12d2edbcd464487a7217fc72c03ba22,Saturday,3.0,51.0,356.869232,2458.67169,0.678685,,2784.910889,,2456.578369,1262.656982,603.529663
c8144b52e4f63014de0a0d8e1c629bf0b05cb2696cfc23291b4f48e6491c4cb5,Saturday,13.0,,239.267517,1628.127295,0.679137,15.378051,2881.876709,16.252741,1626.861328,1148.203857,593.649658
88d15a5b2df6692f23d105ff1ae82ae026be00c9271eef33e0aea97fd2110cb6,Friday,11.0,13.0,,2621.829401,0.676403,24.517698,2787.063232,,2621.653564,1280.567383,602.601501


In [4]:
# Encode the weekday label as an integer code (Saturday -> 0, Friday -> 1).
# Missing values and any label outside the mapping come out as NaN.
# The dtype guard keeps this cell idempotent: once the column is numeric,
# mapping it through the string dictionary again would turn every value into NaN.
day_codes = {'Saturday' : 0, 'Friday' : 1}
for frame in (X_train_full, X_test):
    if not pd.api.types.is_numeric_dtype(frame['obs_day']):
        frame['obs_day'] = frame['obs_day'].map(day_codes)

In [5]:
# Print the column dtypes and non-null counts of the training features.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72000 entries, a563699ca2a601c6ac64aa29986a00a90fb42b48741695b0526a286d504d17ca to ec8aad6fe010eb3aba624a82afbce763c6c2cc31ba45069195ebbc245674d5e2
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   obs_day           64163 non-null  float64
 1   obs_hour          64030 non-null  float64
 2   obs_minute        64148 non-null  float64
 3   C_motion          64089 non-null  float64
 4   fw_motion         64088 non-null  float64
 5   faucet_hole       64085 non-null  float64
 6   vap_pressure      64069 non-null  float64
 7   vap_enth          64179 non-null  float64
 8   vap_pressure_div  64197 non-null  float64
 9   vap_motion        64108 non-null  float64
 10  fw_enth           64157 non-null  float64
 11  vap_temp          64112 non-null  float64
dtypes: float64(12)
memory usage: 7.1+ MB


In [6]:
# Number of missing values per column in the training features.
X_train_full.isna().sum()

obs_day             7837
obs_hour            7970
obs_minute          7852
C_motion            7911
fw_motion           7912
faucet_hole         7915
vap_pressure        7931
vap_enth            7821
vap_pressure_div    7803
vap_motion          7892
fw_enth             7843
vap_temp            7888
dtype: int64

In [7]:
# Number of missing values per column in the test features.
X_test.isna().sum()

obs_day             5259
obs_hour            5119
obs_minute          5248
C_motion            5321
fw_motion           5313
faucet_hole         5294
vap_pressure        5346
vap_enth            5348
vap_pressure_div    5316
vap_motion          5379
fw_enth             5337
vap_temp            5191
dtype: int64

In [8]:
# Hold out 10% of the training rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.1,
    random_state=42,
)

In [None]:
## Imputation
# The first column (obs_day) is the encoded categorical feature; every other
# column is treated as numeric.
cat_cols = X_train_full.columns[:1]
num_cols = X_train_full.columns[1:]

imp_num = IterativeImputer(estimator=RandomForestRegressor(),
                           initial_strategy='mean',
                           max_iter=10, random_state=0)
imp_cat = IterativeImputer(estimator=RandomForestClassifier(),
                           initial_strategy='most_frequent',
                           max_iter=10, random_state=0)

# Each imputer is fitted once, on the training split only, and on its own
# column group. Reusing imp_num for cat_cols would refit it on a different
# column set (so later transforms of num_cols would no longer match the fitted
# features) and would fill obs_day with a mean instead of an existing code.
# NOTE(review): cat_cols holds a single column, so imp_cat has no other
# features to model from and presumably just applies its initial
# 'most_frequent' fill; confirm against the installed scikit-learn version.
X_train[num_cols] = imp_num.fit_transform(X_train[num_cols])
X_train[cat_cols] = imp_cat.fit_transform(X_train[cat_cols])

# Validation and test data are only transformed, never fitted on.
X_val[num_cols]  = imp_num.transform(X_val[num_cols])
X_val[cat_cols]  = imp_cat.transform(X_val[cat_cols])
X_test[num_cols] = imp_num.transform(X_test[num_cols])
X_test[cat_cols] = imp_cat.transform(X_test[cat_cols])

In [None]:
# LightGBM regressor trained on the imputed training split, with the
# validation split used for evaluation and early stopping.
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    "num_iterations": 100000  # upper bound; early stopping normally ends training sooner
}
model = lgb.LGBMRegressor(**hyper_params)
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of fit() was removed in LightGBM 4.0, whereas the callback form is
# accepted by older releases as well.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=1000)],
)

In [None]:
## Validation MAE
## Best Mae : 0.835
# Predict with the iteration chosen by early stopping, then report the
# mean absolute error against the held-out targets.
preds = model.predict(
    X_val,
    num_iteration=model.best_iteration_,
)
val_mae = mean_absolute_error(y_val, preds)
print(val_mae)

In [None]:
## Final Predictions
# Score the test features and pair every prediction with its id, then write
# the result as the submission CSV (without the DataFrame index).
preds_test = model.predict(X_test)
output = pd.DataFrame(
    {
        'id': test_ids,
        'output_gen': preds_test,
    }
)
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
output.head()