# IMPORTS

In [15]:
import os
import gc
import random
import pandas as pd
import numpy as np
import lightgbm as lgb

from tqdm.notebook import tqdm
from tqdm import tqdm_notebook

from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter('ignore')

# **Load - Process**

In [10]:
class Process :
  """Feature engineering applied to the train and test frames together.

  ``process`` stacks both frames, derives calendar features from the
  'Datetime' column, label-encodes two combined calendar keys, adds
  neighbouring-row features for both PM2.5 sensors, and finally splits the
  result back into train and test by 'ID'.
  """

  def LAG(self,data,LagFeature,shift=1,NewFeatures=()) :
    """Add a shifted copy of a column and its difference to the current row.

    Parameters
    ----------
    data : DataFrame, modified in place.
    LagFeature : name of the column to shift.
    shift : offset passed to ``Series.shift``; a positive value reads from
      earlier rows, a negative value from later rows.
    NewFeatures : two column names, ``[difference_name, shifted_name]``.

    Raises
    ------
    IndexError : if ``NewFeatures`` holds fewer than two names.
    """
    shifted = data[LagFeature].shift(shift)
    data[NewFeatures[0]] = data[LagFeature] - shifted
    data[NewFeatures[1]] = shifted

  def LE(self,data,LE_cols=()) :
    """Replace each column in ``LE_cols``, in place, by ``LabelEncoder`` codes."""
    encoder = LabelEncoder()
    for le_col in LE_cols :
      data[le_col] = encoder.fit_transform(data[le_col])

  def _week_of_year(self,stamps) :
    """Return the ISO week number of a datetime Series.

    ``Series.dt.weekofyear`` no longer exists in pandas 2.x, so
    ``dt.isocalendar().week`` is used whenever it is available. Its nullable
    UInt32 result is cast to int64 (or float64 when values are missing) so the
    column keeps a plain numpy dtype.
    """
    accessor = stamps.dt
    if not hasattr(accessor,'isocalendar') :
      # Older pandas without isocalendar(): the legacy attribute still exists.
      return accessor.weekofyear
    week = accessor.isocalendar().week
    return week.astype('float64' if week.isna().any() else 'int64')

  def process(self,train,test) :
    """Build the feature set on train+test and return the two parts.

    Parameters
    ----------
    train, test : DataFrames read below through the columns 'ID', 'Datetime',
      'Sensor1_PM2.5', 'Sensor2_PM2.5' and 'Offset_fault'.

    Returns
    -------
    (train, test) : new frames ordered by the 'SplitBy' string. ``train``
      holds the rows whose 'ID' occurs in the input ``train`` (with
      'Offset_fault' cast to int); ``test`` holds all remaining rows.
    """
    # Remember which IDs are training rows before `train` is rebound below.
    train_ids = train['ID'].values
    data = pd.concat([train,test]).reset_index(drop=True)

    # time features
    data['created_at'] = pd.to_datetime(data['Datetime'])
    stamp = data['created_at'].dt
    data['year'] = stamp.year
    data['year'] = data['year'].astype(float)
    data['month'] = stamp.month
    data['day'] = stamp.day
    data['weekday'] = stamp.weekday
    data['weekofyear'] = self._week_of_year(data['created_at'])
    data['hour'] = stamp.hour

    # combination between time features
    data['day_hour']  = data['day'].astype(str) + '-' + data['hour'].astype(str)
    data['month_day'] = data['month'].astype(str) + '-' + data['day'].astype(str)

    # Label Encoder
    self.LE(data,LE_cols = ['day_hour','month_day'])

    # Neighbouring-row features, computed over train and test together.
    # The sort uses the raw 'Datetime' column, not the parsed 'created_at';
    # assumes its text format sorts chronologically - TODO confirm.
    data = data.sort_values('Datetime').reset_index(drop=True)
    # NOTE: the '*_next' columns are built with shift=1 (the preceding row) and
    # the '*_before' columns with shift=-1 (the following row). The names are
    # kept unchanged so existing column names stay stable.
    for sensor in ('1','2') :
      source = 'Sensor' + sensor + '_PM2.5'
      prefix = 'sensor' + sensor
      self.LAG(data,LagFeature=source,shift=1,NewFeatures=[prefix + '_diff_next',prefix + '_next'])
      self.LAG(data,LagFeature=source,shift=-1,NewFeatures=[prefix + '_diff_before',prefix + '_before'])

    # Get our New Train,Test
    # 'SplitBy' is a year-month-day string; it is sorted as text here.
    data['SplitBy']   = data['year'].astype(int).astype(str) + '-' + data['month'].astype(str) + '-' + data['day'].astype(str)
    data = data.sort_values('SplitBy').reset_index(drop=True)
    is_train = data['ID'].isin(train_ids)
    train = data[is_train].reset_index(drop=True)
    train['Offset_fault'] = train['Offset_fault'].astype('int')
    test = data[~is_train].reset_index(drop=True)

    return train, test

In [11]:
# Read the raw train and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [12]:
# Run the feature pipeline; `train` and `test` are rebound to the processed frames.
processor = Process()
train, test = processor.process(train=train, test=test)

In [13]:
# Sanity check: distinct 'SplitBy' values and frame dimensions for each part.
print('unique days', train['SplitBy'].nunique(), test['SplitBy'].nunique())
print('shape', train.shape, test.shape)

unique days 99 52
shape (297177, 25) (127361, 25)


# **Modeling**

In [26]:
class CFG :
  """Settings shared by the fold assignment and the LightGBM training cell."""

  # Seed stored in lgb_params['seed'].
  SEED = 42
  # Number of cross-validation folds.
  n_splits = 5

  # Name of the label column.
  TARGET_COL = 'Offset_fault'

  # Parameter dict handed to lgb.train.
  lgb_params = dict(
      boosting_type='gbdt',
      objective='binary',
      metric='auc',
      n_estimators=2500,
      reg_lambda=50,
      num_leaves=81,
      seed=SEED,
      silent=True,
      early_stopping_rounds=100,
  )

  # Columns left out when the model's feature list is built.
  remove_features = ['ID', 'Datetime', 'created_at', 'SplitBy', 'folds', TARGET_COL]

In [27]:
class CostumSplit :
  """Assign cross-validation folds per 'SplitBy' value.

  Every row that shares a 'SplitBy' value receives the same fold number, so a
  given 'SplitBy' group never appears on both sides of a fold.
  """

  def __init__(self,) :
    # Number of folds comes from the shared configuration class.
    self.n_splits = CFG.n_splits

  def Split(self,Train_) :
    """Return one row per distinct 'SplitBy' value with a 'folds' column.

    The input frame is not modified. ``GroupKFold`` is run on the
    de-duplicated rows, using 'SplitBy' as the group key and 'Offset_fault'
    as the ``y`` argument.
    """
    kf = GroupKFold(n_splits=self.n_splits)

    # drop_duplicates already returns a new frame, so no explicit copy is needed.
    Train = Train_.drop_duplicates('SplitBy').reset_index(drop=True)

    groups = Train['SplitBy']
    Train["folds"] = -1
    for fold, (_, val_index) in enumerate(kf.split(Train,Train['Offset_fault'],groups)):
      # After reset_index the positional indices returned by split() are also labels.
      Train.loc[val_index, "folds"] = fold
    return Train

  def apply(self,train) :
    """Add a 'folds' column to ``train`` (in place) and return the same frame.

    The fold table is computed once and then mapped onto every row through
    its 'SplitBy' value.
    """
    fold_table = self.Split(train)
    mapper = dict(zip(fold_table['SplitBy'].tolist(),
                      fold_table['folds'].tolist()))

    train['folds'] = train['SplitBy'].map(mapper)
    return train

In [28]:
# Attach a 'folds' column to the training frame.
split = CostumSplit()
train = split.apply(train=train)

In [29]:
# Model inputs: every training column not listed in CFG.remove_features,
# kept in the frame's own column order.
features_columns = list(filter(lambda name: name not in CFG.remove_features, train.columns))
len(features_columns)

20

In [33]:
# Out-of-fold probabilities, one slot per training row, filled fold by fold.
oof_lgb = np.zeros((train.shape[0],))
test[CFG.TARGET_COL] = 0
# One array of test-set probabilities per fold model.
lgb_preds = []

for fold in range(CFG.n_splits) :
    print('-' * 50)
    print(f'Fold {fold+1} / {CFG.n_splits}' )

    # Rows of the current fold are held out; all other rows are used for fitting.
    in_valid = train['folds'] == fold
    tr_x, tr_y = train.loc[~in_valid, features_columns], train.loc[~in_valid, CFG.TARGET_COL]
    vl_x, vl_y = train.loc[in_valid, features_columns], train.loc[in_valid, CFG.TARGET_COL]
    val_idx = vl_x.index.tolist()

    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=vl_y)

    estimator = lgb.train(
        CFG.lgb_params,
        train_data,
        valid_sets=[train_data, valid_data],
        verbose_eval=0,
    )

    # Predict with the iteration count stored in estimator.best_iteration.
    y_pred_val = estimator.predict(vl_x, num_iteration=estimator.best_iteration)
    oof_lgb[val_idx] = y_pred_val
    print(f'FOLD-{fold} accuracy score :', accuracy_score(vl_y, (y_pred_val > 0.5).astype(int)))

    y_pred_test = estimator.predict(test[features_columns], num_iteration=estimator.best_iteration)
    lgb_preds.append(y_pred_test)
    print('-' * 50)

# Accuracy of the thresholded out-of-fold predictions over the whole training frame.
print('OOF score :', accuracy_score(train[CFG.TARGET_COL], (oof_lgb > 0.5).astype(int)))

--------------------------------------------------
Fold 1 / 5
FOLD-0 accuracy score : 0.9912406059281427
--------------------------------------------------
--------------------------------------------------
Fold 2 / 5
FOLD-1 accuracy score : 0.9918887601390498
--------------------------------------------------
--------------------------------------------------
Fold 3 / 5
FOLD-2 accuracy score : 0.9920260595161345
--------------------------------------------------
--------------------------------------------------
Fold 4 / 5
FOLD-3 accuracy score : 0.9960781662046115
--------------------------------------------------
--------------------------------------------------
Fold 5 / 5
FOLD-4 accuracy score : 0.9951137552077083
--------------------------------------------------
OOF score : 0.9932531790818266


# **SUBMISSION**

In [32]:
SUB_FILE_NAME = 'WinningSolution.csv'

# Average the per-fold test probabilities and threshold the mean at 0.5.
sub_df = test[['ID']].copy()
sub_df['Offset_fault'] = (np.mean(lgb_preds, axis=0) > 0.5).astype(int)

sub_df.to_csv(SUB_FILE_NAME, index=False)
sub_df.head(10)

Unnamed: 0,ID,Offset_fault
0,ID_VJTCP5667QNH,0
1,ID_Z4FVLMBG5SI8,0
2,ID_1AKWB2POZX8Q,0
3,ID_MD0HNZQZT1FQ,1
4,ID_HJ7XVHB2GBFK,0
5,ID_8GT0DMK2ZO33,0
6,ID_M5Z3J91KLW8A,1
7,ID_I4C5C9NCPXZY,0
8,ID_R8WE3U29LXY4,1
9,ID_98KEGPPXVOQU,0
