In [131]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import geopy.distance

import lightgbm as lgb
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,StratifiedKFold
import itertools
from catboost import CatBoostRegressor
import seaborn as sb
from sklearn.preprocessing import LabelEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import roc_auc_score,accuracy_score ,confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [132]:
# Load the train and test tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/demand-forecast/train.csv')
test = pd.read_csv('/kaggle/input/demand-forecast/test.csv')
# `sub` is only used at the end of the notebook, where its 'ID' column selects
# which test rows are written to the submission file.
sub = pd.read_csv('/kaggle/input/weeksub/submission_weekly.csv')

In [133]:
# Below is the function for target encoding

def tar_encoder(train_df, test_df, cols=None, target='daily_dispatch_count'):
    """Target-encode categorical columns, fitting the encoder on the training frame only.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain ``cols`` and the ``target`` column.
    test_df : pandas.DataFrame
        Data to transform with the encoder fitted on ``train_df``.
    cols : list of str, optional
        Columns to encode. When omitted, the notebook-level ``cat_cols`` list is
        used, which is what the original implementation always did.
    target : str, default 'daily_dispatch_count'
        Name of the target column in ``train_df`` used to fit the encoder.

    Returns
    -------
    (train_df, test_df)
        The same two frames that were passed in; the encoded columns are
        written back in place.
    """
    if cols is None:
        # Backward-compatible fallback: cat_cols must already exist as a
        # notebook global when the function is called this way.
        cols = cat_cols
    cols = list(cols)

    encoder = TargetEncoder()
    train_df[cols] = encoder.fit_transform(train_df[cols], train_df[target])
    # Only transform the test frame: the encoder never sees test targets.
    test_df[cols] = encoder.transform(test_df[cols])
    return train_df, test_df

In [134]:
# Below is the function for label encoding

def lab_encoder(df,cat_cols):
    """Label-encode the given columns of ``df`` in place.

    Each column is cast to ``str`` and replaced by its integer label codes.

    Returns
    -------
    (df, le_list)
        ``df`` is the same frame that was passed in. ``le_list`` holds one
        ``{class label: code}`` dict per entry of ``cat_cols``, in the same order.
    """
    encoder = LabelEncoder()
    le_list = []
    for name in cat_cols:
        # The encoder instance is reused and refitted for every column.
        df[name] = encoder.fit_transform(df[name].astype('str'))
        label_to_code = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        le_list.append(label_to_code)
    return df, le_list

In [135]:
# Report (rows, columns) of the raw train and test frames before any preprocessing.
print('train shape: ',train.shape,'test shape: ',test.shape)

In [136]:
# Tag each frame with its origin so the two can be separated again after the
# shared preprocessing, then stack them into a single frame.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag
# reset_index() keeps the old per-frame row labels in an 'index' column,
# which is dropped again before modelling.
df = pd.concat([train, test]).reset_index()

#### Missing values in dataset

In [137]:
# Percentage of missing values per column
df.isnull().sum().mul(100).div(df.shape[0])

In [138]:
# Preview the first rows of the combined train + test frame.
df.head()

# EDA for preprocessing

### Visualization

In [139]:
# Number of rows per warehouse_ID
warehouse_counts = df['warehouse_ID'].value_counts()
warehouse_counts.plot.bar()

In [140]:
# Number of rows per Product_Type
product_type_counts = df['Product_Type'].value_counts()
product_type_counts.plot.bar()

In [141]:
# Number of rows per is_warehouse_closed value
closed_counts = df['is_warehouse_closed'].value_counts()
closed_counts.plot.bar()

In [142]:
# Total daily_dispatch_count per warehouse
dispatch_per_warehouse = df.groupby(['warehouse_ID'])['daily_dispatch_count'].sum()
dispatch_per_warehouse.plot.bar()

WH_0x3ea is the warehouse which has the most dispatches

### Computing missing values and fixing noise

The data contains incorrect dispatch counts for days on which a warehouse is closed: the dispatch count should always be zero when the warehouse is closed.

In [143]:
# A closed warehouse cannot dispatch anything, so force the count to zero on
# every row flagged as closed. A single vectorised assignment replaces the
# original per-row loop and gives the same result.
# Note: rows whose count is missing (NaN) are also set to 0, exactly as the
# loop did, because NaN != 0 evaluates to True.
closed_mask = df['is_warehouse_closed'] == 'Yes'
df.loc[closed_mask, 'daily_dispatch_count'] = 0

Missing longitudes and latitudes are filled in from other rows of the dataset: for a given warehouse_ID, the longitude and latitude values that are already available for that warehouse are reused.

In [144]:
# Fill missing Latitude / Longitude with the most frequent value recorded for
# the same warehouse_ID.

def _modal_coordinate(values):
    """Return the most frequent non-null value, or NaN if there is none.

    Series.mode() returns *all* tied modes (and an empty Series when every
    value is null). Taking the first entry always gives a single scalar, so
    the fill below cannot receive an array.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan

# One row per warehouse with its modal coordinates
ware_long_lati = df.groupby(['warehouse_ID'])[['Latitude','Longitude']].agg(_modal_coordinate)

# warehouse_ID -> coordinate lookups
ware_lat_dic = ware_long_lati['Latitude'].to_dict()
ware_long_dic = ware_long_lati['Longitude'].to_dict()

# Vectorised fill: only null cells are touched. Rows whose warehouse has no
# known coordinate (or no warehouse_ID) stay NaN instead of raising.
df['Latitude'] = df['Latitude'].fillna(df['warehouse_ID'].map(ware_lat_dic))
df['Longitude'] = df['Longitude'].fillna(df['warehouse_ID'].map(ware_long_dic))

The `is_weekend` column is recomputed from the date. This fixes rows where the data marks a non-weekend day as a weekend and also fills in the missing values of the column. Only Sunday is treated as the weekend.

In [145]:
# Rebuild is_weekend from the calendar date: this both fills the missing values
# and overrides incorrect ones. Only Sunday counts as the weekend.
df['date'] = pd.to_datetime(df['date'])
# Drop and re-add the column, so it ends up as the last column of the frame.
df = df.drop(columns=['is_weekend'])
# dayofweek: Monday == 0 ... Sunday == 6
Weekend_y_n = np.where(df['date'].dt.dayofweek == 6, 'Yes', 'No')
df['is_weekend'] = Weekend_y_n

Missing values of `is_warehouse_closed` are filled with the most common value for that date, i.e. whether most of the warehouses were closed on that particular date.

In [146]:
# Fill missing is_warehouse_closed with the most frequent value observed on the
# same date.

df['date'] = pd.to_datetime(df['date'])

def _modal_closed_flag(flags):
    """Return the most frequent non-null flag for one date, or NaN if none exists.

    Series.mode() returns every tied value in sorted order, so a 'No'/'Yes'
    tie resolves to 'No'. Taking the first entry always gives a single scalar.
    """
    modes = flags.mode()
    return modes.iloc[0] if len(modes) else np.nan

# One row per date with the majority closed/open flag
date_closed = df.groupby(['date'])[['is_warehouse_closed']].agg(_modal_closed_flag)

# date -> majority flag lookup
date_closed_dic = date_closed['is_warehouse_closed'].to_dict()

# Vectorised fill: only null cells are touched. Dates with no known flag at
# all stay NaN instead of raising.
df['is_warehouse_closed'] = df['is_warehouse_closed'].fillna(df['date'].map(date_closed_dic))
    

Adding the week number within the month as a new feature.

In [147]:
# New feature: week number within the month (days 1-7 -> 1, 8-14 -> 2, ...).
# Uses the vectorised .dt accessor instead of a per-row Python lambda.
df['week'] = (df['date'].dt.day - 1) // 7 + 1

In [148]:
# Correlation matrix
# Restrict to numeric / boolean columns explicitly. Recent pandas versions no
# longer drop object and datetime columns silently in DataFrame.corr() and
# raise on the string columns of this frame.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()
sb.heatmap(corr, cmap="Blues", annot=True)

From the above it can be observed that daily dispatch and weekly dispatch are highly correlated, but this information is not very useful for model building.

#### Checking missing values

In [149]:
# Percentage of missing values per column after imputation
df.isnull().sum().mul(100).div(df.shape[0])

In [150]:
# Preview the first rows of the frame after preprocessing.
df.head()

# Preparing & training the model

In [151]:
# Split the combined frame back into train and test using the is_train flag
train = df.loc[df['is_train'].eq(1)]
test = df.loc[df['is_train'].eq(0)]

In [152]:
# Keep the test IDs aside; they are re-attached after prediction
id_ = test['ID']

# Drop unnecessary features: identifiers, both target columns and the train/test flag
non_feature_cols = ['ID','index','daily_dispatch_count','weekly_dispatch_count','is_train']
test = test.drop(columns=non_feature_cols)

In [153]:
# Getting dataset ready for model: Y is the daily target, X holds the features
Y = train['daily_dispatch_count']
X = train.drop(['ID','index','daily_dispatch_count','weekly_dispatch_count','is_train'],axis=1)

# Categorical (object dtype) columns passed to CatBoost as cat_features.
# Taken straight from the column index so the order is deterministic; the
# earlier list(set(...)) form gave an order that depended on string hashing.
cat_cols = X.select_dtypes(include = 'object').columns.tolist()

The CatBoost model is chosen for training because:
* Out of XGBoost, LightGBM and CatBoost, CatBoost performed the best.
* It can handle categorical data.
* It is not impacted by outliers since it is a tree-based algorithm.

In [154]:
# K-fold training of a CatBoost regressor.
# Each fold's model is scored on its held-out part (out-of-fold predictions),
# and the test-set predictions are averaged over all folds.
n_splits  = 8
oof_pred  = np.zeros((len(train),))     # For out of fold prediction
y_pred_final   = np.zeros((len(test),)) # For test set prediction (mean over folds)
test_rmse_score = []
test_r2_score = []

# Initialized Kfold object
kf=KFold(n_splits=n_splits,shuffle=True,random_state=2062021)

# Training model
for i,(train_idx,val_idx) in enumerate(kf.split(X,Y)):

    X_train, y_train = X.iloc[train_idx,:], Y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx, :], Y.iloc[val_idx]

    print('\nFold: {}\n'.format(i+1))

    model_reg = CatBoostRegressor(iterations=2500, learning_rate=0.01)
    # NOTE: the validation fold is also the early-stopping eval_set, so the
    # fold scores below are measured on data that influenced the stopping point.
    model_reg.fit(X_train,y_train,early_stopping_rounds=100,cat_features=cat_cols,eval_set=(X_val, y_val),verbose=100)
    testpred1 = model_reg.predict(X_val)

    # Compute each metric once per fold. RMSE is taken as sqrt(MSE) because the
    # `squared=False` argument of mean_squared_error is deprecated and no
    # longer accepted by recent scikit-learn releases.
    fold_r2 = r2_score(y_val, testpred1)
    fold_rmse = np.sqrt(mean_squared_error(y_val, testpred1))

    test_r2_score.append(fold_r2)
    print("Test r2_score for model 1: %.4f"%(fold_r2))

    test_rmse_score.append(fold_rmse)
    print("Test mean_squared_error for model 1: %.4f"%(fold_rmse))

    val_pred   = testpred1
    oof_pred[val_idx] = val_pred

    # Each fold contributes 1/n_splits of the final test prediction
    y_pred_final += model_reg.predict(test)/(n_splits)
    print('\n')

print('OOF r2_score:- ',(r2_score(Y,oof_pred)))
print('OOF mean_squared_error:- ',(np.sqrt(mean_squared_error(Y,oof_pred))))


### Feature importance

In [155]:
# `model_reg` is the model left over from the last fold of the training loop,
# so this table reflects that single fold's model, not an average over folds.
model_reg.get_feature_importance(prettified=True)

### Preparing the data for predictions 

In [156]:
# Attach the fold-averaged predictions and restore the ID column saved earlier
# (the ID Series is aligned on the row index).
test = test.assign(daily_dispatch_count=y_pred_final, ID=id_)

In [157]:
# Getting week group for weekly dispatch calculation.
# Within each (warehouse_ID, Product_Type) series, ordered by date, a row's
# group number is the count of week-ending rows (is_weekend != 'No') that come
# strictly before it. A week-ending row therefore shares its group with the
# days leading up to it, and the next row starts a new group.
#
# This is a vectorised replacement for the nested row-wise loops; the unused
# `flag` accumulator was dropped.

# Stable sort keeps rows with the same date in their existing relative order
ordered = test.sort_values(by='date', kind='mergesort')
ends_week = ordered['is_weekend'].ne('No').astype(int)

# Running count of week-ending rows per series, excluding the row itself
series_keys = [ordered['warehouse_ID'], ordered['Product_Type']]
weeks_before = ends_week.groupby(series_keys).cumsum() - ends_week

# Rows that belong to no group (missing warehouse_ID / Product_Type) keep the
# sentinel -1, as in the original loop.
test['weekend_groupby'] = weeks_before.reindex(test.index).fillna(-1).astype(int)

In [158]:
# Calculating weekly dispatch: the sum of predicted daily dispatches over each
# (warehouse_ID, Product_Type, weekend_groupby) group.
week_keys = ['warehouse_ID','Product_Type','weekend_groupby']
weekly_dispatchFrame = test.groupby(week_keys)['daily_dispatch_count'].sum().reset_index()

# Broadcast each group's total back onto its rows, then keep it only on the
# is_weekend == 'Yes' rows; every other row stays NaN. This replaces a per-row
# lookup that re-filtered the aggregate frame on every iteration.
weekly_total = test.groupby(week_keys)['daily_dispatch_count'].transform('sum')
test['weekly_dispatch_count'] = weekly_total.where(test['is_weekend'] == 'Yes')

In [159]:
# Submission file: keep only the test rows whose ID appears in the sample
# submission, with the two columns that are written out.
in_submission = test['ID'].isin(sub['ID'])
sub_df = test.loc[in_submission, ['ID','weekly_dispatch_count']]
sub_df.to_csv("final_sub.csv", index = False)
sub_df