## Team - Blaze_Warriors
### Members :- 
### Vivek Rai ([@blazer007](https://www.kaggle.com/blazer007))
### Deepanshu Raj ([@davalpha](https://www.kaggle.com/davalpha))

### Kaggle Notebook Link -  https://www.kaggle.com/blazer007/cascade-cup-22-nb

### Run this notebook with a GPU accelerator enabled (the XGBoost and CatBoost models below are configured for GPU training)

In [None]:
import os
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, date

import lightgbm as lgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed: int = 42) -> None:
    """Seed the global random generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the process environment and seeds both the
    standard-library ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Value handed to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Notebook-wide seed; also reused for the CV splitters and CatBoost below.
seed = 3407
seed_everything(seed)

In [None]:
# Load the training split from the Kaggle input directory.
df_train = pd.read_csv("../input/cascade-cup-22/train.csv")

In [None]:
# Load the test split and the submission template; the template's
# "cancelled" column is overwritten with predictions further down.
df_test = pd.read_csv("../input/cascade-cup-22/test.csv")
sample_sub = pd.read_csv("../input/cascade-cup-22/sample_submission.csv")

## Data Preprocessing

In [None]:
# Column names of the raw training frame.
df_train.columns

In [None]:
# Column names of the raw test frame.
df_test.columns

In [None]:
# Missing-value count per column (train).
df_train.isna().sum()

In [None]:
# Missing-value count per column (test).
df_test.isna().sum()

In [None]:
def daypart(hour):
    """Translate an hour of the day into a coarse day-part label.

    Hours 2-5 are "dawn", 6-9 "morning", 10-13 "noon", 14-17 "afternoon"
    and 18-21 "evening". Every other value (22, 23, 0, 1, or anything that
    matches none of the listed hours, such as NaN) is "midnight".
    """
    labelled_hours = (
        ("dawn", (2, 3, 4, 5)),
        ("morning", (6, 7, 8, 9)),
        ("noon", (10, 11, 12, 13)),
        ("afternoon", (14, 15, 16, 17)),
        ("evening", (18, 19, 20, 21)),
    )
    for label, hours in labelled_hours:
        if hour in hours:
            return label
    return "midnight"

In [None]:
def create_order_no_column(data):
    """Number each rider's orders within a run of same-day allotments.

    Rows are walked in their existing order, separately for every
    ``rider_id``. The counter starts at 1 and increases while the calendar
    date of ``allot_time`` stays the same as on the rider's previous row; it
    restarts at 1 whenever the date changes.

    Args:
        data: Frame with a ``rider_id`` column and an ``allot_time`` column
            holding ``'%Y-%m-%d %H:%M:%S'`` strings.

    Returns:
        A new frame with the rows of ``data`` in their original order, a
        fresh 0..n-1 index and an extra integer column
        ``order_number_for_this_day``. Rows whose ``rider_id`` is missing are
        left out.

    Side effects:
        ``data['allot_time']`` is converted to datetimes in place.
    """
    # Parse once, vectorised; the caller's column is replaced as before.
    data['allot_time'] = pd.to_datetime(data['allot_time'], format='%Y-%m-%d %H:%M:%S')

    # Rows with a missing rider id cannot be attributed to any rider.
    result = data.loc[data['rider_id'].notna()].reset_index(drop=True)
    if result.empty:
        result['order_number_for_this_day'] = pd.Series(dtype='int64')
        return result

    rider = result['rider_id']
    day = result['allot_time'].dt.normalize()

    # A run starts on a rider's first row and whenever the date differs from
    # that rider's previous row (comparison with the shifted NaT is "different").
    previous_day = day.groupby(rider, sort=False).shift()
    starts_run = day.ne(previous_day).astype(int)
    run_id = starts_run.groupby(rider, sort=False).cumsum().rename('run_id')

    # Position inside (rider, run) is the order number, 1-based.
    result['order_number_for_this_day'] = (
        result.groupby([rider, run_id], sort=False).cumcount() + 1
    )
    return result

In [None]:
def create_time_deltas_train(df_OS):
    """Add waiting-time features to a training frame and prune slow pickups, in place.

    Only the time-of-day part of each ``'<date> <HH:MM:SS>'`` timestamp is
    used. Two columns are appended to ``df_OS``:

    * ``TD_1_mins`` - minutes from ``order_time`` to ``allot_time``
    * ``TD_2_mins`` - minutes from ``allot_time`` to ``accept_time``

    Rows with ``cancelled == 0`` whose order-to-pickup gap exceeds 34.75
    minutes are then dropped (presumably treated as outliers - confirm with
    the authors), and remaining missing ``TD_2_mins`` values are replaced by
    the column median.

    Args:
        df_OS: Frame with ``order_time``, ``allot_time``, ``accept_time``,
            ``pickup_time`` and ``cancelled`` columns. Modified in place.

    Returns:
        The string ``"Done"``.
    """
    def _time_of_day(stamp):
        # pd.isna also recognises numpy floats, which a plain
        # ``type(x) == float`` test misses when a whole column is missing.
        if pd.isna(stamp):
            return None
        return datetime.strptime(stamp.split()[1], '%H:%M:%S').time()

    def _minutes_between(later, earlier):
        if later is None or earlier is None:
            return np.nan
        gap = datetime.combine(date.min, later) - datetime.combine(date.min, earlier)
        # timedelta.seconds is always within [0, 86400), so a gap that
        # crosses midnight wraps forward instead of going negative.
        return round(gap.seconds / 60, 2)

    order_t = [_time_of_day(v) for v in df_OS['order_time'].values]
    allot_t = [_time_of_day(v) for v in df_OS['allot_time'].values]
    accept_t = [_time_of_day(v) for v in df_OS['accept_time'].values]
    pickup_t = [_time_of_day(v) for v in df_OS['pickup_time'].values]

    # Plain lists are assigned positionally, so the result does not depend on
    # df_OS having a default 0..n-1 index.
    df_OS['TD_1_mins'] = [_minutes_between(a, o) for a, o in zip(allot_t, order_t)]
    df_OS['TD_2_mins'] = [_minutes_between(c, a) for c, a in zip(accept_t, allot_t)]

    order_to_pickup = pd.Series(
        [_minutes_between(p, o) for p, o in zip(pickup_t, order_t)],
        index=df_OS.index,
        dtype=float,
    )
    slow_rows = df_OS.index[(order_to_pickup > 34.75) & (df_OS['cancelled'] == 0)]
    df_OS.drop(slow_rows, inplace=True)

    # Re-assign instead of a chained inplace fillna, which can act on a
    # temporary copy and leave the frame untouched.
    df_OS['TD_2_mins'] = df_OS['TD_2_mins'].fillna(df_OS['TD_2_mins'].median())

    return "Done"

In [None]:
def create_time_deltas_test(df):
    """Compute order->allot and allot->accept waiting times in minutes.

    Only the time-of-day part of each ``'<date> <HH:MM:SS>'`` timestamp is
    used. ``df`` itself is not modified.

    Args:
        df: Frame with ``order_time``, ``allot_time`` and ``accept_time``
            columns; missing timestamps are NaN.

    Returns:
        A tuple ``(time_delta_1, time_delta_2)`` of lists, one float per row
        in row order, rounded to two decimals:
        ``time_delta_1`` is allot minus order, ``time_delta_2`` is accept
        minus allot. An entry is NaN when either timestamp is missing.
    """
    def _time_of_day(stamp):
        # pd.isna also recognises numpy floats, which a plain
        # ``type(x) == float`` test misses when a whole column is missing.
        if pd.isna(stamp):
            return None
        return datetime.strptime(stamp.split()[1], '%H:%M:%S').time()

    def _minutes_between(later, earlier):
        if later is None or earlier is None:
            return np.nan
        gap = datetime.combine(date.min, later) - datetime.combine(date.min, earlier)
        # timedelta.seconds is always within [0, 86400), so a gap that
        # crosses midnight wraps forward instead of going negative.
        return round(gap.seconds / 60, 2)

    # Work on the raw value arrays: positional, independent of df's index.
    order_t = [_time_of_day(v) for v in df['order_time'].values]
    allot_t = [_time_of_day(v) for v in df['allot_time'].values]
    accept_t = [_time_of_day(v) for v in df['accept_time'].values]

    time_delta_1 = [_minutes_between(a, o) for a, o in zip(allot_t, order_t)]
    time_delta_2 = [_minutes_between(c, a) for c, a in zip(accept_t, allot_t)]

    return time_delta_1, time_delta_2

In [None]:
def preprocess(df, typee = "train"):
    """Clean a raw order frame and derive the model features.

    Steps, in order:

    1. Repair the rider order counters: if all three of ``alloted_orders``,
       ``delivered_orders`` and ``undelivered_orders`` are missing they become
       0; if only the latter two are missing, ``delivered_orders`` becomes 0
       and ``undelivered_orders`` takes the value of ``alloted_orders``.
       For ``typee == "train"`` the ``delivered_time`` and ``cancelled_time``
       columns are dropped.
    2. Add ``TD_1_mins`` / ``TD_2_mins`` via ``create_time_deltas_test`` and
       fill missing ``TD_2_mins`` with that column's median.
    3. Fill missing ``lifetime_order_count``, ``session_time`` and
       ``reassigned_order`` with 0.
    4. Add ``order_number_for_this_day`` via ``create_order_no_column``.
    5. Parse ``order_time``, ``allot_time`` and ``accept_time`` as datetimes.
    6. For ``typee == "train"`` drop rows without ``accept_time``; add month,
       hour and weekday features.
    7. Add day-part, weekend and accept-time features; for
       ``typee == "test"`` missing accept features default to "Sunday" / 2.
    8. Cast the string-valued features to ``category``.
    9. Drop the raw timestamp, reassignment and id columns.

    Args:
        df: Raw frame as read from the competition CSV. It is modified in
            place during steps 1-3.
        typee: ``"train"`` or ``"test"``; any other value skips the
            train-only and test-only branches.

    Returns:
        ``(features, order_ids)`` - the processed frame and the ``order_id``
        column of the rows that were kept.
    """
    print("Preprocessing Started : ", typee)

    # Masks are computed up front so the two repairs cannot influence each other.
    order_cols = ["alloted_orders", "delivered_orders", "undelivered_orders"]
    no_delivery_info = df["delivered_orders"].isna() & df["undelivered_orders"].isna()
    nothing_known = no_delivery_info & df["alloted_orders"].isna()
    only_alloted_known = no_delivery_info & df["alloted_orders"].notna()

    df.loc[nothing_known, order_cols] = 0
    df.loc[only_alloted_known, "delivered_orders"] = 0
    df.loc[only_alloted_known, "undelivered_orders"] = df.loc[only_alloted_known, "alloted_orders"]

    if typee == "train":
        df.drop(columns = ['delivered_time', 'cancelled_time'], inplace = True)

    print("Step - 1 Completed !!!")

    # The same delta routine is used for both splits.
    time_delta_1, time_delta_2 = create_time_deltas_test(df)
    df['TD_1_mins'] = time_delta_1
    df['TD_2_mins'] = time_delta_2
    # Fill only TD_2_mins. A frame-wide fillna with this median would also
    # overwrite missing timestamps and counters, defeating the dedicated
    # handling in the steps below.
    df['TD_2_mins'] = df['TD_2_mins'].fillna(df['TD_2_mins'].median())

    print("Step - 2 Completed !!!")

    for col in ("lifetime_order_count", "session_time", "reassigned_order"):
        df[col] = df[col].fillna(0)

    print("Step - 3 Completed !!!")

    df = create_order_no_column(df)

    print("Step - 4 Completed !!!")

    df["order_time"] = pd.to_datetime(df["order_time"])
    df["allot_time"] = pd.to_datetime(df["allot_time"])
    df["accept_time"] = pd.to_datetime(df["accept_time"])

    print("Step - 5 Completed !!!")

    if typee == "train":
        # Copy so the column assignments below act on an independent frame.
        df = df[df["accept_time"].notna()].copy()

    df["order_month"] = df["order_time"].dt.month
    df["order_hour"] = df["order_time"].dt.hour
    df["allot_hour"] = df["allot_time"].dt.hour
    df["order_day"] = df["order_time"].dt.day_name()

    print("Step - 6 Completed !!!")

    df["order_dayparts"] = df["order_hour"].apply(daypart)
    df["allot_dayparts"] = df["allot_hour"].apply(daypart)

    df["is_weekend"] = df["order_day"].apply(lambda x : 1 if x in ['Saturday','Sunday'] else 0)

    df["accept_day"] = df["accept_time"].dt.day_name()
    df["accept_hour"] = df["accept_time"].dt.hour

    print("Step - 7 Completed !!!")

    if typee == "test":
        df["accept_day"] = df["accept_day"].fillna("Sunday")
        # Missing hours make the column float; cast back so it stays an
        # integer-coded categorical like in the training frame.
        df["accept_hour"] = df["accept_hour"].fillna(2).astype(int)

    ordr_id = df.order_id

    for col in ("order_day", "order_dayparts", "allot_dayparts", "accept_day"):
        df[col] = df[col].astype("category")

    print("Step - 8 Completed !!!")

    cols_to_rem = [
        "reassignment_method", "reassignment_reason", "reassigned_order",
        "order_time", "order_date", "allot_time", "accept_time", "order_id"
    ]

    if typee == "train":
        cols_to_rem += ['pickup_time']

    df.drop(columns = cols_to_rem, inplace = True)

    print("Step - 9 Completed !!!")

    print("!!! DONE !!!")

    return df, ordr_id

In [None]:
# Build the training feature frame; order ids of the kept rows are returned separately.
df_train, train_order_id = preprocess(df_train, "train")

In [None]:
# Same pipeline on the test split.
df_test, test_order_id = preprocess(df_test, "test")

In [None]:
# Sanity checks on the processed frames: shapes, columns, remaining NaNs, dtypes.
print("Train : ", df_train.shape)
print("Test : ", df_test.shape)

In [None]:
df_train.columns

In [None]:
df_test.columns

In [None]:
df_train.isna().sum()

In [None]:
df_test.isna().sum()

In [None]:
df_train.dtypes

In [None]:
df_test.dtypes

In [None]:
# Columns passed as categorical features to LightGBM and CatBoost below.
cat_features = [
    "allot_hour", "order_day", "order_dayparts", 
    "allot_dayparts", "is_weekend", "accept_day",
    "accept_hour", "order_month", "order_hour",
]

# Columns rescaled with RobustScaler before modelling.
num_features = [
    'first_mile_distance', 'last_mile_distance',
    'alloted_orders', 'delivered_orders', 'undelivered_orders', 
    'lifetime_order_count', 'session_time', 'TD_1_mins', 'TD_2_mins',
    'order_number_for_this_day'
]

## Modelling

In [None]:
# Target is the "cancelled" column; everything else is a feature.
y = df_train.cancelled
X = df_train.drop('cancelled', axis = 1)

In [None]:
print(y.shape)
print(X.shape)

In [None]:
print(df_test.shape)

In [None]:
# Scaler statistics come from the full training frame and are then applied
# unchanged to the test frame.
rs = RobustScaler()
X[num_features] = rs.fit_transform(X[num_features])
df_test[num_features] = rs.transform(df_test[num_features])

### XGBoost

In [None]:
# 10-fold stratified CV with XGBoost. Each fold trains on 9/10 of the data,
# early-stops on the held-out fold and predicts the test set; the per-fold
# test predictions are collected for averaging.
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)

test_preds_xgb = []

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X, y)):
    print(f"=====================fold: {fold + 1}=====================")
    
    X_train, y_train = X.iloc[trn_ind], y.iloc[trn_ind]
    X_valid, y_valid = X.iloc[val_ind], y.iloc[val_ind]
    
    # gpu_hist requires a GPU runtime.
    model = XGBClassifier(
        n_estimators = 800, eval_metric = 'auc', tree_method="gpu_hist",
        enable_categorical = True, use_label_encoder = False
    )
    
    # Stop when validation AUC has not improved for 150 rounds; log every 100.
    model.fit(X_train, y_train, early_stopping_rounds = 150, eval_set = [(X_valid, y_valid)], verbose = 100)
    
    preds_valid = model.predict_proba(X_valid)
    
    # Column 1 of predict_proba is the positive-class probability.
    roc = roc_auc_score(y_valid, preds_valid[:,1])
    
    test_pred = model.predict_proba(df_test)[:,1]
    test_preds_xgb.append(test_pred)
    
    print(F'fold {fold + 1}: ROC AUC {roc}')

In [None]:
test_preds_xgb

In [None]:
# Stack the per-fold predictions (one row per fold) and average across folds.
test_preds_xgb = np.array(test_preds_xgb)

final_preds_xgb = np.mean(test_preds_xgb, axis = 0)

In [None]:
final_preds_xgb

In [None]:
sample_sub.head()

In [None]:
# Overwrite the template's target column with the averaged probabilities.
sample_sub["cancelled"] = final_preds_xgb

In [None]:
sample_sub.to_csv('final-xgb-2.csv', index = False)

### LGBM

In [None]:
# 10-fold stratified CV with LightGBM (native train API). Same split seed as
# the other models; per-fold test predictions are collected for averaging.
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)

test_preds_lgb = []

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X, y)):
    print(f"=====================fold: {fold + 1}=====================")
    
    X_train, y_train = X.iloc[trn_ind], y.iloc[trn_ind]
    X_valid, y_valid = X.iloc[val_ind], y.iloc[val_ind]
    
#     print(y_train.value_counts())
    
    train_dataset = lgb.Dataset(X_train, y_train, categorical_feature = cat_features)
    valid_dataset = lgb.Dataset(X_valid, y_valid, categorical_feature = cat_features)
    
    # NOTE(review): 'seed' is fixed at 42 here while the rest of the notebook
    # uses the global `seed` (3407) - confirm this is intentional.
    params ={
        'learning_rate': 0.01,
        "objective": "binary",
        "metric": "auc",
        'boosting_type': "gbdt",
        'verbosity': -1,
        'n_jobs': -1, 
        'seed': 42,
        'n_estimators': 1500
    }
    
    # Early-stop on the validation fold after 100 rounds without improvement.
    model = lgb.train(
            params,
            train_set = train_dataset, 
            valid_sets = [valid_dataset], 
            verbose_eval = 100,
            early_stopping_rounds = 100
    )
    
    # The binary objective makes predict() return positive-class probabilities.
    preds_valid = model.predict(X_valid)
    
    roc = roc_auc_score(y_valid, preds_valid)
    
    test_pred = model.predict(df_test)
    test_preds_lgb.append(test_pred)
    
    print(F'fold {fold + 1}: ROC AUC {roc}')

In [None]:
test_preds_lgb

In [None]:
# Average the per-fold LightGBM predictions across folds.
test_preds_lgb = np.array(test_preds_lgb)

final_preds_lgb = np.mean(test_preds_lgb, axis = 0)

In [None]:
sample_sub["cancelled"] = final_preds_lgb

In [None]:
sample_sub.to_csv('final-lgbm-2.csv', index = False)

### CatBoost

In [None]:
# 10-fold stratified CV with CatBoost on GPU. Besides the per-fold test
# predictions, the validation AUC is accumulated to report a fold average.
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)

test_preds_catboost = []
mean_roc = 0

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X, y)):
    print(f"=====================fold: {fold + 1}=====================")
    
    X_train, y_train = X.iloc[trn_ind], y.iloc[trn_ind]
    X_valid, y_valid = X.iloc[val_ind], y.iloc[val_ind]
    
    params = {
        'loss_function' : 'Logloss',
        'iterations' : 10000,
        'eval_metric' : 'AUC',
        'task_type' : 'GPU',
        'learning_rate' : 0.01,
        'verbose' : 1000,
        'random_seed' : seed,
        'custom_metric' : 'AUC:hints=skip_train~false'
    }
    
    model = CatBoostClassifier(
        **params
    )
    
    # use_best_model keeps the iteration with the best score on the eval set.
    model.fit(
        X_train, y_train,
        cat_features = cat_features,
        eval_set=(X_valid, y_valid),
        use_best_model = True,
        verbose = 1000
    )
    
    preds_valid = model.predict_proba(X_valid)
    
    # Column 1 of predict_proba is the positive-class probability.
    roc = roc_auc_score(y_valid, preds_valid[:,1])
    mean_roc = mean_roc + roc
    
    test_pred = model.predict_proba(df_test)[:,1]
    test_preds_catboost.append(test_pred)
    
    print(F'fold {fold + 1}: ROC AUC {roc}')
    
# The divisor must match n_splits of the splitter above.
mean_roc = mean_roc / 10

print()
print("Average ROC over 10 folds : ", mean_roc)

In [None]:
test_preds_catboost

In [None]:
# Average the per-fold CatBoost predictions across folds.
test_preds_catboost = np.array(test_preds_catboost)

final_preds_catboost = np.mean(test_preds_catboost, axis = 0)

In [None]:
sample_sub["cancelled"] = final_preds_catboost

In [None]:
sample_sub.to_csv('final-catboost-2.csv', index = False)

### CatBoost + LGBM + XGBoost

In [None]:
# Weighted blend of the three fold-averaged predictions (weights sum to 1).
final_preds_combined = 0.5 * final_preds_catboost + 0.4 * final_preds_lgb + 0.1 * final_preds_xgb

In [None]:
sample_sub["cancelled"] = final_preds_combined

In [None]:
sample_sub.to_csv('final-cb-lgbm-xgb-2.csv', index = False)

### CatBoost + LGBM - I

In [None]:
# Equal-weight blend of CatBoost and LightGBM.
final_preds_cb_lgb = 0.5 * final_preds_catboost + 0.5 * final_preds_lgb

In [None]:
sample_sub["cancelled"] = final_preds_cb_lgb

In [None]:
sample_sub.to_csv('final-cb-lgbm-2.csv', index = False)

### CatBoost + LGBM - II

In [None]:
# 60/40 blend favouring CatBoost over LightGBM.
final_preds_cb_lgb_2 = 0.6 * final_preds_catboost + 0.4 * final_preds_lgb

In [None]:
sample_sub["cancelled"] = final_preds_cb_lgb_2

In [None]:
sample_sub.to_csv('final-cb-lgbm-3.csv', index = False)

### Final Submission CSV
#### 1. final-catboost-2.csv (best-scoring CSV; selected as the final submission)
#####   - Public Score - 0.77579 
#####   - Private Score - 0.79880
#### 2. final-cb-lgbm-3.csv or final-cb-lgbm-2.csv