In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches


from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lazypredict.Supervised import LazyClassifier

from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.pipeline import make_pipeline

import time

import warnings
warnings.filterwarnings('ignore')
# Read the data from CSV file
data = pd.read_csv("train.csv")

# Get basic information about the data
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80692 entries, 0 to 80691
Data columns (total 26 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   passenger_unique_identifier                      80692 non-null  float64
 1   driver_unique_identifier                         71166 non-null  float64
 2   approximate_distance_meter                       80692 non-null  int64  
 3   final_price                                      80692 non-null  float64
 4   waiting_time_enabled                             80692 non-null  int64  
 5   second_destination_final_price                   80692 non-null  float64
 6   round_ride_final_price                           80692 non-null  float64
 7   for_friend_enabled                               80692 non-null  int64  
 8   is_voucher_used                                  0 non-null      float64
 9   intercity                   

## Find Columns with Missing Values in Data

In [2]:
# Check for missing values
print("Missing values summary:")
print(data.isnull().sum())

Missing values summary:
passenger_unique_identifier                            0
driver_unique_identifier                            9526
approximate_distance_meter                             0
final_price                                            0
waiting_time_enabled                                   0
second_destination_final_price                         0
round_ride_final_price                                 0
for_friend_enabled                                     0
is_voucher_used                                    80692
intercity                                              0
request_datetime                                       0
origin_latitude                                        0
origin_longitude                                       0
destination_latitude                                   0
destination_longitude                                  0
second_destination_latitude                            0
second_destination_longitude                           0
request

In [3]:
(data.isnull().sum()/(len(data)))*100

passenger_unique_identifier                         0.00
driver_unique_identifier                           11.81
approximate_distance_meter                          0.00
final_price                                         0.00
waiting_time_enabled                                0.00
second_destination_final_price                      0.00
round_ride_final_price                              0.00
for_friend_enabled                                  0.00
is_voucher_used                                   100.00
intercity                                           0.00
request_datetime                                    0.00
origin_latitude                                     0.00
origin_longitude                                    0.00
destination_latitude                                0.00
destination_longitude                               0.00
second_destination_latitude                         0.00
second_destination_longitude                        0.00
requested_service_type         

In [4]:
data.treatment_group.value_counts()

treatment_group
A    70386
B      780
Name: count, dtype: int64

In [5]:
data.loc[data.treatment_group == "A", "ride (target)"].value_counts()

ride (target)
1    56589
0    13797
Name: count, dtype: int64

In [6]:
# Fill missing treatment-group labels with the most frequent group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data['treatment_group']` is chained in-place assignment, which pandas
# deprecates and which leaves `data` unchanged under copy-on-write.
data['treatment_group'] = data['treatment_group'].fillna(data['treatment_group'].mode()[0])

## Find Duplicate Records in Data

In [7]:
data.duplicated().sum()

0

## Rename the Target Column for More Convenient Data Manipulation

In [8]:
data = data.rename(columns = {"ride (target)":"target"})

## Drop Useless Columns

In [9]:
# Remove the tuple-named coordinate column, the voucher flag (entirely
# missing according to the summary above) and the two identifier columns.
data = data.drop(
    columns=[
        "('new_origin_latitude', 'new_origin_longitude')",
        "is_voucher_used",
        "driver_unique_identifier",
        "passenger_unique_identifier",
    ]
)

## Drop Rows with Missing Data

In [10]:
data = data.dropna()

In [11]:
data['request_datetime'] = pd.to_datetime(data['request_datetime'])

## Convert Column Containing Date to DateTime

In [12]:
# Analyze data types
print("Data types:")
print(data.dtypes)

Data types:
approximate_distance_meter                                int64
final_price                                             float64
waiting_time_enabled                                      int64
second_destination_final_price                          float64
round_ride_final_price                                  float64
for_friend_enabled                                        int64
intercity                                                 int64
request_datetime                      datetime64[ns, UTC+03:30]
origin_latitude                                         float64
origin_longitude                                        float64
destination_latitude                                    float64
destination_longitude                                   float64
second_destination_latitude                             float64
second_destination_longitude                            float64
requested_service_type                                   object
days_since_passenger_first_r

## Investigate Unique Values of Features to Decide if they should be converted to Categorical Type

In [13]:
data.nunique()

approximate_distance_meter            14341
final_price                             430
waiting_time_enabled                      2
second_destination_final_price         1878
round_ride_final_price                  706
for_friend_enabled                        2
intercity                                 2
request_datetime                      75692
origin_latitude                        8479
origin_longitude                       8479
destination_latitude                   8479
destination_longitude                  8479
second_destination_latitude            8479
second_destination_longitude           8479
requested_service_type                    5
days_since_passenger_first_ride        2394
days_since_passenger_first_request     2405
in_hurry_enabled                          4
target                                    2
treatment_group                           2
new_origin_latitude                   80692
new_origin_longitude                  80692
dtype: int64

## Binarize the feature "in_hurry_enabled"

In [14]:
data.in_hurry_enabled.unique()

array([0, 1, 2, 3], dtype=int64)

In [15]:
# Collapse the four observed levels of in_hurry_enabled (0-3) into a 0/1 flag:
# 0 stays 0 and every non-zero level becomes 1.
encode = {
    'in_hurry_enabled': dict(zip([0, 1, 2, 3], [0, 1, 1, 1])),
}
data = data.replace(to_replace=encode)

In [16]:
data.nunique()

approximate_distance_meter            14341
final_price                             430
waiting_time_enabled                      2
second_destination_final_price         1878
round_ride_final_price                  706
for_friend_enabled                        2
intercity                                 2
request_datetime                      75692
origin_latitude                        8479
origin_longitude                       8479
destination_latitude                   8479
destination_longitude                  8479
second_destination_latitude            8479
second_destination_longitude           8479
requested_service_type                    5
days_since_passenger_first_ride        2394
days_since_passenger_first_request     2405
in_hurry_enabled                          2
target                                    2
treatment_group                           2
new_origin_latitude                   80692
new_origin_longitude                  80692
dtype: int64

## Compute Price per Meter

In [17]:
# Price paid per metre of the requested trip. Rows with a zero distance get 0
# instead of a division by zero. The computation is column-wise; the previous
# row-by-row DataFrame.apply ran a Python lambda once per row.
distance_m = data["approximate_distance_meter"]
has_distance = distance_m != 0
data["price_per_meter"] = (
    data["final_price"]
    .div(distance_m.where(has_distance))  # zero distances become NaN before dividing
    .where(has_distance, 0)               # ...and the resulting NaN is replaced by 0
)

## Split the Data into Train and Validation

In [18]:
# Separate the label from the features: pop() removes the column from `data`
# in place and returns it as a Series.
y = data.pop('target')

# Stratified 80/20 split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    y,
    test_size=.2,
    stratify=y,
    random_state=10,
)

## Extract Useful Info from Date Column

In [19]:
def date_info_extractor(data, column):
    """Add hour, weekday-name and day-of-month columns derived from a datetime column.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the datetime column; the derived columns are written to it.
    column : str
        Name of a column in ``data`` that supports the ``.dt`` accessor.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``request_hour``, ``request_weekday`` and
        ``request_date`` set.
    """
    timestamps = data[column].dt
    derived = {
        'request_hour': timestamps.hour,
        'request_weekday': timestamps.day_name(),
        # Despite its name this is the day of the month, not a full date.
        'request_date': timestamps.day,
    }
    for new_name, values in derived.items():
        data.loc[:, new_name] = values

    return data

X_train = date_info_extractor(X_train.copy(), "request_datetime")
X_val = date_info_extractor(X_val.copy(), "request_datetime")

In [20]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64553 entries, 22079 to 1544
Data columns (total 25 columns):
 #   Column                              Non-Null Count  Dtype                    
---  ------                              --------------  -----                    
 0   approximate_distance_meter          64553 non-null  int64                    
 1   final_price                         64553 non-null  float64                  
 2   waiting_time_enabled                64553 non-null  int64                    
 3   second_destination_final_price      64553 non-null  float64                  
 4   round_ride_final_price              64553 non-null  float64                  
 5   for_friend_enabled                  64553 non-null  int64                    
 6   intercity                           64553 non-null  int64                    
 7   request_datetime                    64553 non-null  datetime64[ns, UTC+03:30]
 8   origin_latitude                     64553 non-null  float6

In [21]:
X_train.nunique()

approximate_distance_meter            13346
final_price                             398
waiting_time_enabled                      2
second_destination_final_price         1586
round_ride_final_price                  573
for_friend_enabled                        2
intercity                                 2
request_datetime                      61270
origin_latitude                        7330
origin_longitude                       7330
destination_latitude                   7330
destination_longitude                  7330
second_destination_latitude            7330
second_destination_longitude           7330
requested_service_type                    5
days_since_passenger_first_ride        2375
days_since_passenger_first_request     2389
in_hurry_enabled                          2
treatment_group                           2
new_origin_latitude                   64553
new_origin_longitude                  64553
price_per_meter                       51603
request_hour                    

## Find Columns that Duplicate the Values of Other Columns and Drop Them

In [22]:
# Remove the six raw coordinate columns from the full frame.
data = data.drop(
    columns=[
        "origin_longitude",
        "destination_longitude",
        "second_destination_longitude",
        "origin_latitude",
        "destination_latitude",
        "second_destination_latitude",
    ]
)

In [23]:
# The same six raw coordinate columns are removed from both splits, so the
# list is written once and shared.
raw_coordinate_cols = [
    "origin_longitude",
    "destination_longitude",
    "second_destination_longitude",
    "origin_latitude",
    "destination_latitude",
    "second_destination_latitude",
]

X_train = X_train.drop(columns=raw_coordinate_cols)

X_val = X_val.drop(columns=raw_coordinate_cols)

In [24]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64553 entries, 22079 to 1544
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype                    
---  ------                              --------------  -----                    
 0   approximate_distance_meter          64553 non-null  int64                    
 1   final_price                         64553 non-null  float64                  
 2   waiting_time_enabled                64553 non-null  int64                    
 3   second_destination_final_price      64553 non-null  float64                  
 4   round_ride_final_price              64553 non-null  float64                  
 5   for_friend_enabled                  64553 non-null  int64                    
 6   intercity                           64553 non-null  int64                    
 7   request_datetime                    64553 non-null  datetime64[ns, UTC+03:30]
 8   requested_service_type              64553 non-null  object

In [25]:
X_train.nunique()

approximate_distance_meter            13346
final_price                             398
waiting_time_enabled                      2
second_destination_final_price         1586
round_ride_final_price                  573
for_friend_enabled                        2
intercity                                 2
request_datetime                      61270
requested_service_type                    5
days_since_passenger_first_ride        2375
days_since_passenger_first_request     2389
in_hurry_enabled                          2
treatment_group                           2
new_origin_latitude                   64553
new_origin_longitude                  64553
price_per_meter                       51603
request_hour                             24
request_weekday                           7
request_date                             10
dtype: int64

## Convert Some Features to Categorical

In [26]:
# Columns treated as categorical: fewer than 25 distinct values in the training
# split and an integer or object dtype (a convenient but arbitrary cut-off).
encodable_dtypes = ["int64", "object", "int32"]
low_cardinality_cols = [
    cname
    for cname, n_unique in X_train.nunique().items()
    if n_unique < 25 and X_train[cname].dtype in encodable_dtypes
]

In [27]:
from sklearn.preprocessing import LabelEncoder

def one_hot_encode(X_train, X_val, low_cardinality_cols):
    """One-hot encode selected columns of the train and validation frames.

    The encoder is fitted on ``X_train`` only and then applied to ``X_val``.
    With ``handle_unknown="ignore"``, a category that occurs only in ``X_val``
    is encoded as all zeros instead of raising.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features; both must contain every column in
        ``low_cardinality_cols``. Neither frame is modified.
    low_cardinality_cols : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and validation frames. The indicator columns come first,
        named by their position ("0", "1", ...), followed by the remaining
        columns of the input. All column labels are strings and the original
        row indices are kept.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old name; fall back to it only on releases that predate the rename.
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

    # The encoder returns plain arrays, so the row index is restored explicitly
    # to make the concat below align row by row.
    OH_X_train_cols = pd.DataFrame(
        ohe.fit_transform(X_train[low_cardinality_cols]), index=X_train.index
    )
    OH_X_valid_cols = pd.DataFrame(
        ohe.transform(X_val[low_cardinality_cols]), index=X_val.index
    )

    X_train_num_cols = X_train.drop(low_cardinality_cols, axis=1)
    X_valid_num_cols = X_val.drop(low_cardinality_cols, axis=1)

    OH_X_train = pd.concat([OH_X_train_cols, X_train_num_cols], axis=1)
    OH_X_valid = pd.concat([OH_X_valid_cols, X_valid_num_cols], axis=1)

    # The indicator columns are labelled with integers; cast every label to str
    # so the frame does not mix integer and string column names.
    OH_X_train.columns = OH_X_train.columns.astype(str)
    OH_X_valid.columns = OH_X_valid.columns.astype(str)

    return OH_X_train.copy(), OH_X_valid.copy()
    
# X_train, X_val = one_hot_encode(X_train, X_val, low_cardinality_cols)

# One fitted encoder is kept per column. A single shared LabelEncoder is
# overwritten by every fit, so after the loop only the last column's mapping
# would remain for decoding or for encoding new data.
label_encoders = {}
le = LabelEncoder()  # keeps `le` bound even when there is nothing to encode

for col in low_cardinality_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # transform() raises ValueError if the validation split contains a value
    # that did not occur in the training split.
    X_val[col] = le.transform(X_val[col])
    label_encoders[col] = le

In [28]:
X_train.head()

Unnamed: 0,approximate_distance_meter,final_price,waiting_time_enabled,second_destination_final_price,round_ride_final_price,for_friend_enabled,intercity,request_datetime,requested_service_type,days_since_passenger_first_ride,days_since_passenger_first_request,in_hurry_enabled,treatment_group,new_origin_latitude,new_origin_longitude,price_per_meter,request_hour,request_weekday,request_date
22079,8814,1880.0,0,0.0,0.0,0,0,2012-11-15 21:52:35+03:30,0,518,518,0,0,29.5,60.94,0.21,21,4,3
46684,4882,960.0,0,0.0,0.0,0,0,2012-11-14 16:09:09+03:30,0,977,977,0,0,29.46,60.98,0.2,16,6,2
48809,1268,700.0,0,0.0,0.0,0,0,2012-11-17 22:48:08+03:30,0,556,556,0,0,29.52,60.94,0.55,22,2,5
35402,3433,1140.0,0,0.0,0.0,0,0,2012-11-21 23:50:01+03:30,0,211,284,0,0,29.51,60.97,0.33,23,6,9
25827,4569,1060.0,0,0.0,0.0,0,0,2012-11-20 10:02:49+03:30,0,1403,1403,0,0,29.5,60.95,0.23,10,5,8


In [29]:
X_val.head()

Unnamed: 0,approximate_distance_meter,final_price,waiting_time_enabled,second_destination_final_price,round_ride_final_price,for_friend_enabled,intercity,request_datetime,requested_service_type,days_since_passenger_first_ride,days_since_passenger_first_request,in_hurry_enabled,treatment_group,new_origin_latitude,new_origin_longitude,price_per_meter,request_hour,request_weekday,request_date
69957,9260,1960.0,0,0.0,0.0,0,0,2012-11-13 16:06:46+03:30,0,364,364,1,0,29.46,60.92,0.21,16,5,1
63692,4217,1380.0,0,0.0,0.0,0,0,2012-11-18 18:03:58+03:30,0,219,230,0,0,29.53,60.96,0.33,18,3,6
1987,4699,1380.0,0,0.0,0.0,0,0,2012-11-12 21:01:08+03:30,0,207,207,0,0,29.52,60.96,0.29,21,1,0
50486,12673,2420.0,0,1339.33,0.0,0,0,2012-11-15 06:44:52+03:30,0,1244,1254,0,0,29.5,60.92,0.19,6,4,3
15756,3413,1180.0,0,0.0,0.0,0,0,2012-11-15 13:12:45+03:30,0,935,935,1,0,29.52,60.93,0.35,13,4,3


## Investigate the Range, Mean, and Standard Deviation of Numerical Features

In [30]:
# Descriptive statistics for numerical features
print("Descriptive statistics:")
X_train.loc[:,X_train.nunique()>24].describe().T

Descriptive statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
approximate_distance_meter,64553.0,5709.87,8497.37,0.0,2976.0,4662.0,6611.0,796911.0
final_price,64553.0,1308.0,795.56,200.0,960.0,1200.0,1480.0,64520.0
second_destination_final_price,64553.0,28.75,183.41,0.0,0.0,0.0,0.0,4829.41
round_ride_final_price,64553.0,9.81,105.76,0.0,0.0,0.0,0.0,3980.86
days_since_passenger_first_ride,64553.0,717.42,544.84,-1.0,279.0,601.0,1114.0,2517.0
days_since_passenger_first_request,64553.0,742.27,546.91,0.0,300.0,632.0,1143.0,2658.0
new_origin_latitude,64553.0,30.1,1.88,25.28,29.49,29.51,29.52,39.68
new_origin_longitude,64553.0,59.92,3.05,44.79,60.93,60.95,60.96,61.26
price_per_meter,64553.0,0.41,6.78,0.0,0.21,0.27,0.35,880.0


## Scaling the Numerical Features

In [31]:
X_train.drop(["request_datetime"], axis=1, inplace=True)
X_val.drop(["request_datetime"], axis=1, inplace=True)

In [32]:
uscaled_X_train = X_train.copy()
uscaled_X_val = X_val.copy()

In [33]:
uscaled_X_train.columns

Index(['approximate_distance_meter', 'final_price', 'waiting_time_enabled',
       'second_destination_final_price', 'round_ride_final_price',
       'for_friend_enabled', 'intercity', 'requested_service_type',
       'days_since_passenger_first_ride', 'days_since_passenger_first_request',
       'in_hurry_enabled', 'treatment_group', 'new_origin_latitude',
       'new_origin_longitude', 'price_per_meter', 'request_hour',
       'request_weekday', 'request_date'],
      dtype='object')

In [34]:
print("Descriptive statistics:")
uscaled_X_train.loc[:,uscaled_X_train.nunique()>24].describe().T

Descriptive statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
approximate_distance_meter,64553.0,5709.87,8497.37,0.0,2976.0,4662.0,6611.0,796911.0
final_price,64553.0,1308.0,795.56,200.0,960.0,1200.0,1480.0,64520.0
second_destination_final_price,64553.0,28.75,183.41,0.0,0.0,0.0,0.0,4829.41
round_ride_final_price,64553.0,9.81,105.76,0.0,0.0,0.0,0.0,3980.86
days_since_passenger_first_ride,64553.0,717.42,544.84,-1.0,279.0,601.0,1114.0,2517.0
days_since_passenger_first_request,64553.0,742.27,546.91,0.0,300.0,632.0,1143.0,2658.0
new_origin_latitude,64553.0,30.1,1.88,25.28,29.49,29.51,29.52,39.68
new_origin_longitude,64553.0,59.92,3.05,44.79,60.93,60.95,60.96,61.26
price_per_meter,64553.0,0.41,6.78,0.0,0.21,0.27,0.35,880.0


In [35]:
rob_scaler = RobustScaler()

standardable_feats = [ 'approximate_distance_meter', 'final_price', 'second_destination_final_price', 'round_ride_final_price',
                      'days_since_passenger_first_ride', 'days_since_passenger_first_request']
scaled_feat_names = ["scaled_" + col_name for col_name in standardable_feats]

# Fit the scaler once on all features. RobustScaler centres and scales each
# column independently, so the values match a per-column fit, but the fitted
# scaler now holds the statistics of every feature rather than only the last
# one, and can be reused on new data.
scaled_train = pd.DataFrame(
    rob_scaler.fit_transform(X_train[standardable_feats].to_numpy()),
    columns=scaled_feat_names,
    index=X_train.index,
)
# The validation split is transformed with the statistics learned on the training split.
scaled_val = pd.DataFrame(
    rob_scaler.transform(X_val[standardable_feats].to_numpy()),
    columns=scaled_feat_names,
    index=X_val.index,
)

# Replace the raw columns with their scaled versions, appended at the end.
X_train = pd.concat([X_train.drop(columns=standardable_feats), scaled_train], axis=1)
X_val = pd.concat([X_val.drop(columns=standardable_feats), scaled_val], axis=1)

In [36]:
print("Descriptive statistics:")
X_train.info()

Descriptive statistics:
<class 'pandas.core.frame.DataFrame'>
Index: 64553 entries, 22079 to 1544
Data columns (total 18 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   waiting_time_enabled                       64553 non-null  int64  
 1   for_friend_enabled                         64553 non-null  int64  
 2   intercity                                  64553 non-null  int64  
 3   requested_service_type                     64553 non-null  int32  
 4   in_hurry_enabled                           64553 non-null  int64  
 5   treatment_group                            64553 non-null  int32  
 6   new_origin_latitude                        64553 non-null  float64
 7   new_origin_longitude                       64553 non-null  float64
 8   price_per_meter                            64553 non-null  float64
 9   request_hour                               64553 non-null  int64  
 10  

## Fitting Models on Imbalanced Data

In [None]:
# NOTE(review): the commented-out lines below are an earlier variant that
# label-encoded `data` and re-split it; they are not executed.
# imbalanced_data = data.copy()

# # Label encoding for categoricals
# for colname in imbalanced_data.select_dtypes("object"):
#     imbalanced_data[colname], _ = imbalanced_data[colname].factorize()

# imbalanced_X = imbalanced_data.drop(['target', 'request_datetime'], axis=1)
# imbalanced_y = imbalanced_data['target']

# X_train, X_test, y_train, y_test = train_test_split(imbalanced_X, imbalanced_y, stratify = imbalanced_y, test_size=.25, random_state =123)

# Run LazyClassifier on the train split and evaluate on the validation split;
# the feature frames are passed as NumPy arrays.
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train.to_numpy(), X_val.to_numpy(), y_train, y_val)

# Show the comparison returned by LazyClassifier.
print(models)

In [None]:
# Randomized hyper-parameter search for XGBoost on the original (imbalanced)
# class distribution. The search is repeated on each of five stratified folds
# of the training split. Every fold's fitted search is scored on that fold's
# held-out part and on the separate validation split, and the scores are
# averaged over the folds.

sss = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

# The fold indices from sss.split() index plain arrays, so work on NumPy copies.
imbalanced_X_train = X_train.copy().values
imbalanced_X_val = X_val.copy().values
imbalanced_y_train = y_train.values
imbalanced_y_val = y_val.values

# Per-fold scores on the held-out part of the training split.
imbalanced_accuracy = []
imbalanced_precision = []
imbalanced_recall = []
imbalanced_f1 = []
imbalanced_auc = []
imbalanced_balanced_accuracy = []

# Per-fold scores on the validation split.
imbalanced_val_accuracy = []
imbalanced_val_precision = []
imbalanced_val_recall = []
imbalanced_val_f1 = []
imbalanced_val_auc = []
imbalanced_val_balanced_accuracy = []

# NOTE(review): n_estimators goes up to 10000, which can make a single
# candidate very slow to fit -- confirm this upper bound is intended.
xgb_params = {
 'learning_rate' :[1e-3, 1e-2, 0.1, 0.2, 0.3],
 'n_estimators' : [10, 100, 1000, 10000],
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2),
 'gamma':[i/10.0 for i in range(0,5)],
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)],
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
 }

# Seed both the candidate sampling and the booster (subsample and
# colsample_bytree are random) so that a re-run gives the same scores.
rand_xgb = RandomizedSearchCV(
    XGBClassifier(objective='binary:logistic', random_state=10),
    xgb_params,
    n_iter=4,
    random_state=10,
)


def _binary_scores(y_true, y_pred, y_prob):
    """Return accuracy, precision, recall, F1, ROC AUC and balanced accuracy, in that order."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, y_prob),
        balanced_accuracy_score(y_true, y_pred),
    )


# Same metric order as the tuple returned by _binary_scores.
fold_score_lists = [imbalanced_accuracy, imbalanced_precision, imbalanced_recall,
                    imbalanced_f1, imbalanced_auc, imbalanced_balanced_accuracy]
val_score_lists = [imbalanced_val_accuracy, imbalanced_val_precision, imbalanced_val_recall,
                   imbalanced_val_f1, imbalanced_val_auc, imbalanced_val_balanced_accuracy]

for train, test in sss.split(imbalanced_X_train, imbalanced_y_train):
    # No resampling step here: the pipeline only wraps the search so the
    # class distribution stays as it is.
    imbalanced_pipeline = make_pipeline(rand_xgb)
    imbalanced_model = imbalanced_pipeline.fit(imbalanced_X_train[train], imbalanced_y_train[train])
    best_est = rand_xgb.best_estimator_

    # Held-out part of this fold.
    imbalanced_prediction = imbalanced_model.predict(imbalanced_X_train[test])
    imbalanced_probability = imbalanced_model.predict_proba(imbalanced_X_train[test])[:, 1]
    fold_scores = _binary_scores(imbalanced_y_train[test], imbalanced_prediction, imbalanced_probability)
    for score_list, score in zip(fold_score_lists, fold_scores):
        score_list.append(score)

    # Separate validation split, scored with the model fitted on this fold.
    imbalanced_val_prediction = imbalanced_model.predict(imbalanced_X_val)
    imbalanced_val_probability = imbalanced_model.predict_proba(imbalanced_X_val)[:, 1]
    val_scores = _binary_scores(imbalanced_y_val, imbalanced_val_prediction, imbalanced_val_probability)
    for score_list, score in zip(val_score_lists, val_scores):
        score_list.append(score)


metric_labels = ["Accuracy Score", "Precision Score", "Recall Score", "F1 Score",
                 "ROC AUC", "Balanced Accuracy Score"]

# "Training Data" rows are means over the held-out folds of the training split.
for metric_label, score_list in zip(metric_labels, fold_score_lists):
    print("Training Data {}: {:.2f}".format(metric_label, np.mean(score_list)))
print("*"*100)
for metric_label, score_list in zip(metric_labels, val_score_lists):
    print("Validation Data {}: {:.2f}".format(metric_label, np.mean(score_list)))

In [None]:
def modelfit(alg, xtrain, ytrain, X_val, y_val, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost classifier, print train/validation metrics and plot feature scores.

    Parameters
    ----------
    alg : XGBoost scikit-learn style classifier
        Must expose ``get_xgb_params``, ``get_params``, ``set_params``, ``fit``,
        ``predict``, ``predict_proba`` and ``get_booster``. It is modified in
        place: fitted, and (when ``useTrainCV`` is true) its ``n_estimators``
        is overwritten.
    xtrain, ytrain : array-like
        Training features and binary labels.
    X_val, y_val : array-like
        Validation features and binary labels; used for reporting only.
    useTrainCV : bool, default True
        Run ``xgb.cv`` with early stopping on the training data first and set
        ``n_estimators`` to the number of boosting rounds it returns.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        Results are printed and drawn on the current matplotlib figure.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(xtrain, label=ytrain)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            # Stratified folds keep the class ratio of the imbalanced target
            # the same in every fold, so the AUC used for early stopping is
            # not distorted by folds with very few positives.
            stratified=True,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the full training data.
    alg.fit(xtrain, ytrain)

    # Predict the training set.
    dtrain_predictions = alg.predict(xtrain)
    dtrain_predprob = alg.predict_proba(xtrain)[:, 1]

    # Predict the validation set.
    dvalid_predictions = alg.predict(X_val)
    dvalid_predprob = alg.predict_proba(X_val)[:, 1]

    def report_generate(y_true, predictions, predprob, split_type):
        """Print the metric report for one split; ``split_type`` names it in the header."""
        print("\nModel Report on "+split_type)
        print("Accuracy : %.4g" % accuracy_score(y_true, predictions))
        print("F1 Score : %.4g" % f1_score(y_true, predictions))
        print("Precision Score : %.4g" % precision_score(y_true, predictions))
        print("Recall Score : %.4g" % recall_score(y_true, predictions))
        # These two lines used to carry a hard-coded "(Train)" suffix, which
        # mislabelled the validation report; the header already names the split.
        print("ROC AUC Score : %f" % roc_auc_score(y_true, predprob))
        print("Balanced Accuracy Score : %f" % balanced_accuracy_score(y_true, predictions))

    report_generate(ytrain, dtrain_predictions, dtrain_predprob, "Training Data")
    report_generate(y_val, dvalid_predictions, dvalid_predprob, "Validation Data")

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    if feat_imp.empty:
        # get_fscore() only lists features used in at least one split; with
        # nothing to draw, the bar plot would raise instead of rendering.
        print("\nNo feature importance scores to plot.")
        return
    ax = feat_imp.plot(kind='bar', title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')

In [None]:
# Baseline XGBoost configuration for the feature matrices in X_train / X_val.
# n_estimators acts as an upper bound: modelfit (useTrainCV defaults to True)
# replaces it with the number of rounds kept by xgb.cv early stopping.
baseline_xgb_settings = dict(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
xgb1 = XGBClassifier(**baseline_xgb_settings)

modelfit(xgb1, X_train.to_numpy(), y_train, X_val.to_numpy(), y_val)

In [None]:
# USING UNSCALED DATA
# Same baseline XGBoost configuration as the previous run, fitted on the
# unscaled feature matrices (uscaled_X_train / uscaled_X_val) instead.
unscaled_xgb_settings = dict(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
xgb1 = XGBClassifier(**unscaled_xgb_settings)

modelfit(xgb1, uscaled_X_train.to_numpy(), y_train, uscaled_X_val.to_numpy(), y_val)

## Use Undersampling to Deal with Imbalanced Dataset

In [None]:
# Quick model sweep (LazyClassifier) on a NearMiss-undersampled training set.

# NOTE(review): this copy is not read anywhere in this cell; it is kept only in
# case a cell outside this section relies on the name -- confirm and remove.
undersampled_data = data.copy()

# Undersample the majority class of the TRAINING split only.
nm = NearMiss(sampling_strategy='majority')
X_train_nearmiss, y_train_nearmiss = nm.fit_resample(X_train.values, y_train.values)

# The validation split is deliberately left untouched. Undersampling it as well
# (as this cell used to do) scores every model on an artificially balanced set,
# so the reported metrics would not reflect the real class distribution. The
# cross-validated cells of this notebook also score on the unmodified
# validation split.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train_nearmiss, X_val.values, y_train_nearmiss, y_val.values)

print(models)

In [None]:
# NearMiss undersampling evaluated with cross-validation.
#
# The sampler is a step of an imbalanced-learn pipeline, so it is fitted inside
# every fold on that fold's training rows only. Held-out fold rows and the
# validation split go through predict / predict_proba of the fitted pipeline.
# `sss` is the cross-validation splitter defined earlier in the notebook.
undersampled_X_train = X_train.copy().values
undersampled_X_val = X_val.copy().values
undersample_y_train = y_train.values
undersample_y_val = y_val.values

# Per-fold scores on the held-out rows of the training split.
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []
undersample_balanced_accuracy = []

# Per-fold scores of the same fitted pipeline on the validation split.
undersample_val_accuracy = []
undersample_val_precision = []
undersample_val_recall = []
undersample_val_f1 = []
undersample_val_auc = []
undersample_val_balanced_accuracy = []

# Hyper-parameter search space for the XGBoost classifier.
xgb_params = {
 'learning_rate' :[1e-3, 1e-2, 0.1, 0.2, 0.3],
 'n_estimators' : [10, 100, 1000, 10000],
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2),
 'gamma':[i/10.0 for i in range(0,5)],
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)],
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
 }

# random_state pins which 4 candidates are drawn from xgb_params. Without it
# the draw depends on NumPy's global random state, so every run of this cell
# evaluated a different set of models. 27 is the seed the xgb1 cells use.
rand_xgb = RandomizedSearchCV(
    XGBClassifier(objective= 'binary:logistic'),
    xgb_params,
    n_iter=4,
    random_state=27,
)

for train, test in sss.split(undersampled_X_train, undersample_y_train):
    # NearMiss runs inside each CV fold, not before the split.
    # NOTE(review): the search's own internal CV runs on the rows NearMiss has
    # already resampled, because the sampler precedes it in the pipeline.
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), rand_xgb)
    undersample_model = undersample_pipeline.fit(undersampled_X_train[train], undersample_y_train[train])
    best_est = rand_xgb.best_estimator_

    # Scores on this fold's held-out rows.
    fold_X = undersampled_X_train[test]
    fold_y = undersample_y_train[test]
    undersample_prediction = undersample_model.predict(fold_X)
    undersample_accuracy.append(accuracy_score(fold_y, undersample_prediction))
    undersample_precision.append(precision_score(fold_y, undersample_prediction))
    undersample_recall.append(recall_score(fold_y, undersample_prediction))
    undersample_f1.append(f1_score(fold_y, undersample_prediction))
    undersample_auc.append(roc_auc_score(fold_y, undersample_model.predict_proba(fold_X)[:, 1]))
    undersample_balanced_accuracy.append(balanced_accuracy_score(fold_y, undersample_prediction))

    # Scores of the same fitted pipeline on the validation split.
    undersample_val_prediction = undersample_model.predict(undersampled_X_val)
    undersample_val_accuracy.append(accuracy_score(undersample_y_val, undersample_val_prediction))
    undersample_val_precision.append(precision_score(undersample_y_val, undersample_val_prediction))
    undersample_val_recall.append(recall_score(undersample_y_val, undersample_val_prediction))
    undersample_val_f1.append(f1_score(undersample_y_val, undersample_val_prediction))
    undersample_val_auc.append(roc_auc_score(undersample_y_val, undersample_model.predict_proba(undersampled_X_val)[:, 1]))
    undersample_val_balanced_accuracy.append(balanced_accuracy_score(undersample_y_val, undersample_val_prediction))


# "Training Data" rows are the means over the held-out fold scores above.
print("Training Data Accuracy Score: {:.2f}".format(np.mean(undersample_accuracy)))
print("Training Data Precision Score: {:.2f}".format(np.mean(undersample_precision)))
print("Training Data Recall Score: {:.2f}".format(np.mean(undersample_recall)))
print("Training Data F1 Score: {:.2f}".format(np.mean(undersample_f1)))
print("Training Data ROC AUC: {:.2f}".format(np.mean(undersample_auc)))
print("Training Data Balanced Accuracy Score: {:.2f}".format(np.mean(undersample_balanced_accuracy)))
print("*"*100)
print("Validation Data Accuracy Score: {:.2f}".format(np.mean(undersample_val_accuracy)))
print("Validation Data Precision Score: {:.2f}".format(np.mean(undersample_val_precision)))
print("Validation Data Recall Score: {:.2f}".format(np.mean(undersample_val_recall)))
print("Validation Data F1 Score: {:.2f}".format(np.mean(undersample_val_f1)))
print("Validation Data ROC AUC: {:.2f}".format(np.mean(undersample_val_auc)))
print("Validation Data Balanced Accuracy Score: {:.2f}".format(np.mean(undersample_val_balanced_accuracy)))

In [None]:
# Baseline XGBoost configuration fitted on a NearMiss-undersampled training set.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# Undersample the majority class of the TRAINING split only.
nm = NearMiss(sampling_strategy='majority')
X_train_nearmiss, y_train_nearmiss = nm.fit_resample(X_train.values, y_train.values)

# The validation split is passed through unchanged. Undersampling it too (as
# this cell used to do) makes modelfit's "Validation Data" report describe an
# artificially balanced set rather than the real class distribution.
modelfit(xgb1, X_train_nearmiss, y_train_nearmiss, X_val.values, y_val.values)

## Use Oversampling to Deal with Imbalanced Dataset

In [None]:
# Quick model sweep (LazyClassifier) on a SMOTE-oversampled training set.

# Oversample the minority class of the TRAINING split only. random_state makes
# the synthetic rows, and therefore the leaderboard below, reproducible.
# 27 is the seed the xgb1 cells use.
smt = SMOTE(sampling_strategy='minority', random_state=27)
X_train_smote, y_train_smote = smt.fit_resample(X_train.values, y_train.values)

# The validation split is deliberately left untouched. Applying SMOTE to it (as
# this cell used to do) scores every model partly on synthetic rows and on an
# artificially balanced class ratio. The cross-validated cells of this notebook
# also score on the unmodified validation split.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train_smote, X_val.values, y_train_smote, y_val.values)

print(models)

In [None]:
# SMOTE oversampling evaluated with cross-validation.
#
# The sampler is a step of an imbalanced-learn pipeline, so it is fitted inside
# every fold on that fold's training rows only. Held-out fold rows and the
# validation split go through predict / predict_proba of the fitted pipeline.
# `sss` is the cross-validation splitter defined earlier in the notebook.
oversampled_X_train = X_train.copy().values
oversampled_X_val = X_val.copy().values
oversample_y_train = y_train.values
oversample_y_val = y_val.values

# Per-fold scores on the held-out rows of the training split.
oversample_accuracy = []
oversample_precision = []
oversample_recall = []
oversample_f1 = []
oversample_auc = []
oversample_balanced_accuracy = []

# Per-fold scores of the same fitted pipeline on the validation split.
oversample_val_accuracy = []
oversample_val_precision = []
oversample_val_recall = []
oversample_val_f1 = []
oversample_val_auc = []
oversample_val_balanced_accuracy = []

# Hyper-parameter search space for the XGBoost classifier.
xgb_params = {
 'learning_rate' :[1e-3, 1e-2, 0.1, 0.2, 0.3],
 'n_estimators' : [10, 100, 1000, 10000],
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2),
 'gamma':[i/10.0 for i in range(0,5)],
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)],
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
 }

# random_state pins which 4 candidates are drawn from xgb_params. Without it
# the draw depends on NumPy's global random state, so every run of this cell
# evaluated a different set of models. 27 is the seed the xgb1 cells use.
rand_xgb = RandomizedSearchCV(
    XGBClassifier(objective= 'binary:logistic'),
    xgb_params,
    n_iter=4,
    random_state=27,
)


for train, test in sss.split(oversampled_X_train, oversample_y_train):
    # SMOTE runs inside each CV fold, not before the split; it is seeded so the
    # synthetic rows are the same on every run.
    # NOTE(review): the search's own internal CV runs on rows SMOTE has already
    # generated, because the sampler precedes it in the pipeline.
    oversample_pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority', random_state=27), rand_xgb)
    oversample_model = oversample_pipeline.fit(oversampled_X_train[train], oversample_y_train[train])
    best_est = rand_xgb.best_estimator_

    # Scores on this fold's held-out rows.
    fold_X = oversampled_X_train[test]
    fold_y = oversample_y_train[test]
    oversample_prediction = oversample_model.predict(fold_X)
    oversample_accuracy.append(accuracy_score(fold_y, oversample_prediction))
    oversample_precision.append(precision_score(fold_y, oversample_prediction))
    oversample_recall.append(recall_score(fold_y, oversample_prediction))
    oversample_f1.append(f1_score(fold_y, oversample_prediction))
    oversample_auc.append(roc_auc_score(fold_y, oversample_model.predict_proba(fold_X)[:, 1]))
    oversample_balanced_accuracy.append(balanced_accuracy_score(fold_y, oversample_prediction))

    # Scores of the same fitted pipeline on the validation split.
    oversample_val_prediction = oversample_model.predict(oversampled_X_val)
    oversample_val_accuracy.append(accuracy_score(oversample_y_val, oversample_val_prediction))
    oversample_val_precision.append(precision_score(oversample_y_val, oversample_val_prediction))
    oversample_val_recall.append(recall_score(oversample_y_val, oversample_val_prediction))
    oversample_val_f1.append(f1_score(oversample_y_val, oversample_val_prediction))
    oversample_val_auc.append(roc_auc_score(oversample_y_val, oversample_model.predict_proba(oversampled_X_val)[:, 1]))
    oversample_val_balanced_accuracy.append(balanced_accuracy_score(oversample_y_val, oversample_val_prediction))


# "Training Data" rows are the means over the held-out fold scores above.
print("Training Data Accuracy Score: {:.2f}".format(np.mean(oversample_accuracy)))
print("Training Data Precision Score: {:.2f}".format(np.mean(oversample_precision)))
print("Training Data Recall Score: {:.2f}".format(np.mean(oversample_recall)))
print("Training Data F1 Score: {:.2f}".format(np.mean(oversample_f1)))
print("Training Data ROC AUC: {:.2f}".format(np.mean(oversample_auc)))
print("Training Data Balanced Accuracy Score: {:.2f}".format(np.mean(oversample_balanced_accuracy)))
print("*"*100)
print("Validation Data Accuracy Score: {:.2f}".format(np.mean(oversample_val_accuracy)))
print("Validation Data Precision Score: {:.2f}".format(np.mean(oversample_val_precision)))
print("Validation Data Recall Score: {:.2f}".format(np.mean(oversample_val_recall)))
print("Validation Data F1 Score: {:.2f}".format(np.mean(oversample_val_f1)))
print("Validation Data ROC AUC: {:.2f}".format(np.mean(oversample_val_auc)))
print("Validation Data Balanced Accuracy Score: {:.2f}".format(np.mean(oversample_val_balanced_accuracy)))

In [None]:
# Baseline XGBoost configuration fitted on a SMOTE-oversampled training set.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# Oversample the minority class of the TRAINING split only. random_state makes
# the synthetic rows reproducible; 27 matches the classifier seed above.
smt = SMOTE(sampling_strategy='minority', random_state=27)
X_train_smote, y_train_smote = smt.fit_resample(X_train.values, y_train.values)

# The validation split is passed through unchanged. Applying SMOTE to it (as
# this cell used to do) makes modelfit's "Validation Data" report partly
# describe synthetic rows and an artificially balanced class ratio.
modelfit(xgb1, X_train_smote, y_train_smote, X_val.values, y_val.values)