# **0- Setup**

In [1]:
# Show the GPU attached to this runtime (model, driver version, memory in use).
!nvidia-smi

Mon Mar 29 05:42:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Colab-only: mount Google Drive under /content/drive so files stored there
# can be read through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Unpack the password-protected dataset archive from Drive into the current
# working directory.
import os
from getpass import getpass
from zipfile import ZipFile

path = '/content/drive/MyDrive/UHSendy.zip'

# The archive password must not live in the notebook: take it from the
# UHSENDY_ZIP_PASSWORD environment variable, or prompt for it (input hidden)
# when the variable is not set.
zip_password = os.environ.get('UHSENDY_ZIP_PASSWORD') or getpass('UHSendy.zip password: ')

with ZipFile(path) as zf:
    # ZipFile expects the password as bytes.
    zf.extractall(pwd=zip_password.encode())

In [23]:
!pip install catboost==0.22 --quiet

[K     |████████████████████████████████| 64.4MB 45kB/s 
[?25h

In [1]:
import pandas as  pd 
import numpy as np 
import warnings
warnings.simplefilter('ignore')

from sklearn.metrics import accuracy_score
import random

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier ,Pool
from sklearn.model_selection import train_test_split

In [6]:
class CFG:
  """Central place for the notebook's tunable settings."""

  # Keyword arguments unpacked into CatBoostClassifier(...).
  catboost_params = dict(
      learning_rate=0.05,
      iterations=10000,
      random_seed=0,
      use_best_model=True,
      verbose=100,
      task_type="GPU",
      devices='0:1',
  )

  # Columns passed to CatBoost as categorical features.
  catfs = ['order_id', 'client_id', 'rider_id', 'client_type']

  # Columns excluded when the model's feature list is built.
  to_drop = ['ID', 'target', 'vendor_type', 'dispatch_time', "dispatch_hour"]

  # random_state and test_size used for the train/validation split.
  tts_seed = 3031
  tts_ts = 0.2

# **1- Pre-Processing**

## **1.1 Utils**

In [7]:
def seed_env(seed = 0):
  """Seed Python's `random` module and NumPy's global RNG with `seed`."""
  for seeder in (random.seed, np.random.seed):
    seeder(seed)

In [8]:
def bearing(lat1, lng1, lat2, lng2):
    """Bearing from point 1 to point 2, in degrees.

    All four arguments are coordinates in degrees; scalars and array-likes
    (NumPy arrays, pandas Series) are handled element-wise. The result comes
    from ``arctan2`` and therefore lies in [-180, 180].
    """
    # The longitude difference is taken in degrees first, then converted.
    delta_lng = np.radians(lng2 - lng1)
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)

    east = np.sin(delta_lng) * np.cos(phi2)
    north = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(delta_lng)

    return np.degrees(np.arctan2(east, north))

In [9]:
def get_time_features(data):
  """Parse `dispatch_time` and add `dispatch_hour`, modifying `data` in place.

  `data['dispatch_time']` is replaced by its datetime-parsed version and a new
  `dispatch_hour` column holds the hour component. Nothing is returned.
  """
  parsed = pd.to_datetime(data['dispatch_time'])
  data['dispatch_time'] = parsed
  data['dispatch_hour'] = parsed.dt.hour

In [10]:
def process(train,test,riders) :
  """Build the model-ready train and test frames.

  Train and test are stacked so every feature (label encodings, per-order
  aggregates) is computed over both sets at once, then split apart again by
  `ID`. The input frames are not modified.

  Parameters
  ----------
  train, test : pd.DataFrame
      Raw order/rider rows. Columns read here: `ID`, `rider_id`, `order_id`,
      `client_type`, `dispatch_time`, `dispatch_day`, `rider_lat`,
      `rider_long`, `pickup_lat`, `pickup_long`.
  riders : pd.DataFrame
      Rider attributes keyed by `Rider ID`; must provide
      `Average Partner Rating` and `Number of Ratings`.

  Returns
  -------
  (train, test) : tuple of pd.DataFrame
      Independent copies of the processed rows whose `ID` is, respectively is
      not, one of the input `train` IDs. The original stacked index is kept.
  """
  # Capture the training IDs up front so the final split never depends on a
  # rebound `train` variable.
  train_ids = train['ID'].values

  data = pd.concat([train,test]).reset_index(drop=True)

  # Attach rider attributes to every row.
  riders = riders.rename(columns = {'Rider ID' : "rider_id"})
  data = data.merge(riders, on = 'rider_id', how='left')

  # Adds `dispatch_hour` (and parses `dispatch_time`) in place.
  get_time_features(data)

  # Rider x hour interaction, label-encoded below.
  data['comb1'] = data['rider_id'].astype(str) + '-' + data['dispatch_hour'].astype(str)

  data['RatingFactor_Rider'] = data['Average Partner Rating']*data['Number of Ratings']

  # The encoder is refitted per column, on train and test values together.
  LE = LabelEncoder()
  for feature in ('client_type', 'comb1'):
    data[feature] = LE.fit_transform(data[feature].astype(str))

  data['bearing_Rider_PickUp'] = bearing(data['rider_lat'], data['rider_long'], data['pickup_lat'], data['pickup_long'])

  # Threshold flags (cut-offs come from EDA). Vectorised comparisons give the
  # same 0/1 values as a row-wise lambda; missing values compare False -> 0.
  data['feature_eng_1'] = (data['Number of Ratings'] > 150).astype('int64')
  data['feature_eng_2'] = (data['Average Partner Rating'] > 21).astype('int64')

  # Per-order statistics of the rider->pickup bearing, broadcast to each row.
  bearing_stats = data.groupby('order_id')['bearing_Rider_PickUp'].agg(['mean','min','max','std'])
  bearing_stats.columns = ['bearing_Rider_PickUp_By_OrderId_' + stat for stat in bearing_stats.columns]
  data = data.join(bearing_stats, on='order_id')

  # NOTE(review): `dispatch_day` is fed to cos/sin as raw radians, not scaled by
  # 2*pi/period, so this is not a true cyclic encoding. Left unchanged so the
  # features stay identical to the ones the model was tuned on.
  data['dispatch_day_cos'] = np.cos(data['dispatch_day'])
  data['dispatch_day_sin'] = np.sin(data['dispatch_day'])

  # Split back into train/test; copies avoid chained-assignment warnings if the
  # caller later adds columns.
  is_train = data['ID'].isin(train_ids)
  return data[is_train].copy() , data[~is_train].copy()

## **1.2 Create Processed Train-Test**

In [11]:
# Load the three raw tables from the UHSendy/ directory (relative to the
# working directory): the labelled training rows, the unlabelled test rows, and
# the per-rider attributes.
train = pd.read_csv('UHSendy/Train.csv')
test = pd.read_csv('UHSendy/Test.csv')
riders = pd.read_csv('UHSendy/Riders.csv')

In [12]:
%%time
# Replace the raw train/test frames with their processed versions.
# NOTE(review): this overwrites the cell's own inputs, so the cell is not
# idempotent — reload the CSVs (previous cell) before running it again.
train , test  = process(train,test,riders)

CPU times: user 12.5 s, sys: 131 ms, total: 12.7 s
Wall time: 12.7 s


In [13]:
# Sanity check: both frames should have the same number of columns.
train.shape , test.shape

((179867, 36), (76791, 36))

In [14]:
# Preview the processed training rows, including the engineered columns.
train.head()

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,rider_id,rider_license_status,rider_carrier_type,rider_amount,rider_lat,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target,Active Rider Age,Average Partner Rating,Number of Ratings,dispatch_hour,comb1,RatingFactor_Rider,bearing_Rider_PickUp,feature_eng_1,feature_eng_2,bearing_Rider_PickUp_By_OrderId_mean,bearing_Rider_PickUp_By_OrderId_min,bearing_Rider_PickUp_By_OrderId_max,bearing_Rider_PickUp_By_OrderId_std,dispatch_day_cos,dispatch_day_sin
0,ID_SCUW21PVAU,4435,27,6,2021-03-29 09:02:54,593630,0,0,2,Bike,30153,0,1,1080,-42.698343,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1.0,11,10.0,1,9,3076,10.0,-67.659597,0,0,-52.037239,-70.030817,-18.421302,29.136388,-0.292139,0.956376
1,ID_2HA7X30JMN,32711,30,7,2021-03-29 13:01:37,837729,1,0,1,Bike,20884,0,1,730,-42.787317,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2.0,68,24.13,229,13,2743,5525.77,-22.752722,1,1,-22.752722,-22.752722,-22.752722,,0.154251,-0.988032
2,ID_IAJWDTBY6M,8712,14,2,2021-03-29 10:01:00,695129,1,0,2,Bike,33143,1,1,490,-42.74918,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1.0,273,24.92,123,10,3163,3065.16,-164.786368,0,1,-19.722334,-164.786368,165.004835,110.149486,0.136737,0.990607
3,ID_LKSVPNYMTR,44869,22,3,2021-03-29 14:11:16,1504660,1,0,2,Bike,96531,1,1,510,-42.836266,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2.0,168,23.76,175,14,6926,4158.0,-29.736604,1,1,13.261555,-29.736604,56.259714,60.80858,-0.999961,-0.008851
4,ID_O7N8Y918YH,57590,27,5,2021-03-29 16:11:38,36869,0,0,2,Bike,103546,0,0,400,-42.828195,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0.0,95,24.53,42,16,583,1030.26,156.29368,0,1,31.471027,-140.023959,179.043252,122.179963,-0.292139,0.956376


# **2- Modeling**

In [15]:
# Model inputs: every processed column except those listed in CFG.to_drop
# (labels absent from the frame are simply ignored).
features = train.columns.drop(CFG.to_drop, errors='ignore').tolist()
len(features)

31

In [16]:
# Hold out a CFG.tts_ts share of the labelled rows for validation, stratified
# on the target and seeded with CFG.tts_seed.
X, y = train[features], train.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=CFG.tts_ts,
    random_state=CFG.tts_seed,
    stratify=y,
)

In [19]:
def Catboost(X_train, X_test, y_train, y_test,categorical_features) :
  """Fit a CatBoostClassifier and report its accuracy on the hold-out split.

  The model is built from `CFG.catboost_params`, trained on
  (`X_train`, `y_train`) and evaluated on (`X_test`, `y_test`) with early
  stopping after 200 rounds without improvement. `categorical_features` names
  the columns CatBoost should treat as categorical in both pools.

  Returns the hold-out predictions and the fitted model, in that order.
  """
  seed_env()

  train_pool = Pool(X_train, y_train, cat_features=categorical_features)
  valid_pool = Pool(X_test, y_test, cat_features=categorical_features)

  model = CatBoostClassifier(**CFG.catboost_params)
  model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=200)

  pred = model.predict(X_test)
  print('score : ' , accuracy_score(y_test,pred))
  return pred , model

In [20]:
# Train on the split above; the hold-out predictions are discarded and only the
# fitted model is kept for scoring the test set.
_ , model = Catboost(X_train, X_test, y_train, y_test,CFG.catfs)

0:	learn: 1.0732264	test: 1.0743475	best: 1.0743475 (0)	total: 18.1ms	remaining: 3m 1s
100:	learn: 0.7899893	test: 0.7854243	best: 0.7854243 (100)	total: 1.39s	remaining: 2m 15s
200:	learn: 0.7749337	test: 0.7719595	best: 0.7719595 (200)	total: 2.52s	remaining: 2m 3s
300:	learn: 0.7651347	test: 0.7648160	best: 0.7648160 (300)	total: 3.66s	remaining: 1m 57s
400:	learn: 0.7576502	test: 0.7602079	best: 0.7602079 (400)	total: 4.77s	remaining: 1m 54s
500:	learn: 0.7512861	test: 0.7569831	best: 0.7569831 (500)	total: 5.84s	remaining: 1m 50s
600:	learn: 0.7459381	test: 0.7546436	best: 0.7546436 (600)	total: 6.91s	remaining: 1m 48s
700:	learn: 0.7410864	test: 0.7526755	best: 0.7526755 (700)	total: 7.99s	remaining: 1m 45s
800:	learn: 0.7370023	test: 0.7513401	best: 0.7513401 (800)	total: 9.11s	remaining: 1m 44s
900:	learn: 0.7328316	test: 0.7500171	best: 0.7500171 (900)	total: 10.2s	remaining: 1m 43s
1000:	learn: 0.7290066	test: 0.7489914	best: 0.7489914 (1000)	total: 11.3s	remaining: 1m 41s
11

In [21]:
# Predict the class of every test row, using the same feature columns as training.
Catb_Predictions = model.predict(test[features])

In [22]:
# create submission file 
submission_cat = pd.DataFrame()
submission_cat['ID'] = test['ID']
submission_cat['target'] = Catb_Predictions
submission_cat['target'] = submission_cat['target'].astype('int')

In [23]:
# Write the submission to the working directory, without the DataFrame index.
submission_cat.to_csv('UmojaHack_Challenge#2_Top_5_Notebook.csv',index=False)