In [1]:
#~/Library/Jupyter/nbextensions/snippets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from tqdm import tqdm_notebook, tqdm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error as mae, mean_squared_error as mse
import xgboost
from xgboost import DMatrix
import catboost
from sklearn.model_selection import StratifiedKFold
from semenov import *
import gc
from sklearn.model_selection import KFold



In [2]:
# Load the training set; cells equal to -1 are parsed as missing values (NaN).
train = pd.read_csv("CAX_TrainingData_McK.csv", na_values=-1)
# Preview the first rows.
train.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,,,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,,,Economy,private,1
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1


In [3]:
# Load the test set with the same -1 -> NaN convention as the training set.
test = pd.read_csv("CAX_TestData_McK.csv", na_values=-1)
# Preview the first two rows.
test.head(2)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,152446,5,0,5021,648419,55.763302,37.593368,55.75823,37.613689,17.445,24.367,Premium,private,
1,281031,5,7,5817,405907,55.75547,37.648689,55.741544,37.622868,,,Economy,private,


In [4]:
# Per-driver aggregate features. They are computed on the training set only
# and then joined onto both frames by driver_gk; test rows whose driver does
# not occur in train keep NaN because of the left join.
#
# The aggregations avoid the dict-renaming form `agg({"new_name": func})`,
# which newer pandas rejects for a single selected column, and they produce a
# deterministic column order (<column>_mean, then <column>_std).
#
# NOTE(review): "offer_number" is the per-driver *mean* of the offer_gk
# identifier, although the variable and column names suggest a count of
# offers was intended. The aggregation is left as "mean" so the feature
# values do not change -- confirm whether "count" was meant.
driver_offer_count = (
    train.groupby("driver_gk")["offer_gk"]
    .mean()
    .rename("offer_number")
    .to_frame()
)

train = train.merge(driver_offer_count, left_on="driver_gk",
                    right_index=True)
test = test.merge(driver_offer_count, how="left", left_on="driver_gk",
                  right_index=True)

for column in ["hour_key", "driver_latitude", "driver_longitude",
               "origin_order_latitude", "origin_order_longitude",
               "distance_km"]:

    # agg with a list of functions yields columns named "mean" and "std";
    # add_prefix turns them into "<column>_mean" and "<column>_std".
    current_mean = (
        train.groupby("driver_gk")[column]
        .agg(["mean", "std"])
        .add_prefix(column + "_")
        .reset_index()
    )
    train = train.merge(current_mean, on="driver_gk")
    test = test.merge(current_mean, how="left", on="driver_gk")

In [5]:
# Preview train to check the per-driver aggregate columns.
train.head(2)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,...,driver_latitude_mean,driver_latitude_std,driver_longitude_mean,driver_longitude_std,origin_order_latitude_mean,origin_order_latitude_std,origin_order_longitude_mean,origin_order_longitude_std,distance_km_std,distance_km_mean
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,,...,55.80767,0.061009,37.469685,0.12658,55.806782,0.059814,37.471784,0.122264,10.123053,13.486
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,...,55.80767,0.061009,37.469685,0.12658,55.806782,0.059814,37.471784,0.122264,10.123053,13.486


In [6]:
# Preview test to check the per-driver aggregate columns.
test.head(2)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,...,driver_latitude_mean,driver_latitude_std,driver_longitude_mean,driver_longitude_std,origin_order_latitude_mean,origin_order_latitude_std,origin_order_longitude_mean,origin_order_longitude_std,distance_km_std,distance_km_mean
0,152446,5,0,5021,648419,55.763302,37.593368,55.75823,37.613689,17.445,...,55.759794,0.042962,37.575799,0.061878,55.757882,0.043078,37.576099,0.071145,13.179103,13.16631
1,281031,5,7,5817,405907,55.75547,37.648689,55.741544,37.622868,,...,55.717128,0.120089,37.6315,0.139406,55.715542,0.124105,37.632219,0.140337,17.34029,16.215314


In [7]:
def ride_type_desc_transform(x):
    """Map a ride-type label to an integer code.

    "private" -> 0, "business" -> 1, any other value -> 2.
    """
    # The position of the label in the tuple is its code.
    for code, label in enumerate(("private", "business")):
        if x == label:
            return code
    return 2
    
def offer_class_group_transform(x):
    """Binary-encode the offer class: "Economy" -> 0, any other value -> 1."""
    return 0 if x == "Economy" else 1
    
# Split the target off the training frame; the response column of the test
# frame is removed and discarded.
y = train.pop("driver_response")
test.pop("driver_response")

# Identical clean-up for both frames: drop the offer/order identifiers and
# replace the two categorical columns with their integer codes.
for frame in (train, test):
    frame.drop(["offer_gk", "order_gk"], axis=1, inplace=True)
    frame["ride_type_desc"] = frame["ride_type_desc"].apply(ride_type_desc_transform)
    frame["offer_class_group"] = frame["offer_class_group"].apply(offer_class_group_transform)

In [8]:
# Preview train after the id columns were dropped and the categoricals encoded.
train.head(2)

Unnamed: 0,weekday_key,hour_key,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,...,driver_latitude_mean,driver_latitude_std,driver_longitude_mean,driver_longitude_std,origin_order_latitude_mean,origin_order_latitude_std,origin_order_longitude_mean,origin_order_longitude_std,distance_km_std,distance_km_mean
0,5,20,6080,55.818842,37.334562,55.814567,37.35501,,,0,...,55.80767,0.061009,37.469685,0.12658,55.806782,0.059814,37.471784,0.122264,10.123053,13.486
1,5,14,6080,55.805342,37.515023,55.819329,37.466398,18.802,25.217,1,...,55.80767,0.061009,37.469685,0.12658,55.806782,0.059814,37.471784,0.122264,10.123053,13.486


In [9]:
# Preview test after the id columns were dropped and the categoricals encoded.
test.head(2)

Unnamed: 0,weekday_key,hour_key,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,...,driver_latitude_mean,driver_latitude_std,driver_longitude_mean,driver_longitude_std,origin_order_latitude_mean,origin_order_latitude_std,origin_order_longitude_mean,origin_order_longitude_std,distance_km_std,distance_km_mean
0,5,0,5021,55.763302,37.593368,55.75823,37.613689,17.445,24.367,1,...,55.759794,0.042962,37.575799,0.061878,55.757882,0.043078,37.576099,0.071145,13.179103,13.16631
1,5,7,5817,55.75547,37.648689,55.741544,37.622868,,,0,...,55.717128,0.120089,37.6315,0.139406,55.715542,0.124105,37.632219,0.140337,17.34029,16.215314


In [10]:
# Default value of the `C` argument that the encoding helpers pass to
# SemenovEncoding.
STAS_CONST = 5

def semenov_prep(X_train, y_train, X_val, features, C=STAS_CONST):
    """Encode `features` for a train/validation pair with SemenovEncoding.

    The validation frame is transformed by an encoder fitted on all of
    X_train. The training frame is transformed out-of-fold: X_train is split
    into 4 shuffled folds and every fold is transformed by an encoder that
    was fitted on the other folds only, so the encoder applied to a training
    row was never fitted on that row or its target.

    Parameters
    ----------
    X_train, y_train : training features and target (indexed with .iloc).
    X_val : frame to transform with the encoder fitted on all of X_train.
    features : column names forwarded to SemenovEncoding.fit.
    C : forwarded to the SemenovEncoding constructor.

    Returns
    -------
    (X_train_new, X_val) : the out-of-fold encoded training frame and the
    encoded validation frame.
    """
    encoder = SemenovEncoding(C=C)
    encoder.fit(X_train, y_train, features=features)
    X_val = encoder.transform(X_val)

    # Float64 container with the training index and the columns of the
    # encoded validation frame; it is filled one fold at a time below.
    X_train_new = pd.DataFrame(
        index=X_train.index,
        columns=X_val.columns,
        dtype=np.float64,
    )

    # Split X_train into folds and compute the encoded features for each fold
    # from an encoder fitted on the remaining folds.
    splitter = KFold(n_splits=4, shuffle=True, random_state=32578)
    for fit_rows, encode_rows in splitter.split(X_train):
        encoder = SemenovEncoding(C=C)
        encoder.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows], features=features)
        X_train_new.iloc[encode_rows] = encoder.transform(X_train.iloc[encode_rows])

    gc.collect()
    return X_train_new, X_val

def semenov_prep_test(X, y, X_test, features, C=STAS_CONST):
    """Fit SemenovEncoding on (X, y) and return the encoded X_test.

    After the transform, every column listed in `features` that is present
    in the result is dropped, except "driver_gk", which is kept.

    Parameters
    ----------
    X, y : frame and target the encoder is fitted on.
    X_test : frame to transform.
    features : column names forwarded to SemenovEncoding.fit.
    C : forwarded to the SemenovEncoding constructor.
    """
    encoder = SemenovEncoding(C=C)
    encoder.fit(X, y, features=features)
    encoded = encoder.transform(X_test)

    # Source columns to remove: listed features that exist in the encoded
    # frame, minus driver_gk.
    present = set(features) & set(encoded.columns)
    encoded.drop(list(present - {"driver_gk"}), axis=1, inplace=True)

    gc.collect()
    return encoded
    
def data_prep(X_train, y_train, X_val, y_val, stas=None):
    """Encode one train/validation split and drop the raw encoded columns.

    Runs semenov_prep on the split, then removes from both resulting frames
    every column listed in `stas` that is present, except "driver_gk".

    Parameters
    ----------
    X_train, y_train : training features and target for this split.
    X_val, y_val : validation features and target for this split.
    stas : list of column names to encode; None (the default) means no
        columns, which is what the former `[]` default meant.

    Returns
    -------
    (X_train, y_train, X_val, y_val) : the two encoded frames together with
    the targets, which are passed through untouched.
    """
    # A None sentinel replaces the mutable `[]` default so that no list
    # object is shared between calls.
    if stas is None:
        stas = []

    X_train, X_val = semenov_prep(X_train, y_train, X_val, features=stas)

    for frame in (X_train, X_val):
        # Parentheses spell out the intended set arithmetic explicitly.
        raw_columns = (set(stas) & set(frame.columns)) - {"driver_gk"}
        frame.drop(list(raw_columns), axis=1, inplace=True)

    gc.collect()
    return X_train, y_train, X_val, y_val

In [11]:
# Stratified 5-fold splitter, shuffled with a fixed seed so the folds are reproducible.
cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [12]:
# Base columns that are passed as `features` to the SemenovEncoding helpers.
stas = ["weekday_key", "hour_key", "driver_gk",
        "offer_class_group", "ride_type_desc"]

In [13]:
# Add one combined string column "<first>_<second>" for every ordered pair of
# distinct base features (so both a_b and b_a are produced) to train and test,
# and extend the feature list with the new column names.
#
# NOTE: the cell reassigns `stas`, so running it a second time would build
# pairs of the already combined columns.
new_stas = stas[:]

for column_first in tqdm_notebook(stas):
    for column_second in tqdm_notebook(stas):
        if column_first == column_second:
            continue

        new_col = column_first + "_" + column_second

        # Guard on the name that is actually created. The previous check
        # looked for the concatenation without the underscore, which never
        # exists, so it could not skip anything.
        if new_col not in train.columns:
            train[new_col] = (train[column_first].astype(str) + "_"
                              + train[column_second].astype(str))
            test[new_col] = (test[column_first].astype(str) + "_"
                             + test[column_second].astype(str))

        if new_col not in new_stas:
            new_stas.append(new_col)

stas = new_stas




In [14]:
# Fit the encoding on the full training set and apply it to the test set;
# `test` is replaced by the encoded frame.
test = semenov_prep_test(train, y, test, features=stas)
test.head(10)

fitting: 100%|██████████| 25/25 [00:03<00:00,  7.78it/s]
merging: 100%|██████████| 25/25 [00:04<00:00,  5.14it/s]


Unnamed: 0,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_number,hour_key_std,hour_key_mean,...,stas_driver_gk_hour_key,stas_offer_class_group_ride_type_desc,stas_weekday_key_offer_class_group,stas_hour_key,stas_ride_type_desc_hour_key,stas_weekday_key_driver_gk,stas_hour_key_weekday_key,stas_ride_type_desc_driver_gk,stas_driver_gk_weekday_key,stas_ride_type_desc_weekday_key
0,5021,55.763302,37.593368,55.75823,37.613689,17.445,24.367,561106.909091,7.031339,12.690909,...,0.633461,0.774405,0.755247,0.613477,0.596119,0.780077,0.595442,0.859638,0.780077,0.706146
1,5817,55.75547,37.648689,55.741544,37.622868,0.740231,0.740231,573982.617391,7.33935,11.513043,...,0.808429,0.695638,0.660008,0.738038,0.736077,0.856705,0.70152,0.925236,0.856705,0.706146
2,3870,55.619002,37.59614,55.615923,37.607872,0.740231,0.740231,526322.983051,7.093203,13.618644,...,0.856322,0.819079,0.755247,0.670344,0.768657,0.879572,0.651374,0.948046,0.879572,0.759931
3,5607,55.620905,37.60655,55.614517,37.591161,0.740231,0.740231,585624.136986,4.819579,15.739726,...,0.91341,0.774405,0.755247,0.811248,0.80376,0.860026,0.778389,0.885219,0.860026,0.706146
4,3786,55.893228,37.673552,55.888084,37.662087,0.740231,0.740231,586270.377778,6.518044,13.822222,...,0.918822,0.774405,0.755247,0.811248,0.80376,0.713689,0.778389,0.874404,0.713689,0.706146
5,1439,55.898767,37.668912,55.888084,37.662087,0.740231,0.740231,582403.729412,4.768797,12.9,...,0.850052,0.774405,0.755247,0.811248,0.80376,0.892529,0.778389,0.899304,0.892529,0.706146
6,1602,55.604987,37.522931,55.607802,37.546758,0.740231,0.740231,587677.071429,1.341128,8.357143,...,0.943528,0.774405,0.755247,0.763861,0.760114,0.93815,0.721508,0.983965,0.93815,0.706146
7,6099,55.611884,37.538272,55.607802,37.546758,0.740231,0.740231,574243.320312,3.318664,12.296875,...,0.842912,0.774405,0.755247,0.763861,0.760114,0.788046,0.721508,0.873563,0.788046,0.706146
8,6074,55.764961,37.564141,55.764557,37.600896,0.740231,0.740231,503858.942446,5.628472,13.582734,...,0.891763,0.774405,0.755247,0.811248,0.80376,0.975022,0.778389,0.980682,0.975022,0.706146
9,3433,55.699544,37.501539,55.703083,37.517197,0.740231,0.740231,551934.775362,3.531732,13.963768,...,0.927842,0.774405,0.755247,0.811215,0.802812,0.928161,0.786972,0.982715,0.928161,0.706146


In [24]:
# Show the (rows, columns) shape of both frames.
print(train.shape)
print(test.shape)

(892557, 44)
(237813, 45)


In [25]:
# xgboost training parameters, later passed to xgboost.train as a list of
# (name, value) pairs.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 5,
    'eta': 0.05,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'nthread': 15,
    'alpha': 5,
    'lambda': 5,
}
# Options that were tried and are currently disabled:
#   'lambda_bias': 0.1
#   'min_child_weight': 5
#   early_stopping_rounds = int(1000 / param['eta'] ** 0.5)

In [26]:
# Preview the encoded test frame that will be scored.
test.head()

Unnamed: 0,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_number,hour_key_std,hour_key_mean,...,stas_driver_gk_hour_key,stas_offer_class_group_ride_type_desc,stas_weekday_key_offer_class_group,stas_hour_key,stas_ride_type_desc_hour_key,stas_weekday_key_driver_gk,stas_hour_key_weekday_key,stas_ride_type_desc_driver_gk,stas_driver_gk_weekday_key,stas_ride_type_desc_weekday_key
0,5021,55.763302,37.593368,55.75823,37.613689,17.445,24.367,561106.909091,7.031339,12.690909,...,0.633461,0.774405,0.755247,0.613477,0.596119,0.780077,0.595442,0.859638,0.780077,0.706146
1,5817,55.75547,37.648689,55.741544,37.622868,0.740231,0.740231,573982.617391,7.33935,11.513043,...,0.808429,0.695638,0.660008,0.738038,0.736077,0.856705,0.70152,0.925236,0.856705,0.706146
2,3870,55.619002,37.59614,55.615923,37.607872,0.740231,0.740231,526322.983051,7.093203,13.618644,...,0.856322,0.819079,0.755247,0.670344,0.768657,0.879572,0.651374,0.948046,0.879572,0.759931
3,5607,55.620905,37.60655,55.614517,37.591161,0.740231,0.740231,585624.136986,4.819579,15.739726,...,0.91341,0.774405,0.755247,0.811248,0.80376,0.860026,0.778389,0.885219,0.860026,0.706146
4,3786,55.893228,37.673552,55.888084,37.662087,0.740231,0.740231,586270.377778,6.518044,13.822222,...,0.918822,0.774405,0.755247,0.811248,0.80376,0.713689,0.778389,0.874404,0.713689,0.706146


In [27]:
# Preview the training frame, which still holds the raw combined string columns.
train.head()

Unnamed: 0,weekday_key,hour_key,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,...,driver_gk_offer_class_group,driver_gk_ride_type_desc,offer_class_group_weekday_key,offer_class_group_hour_key,offer_class_group_driver_gk,offer_class_group_ride_type_desc,ride_type_desc_weekday_key,ride_type_desc_hour_key,ride_type_desc_driver_gk,ride_type_desc_offer_class_group
0,5,20,6080,55.818842,37.334562,55.814567,37.35501,,,0,...,6080_0,6080_0,0_5,0_20,0_6080,0_0,0_5,0_20,0_6080,0_0
1,5,14,6080,55.805342,37.515023,55.819329,37.466398,18.802,25.217,1,...,6080_1,6080_0,1_5,1_14,1_6080,1_0,0_5,0_14,0_6080,0_1
2,6,14,6080,55.813978,37.347688,55.814827,37.354074,6.747,9.8,0,...,6080_0,6080_0,0_6,0_14,0_6080,0_0,0_6,0_14,0_6080,0_0
3,2,6,6080,55.745922,37.421748,55.743469,37.43113,,,0,...,6080_0,6080_0,0_2,0_6,0_6080,0_0,0_2,0_6,0_6080,0_0
4,4,16,6080,55.803578,37.521602,55.812559,37.527407,12.383,19.25,0,...,6080_0,6080_0,0_4,0_16,0_6080,0_0,0_4,0_16,0_6080,0_0


In [30]:
# Cross-validated training. For every fold an xgboost model is trained on the
# encoded training part; its predictions on the held-out part are stored in
# train_predictions (out-of-fold) and its predictions on the test set are
# appended to test_predictions.
NUM_BOOST_ROUND = 500

bsts = []
train_predictions = np.zeros(len(y))
test_predictions = []

# The test matrix is the same for every fold, so it is built once here
# rather than once per fold.
Xdatatest = xgboost.DMatrix(test)

for itr, ite in cross_val.split(train, y):
    # Despite their names, X_test / y_test hold the validation part of the
    # current fold, not the competition test set (that one is `test`).
    X_train, y_train, X_test, y_test = data_prep(train.iloc[itr], y.iloc[itr],
                                                 train.iloc[ite], y.iloc[ite],
                                                 stas=stas)

    Xdatatrain = xgboost.DMatrix(data=X_train, label=y_train)
    Xdataval = xgboost.DMatrix(data=X_test, label=y_test)
    watchlist = [(Xdatatrain, 'train'), (Xdataval, 'eval')]
    bst = xgboost.train(list(param.items()), Xdatatrain, NUM_BOOST_ROUND,
                        evals=watchlist, verbose_eval=50)

    bsts.append(bst)
    train_predictions[ite] = bst.predict(Xdataval)
    test_predictions.append(bst.predict(Xdatatest))

fitting: 100%|██████████| 25/25 [00:01<00:00, 15.23it/s]
merging: 100%|██████████| 25/25 [00:06<00:00,  3.38it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 15.22it/s]
merging: 100%|██████████| 25/25 [00:04<00:00,  5.27it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 14.78it/s]
merging: 100%|██████████| 25/25 [00:05<00:00,  4.87it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 17.57it/s]
merging: 100%|██████████| 25/25 [00:04<00:00,  5.58it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 17.55it/s]
merging: 100%|██████████| 25/25 [00:05<00:00,  3.75it/s]


[0]	train-auc:0.852219	eval-auc:0.852986
[50]	train-auc:0.862698	eval-auc:0.863097
[100]	train-auc:0.867189	eval-auc:0.867132
[150]	train-auc:0.869481	eval-auc:0.869069
[200]	train-auc:0.871095	eval-auc:0.870367
[250]	train-auc:0.872252	eval-auc:0.871134
[300]	train-auc:0.873259	eval-auc:0.871828
[350]	train-auc:0.874209	eval-auc:0.872415
[400]	train-auc:0.875043	eval-auc:0.872852
[450]	train-auc:0.875879	eval-auc:0.873289
[499]	train-auc:0.876569	eval-auc:0.873583


In [31]:
# Average the per-fold test predictions element-wise into one score per test row.
preds = np.mean(test_predictions, axis=0)

In [32]:
# Sanity check: number of averaged predictions.
len(preds)

237813

In [33]:
# Sanity check: number of test rows, to compare with the number of predictions.
len(test)

237813

In [34]:
# Load the submission template and preview it.
sample_submission = pd.read_csv("McK_SubmissionFormat.csv")
sample_submission.head()

Unnamed: 0,offer_gk,driver_response
0,152446,
1,281031,
2,779964,
3,16720,
4,492087,


In [35]:
# Fill the response column by position. Bracket assignment always writes a
# DataFrame column, whereas attribute assignment would only set a plain
# Python attribute if the column were missing from the template.
# NOTE(review): this assumes the rows of `preds` are in the same order as
# the rows of the template -- confirm, since offer_gk was dropped from test.
sample_submission["driver_response"] = preds

In [37]:
# Write the submission without the DataFrame index; `index` is documented as
# a bool, so pass False explicitly instead of relying on None being falsy.
sample_submission.to_csv("second_submit.csv", index=False)

In [38]:
# Read first_submit.csv and preview its first rows.
ss = pd.read_csv("first_submit.csv")
ss.head()

Unnamed: 0,offer_gk,driver_response
0,152446,0.775272
1,281031,0.913362
2,779964,0.95946
3,16720,0.947089
4,492087,0.945322
