In [1]:
#~/Library/Jupyter/nbextensions/snippets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from tqdm import tqdm_notebook, tqdm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error as mae, mean_squared_error as mse
import xgboost
from xgboost import DMatrix
import catboost
from sklearn.model_selection import StratifiedKFold
from semenov import *
import gc
from sklearn.model_selection import KFold



In [19]:
train = pd.read_csv("CAX_TrainingData_McK.csv", na_values=-1)
train.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,,,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,,,Economy,private,1
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1


In [25]:
driver_offer_count = train.groupby("driver_gk")["offer_gk"].count().reset_index()
for column in ["hour_key", "driver_latitude", "driver_longitude",
               "origin_order_latitude", "origin_order_longitude",
               "distance_km"]:
    current_mean = train.groupby("driver_gk")[column].mean().reset_index()
    current_mean.se
    current_std = train.groupby("driver_gk")[column].std()
    train = train.merge(current_mean, right_index=True)
    train = train.merge(current_std, right_index=True)

ValueError: can not merge DataFrame with instance of type <class 'pandas.core.series.Series'>

In [3]:
def ride_type_desc_transform(x):
    if x=="private":
        return 0
    elif x=="business":
        return 1
    else:
        return 2
    
def offer_class_group_transform(x):
    if x=="Economy":
        return 0
    return 1
    
train.ride_type_desc = train.ride_type_desc.apply(ride_type_desc_transform)
train.offer_class_group = train.offer_class_group.apply(offer_class_group_transform)
train.ride_type_desc.unique()

array([0, 1, 2])

In [8]:
STAS_CONST = 5

def semenov_prep(X_train, y_train, X_val, features, C=STAS_CONST):
    se = SemenovEncoding(C=C)
    se.fit(X_train, y_train, features=features)
    X_val = se.transform(X_val)
    
    # разбиваю X_train на 3 куска и считаю правильно фичи Стаса.
    X_train_new = pd.DataFrame(index=X_train.index, columns=X_val.columns, dtype=np.float64)
    
    kf = KFold(n_splits=4, shuffle=True, random_state=32578)
    for itr, ite in kf.split(X_train):
        se = SemenovEncoding(C=C)
        se.fit(X_train.iloc[itr], y_train.iloc[itr], features=features)
        X_train_new.iloc[ite] = se.transform(X_train.iloc[ite])
        
    gc.collect()
    return X_train_new, X_val

def semenov_prep_test(X, y, X_test, features, C=STAS_CONST):
    se = SemenovEncoding(C=C)
    se.fit(X, y, features=features)
    X_test = se.transform(X_test)
    X_test.drop(list(set(features) &  set(X_test.columns) - set(["driver_gk"])), axis=1, inplace=True)
    gc.collect()
    return X_test
    
def data_prep(X_train, y_train, X_val, y_val, stas=[]):
    X_train, X_val = semenov_prep(X_train, y_train, X_val, features=stas)
    X_train.drop(list(set(stas) &  set(X_train.columns) - set(["driver_gk"])), axis=1, inplace=True)
    X_val.drop(list(set(stas) &  set(X_val.columns) - set(["driver_gk"])), axis=1, inplace=True)
    gc.collect()
    return X_train, y_train, X_val, y_val

In [9]:
cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
y = train.pop("driver_response")
train.drop(["offer_gk", "order_gk"], axis=1, inplace=True)

In [10]:
stas = ["weekday_key", "hour_key", "driver_gk",
        "offer_class_group", "ride_type_desc"]

In [11]:
test = pd.read_csv("CAX_TestData_McK.csv", na_values=-1)
test.drop(["offer_gk", "order_gk"], axis=1, inplace=True)
test.ride_type_desc = test.ride_type_desc.apply(ride_type_desc_transform)
test.offer_class_group = test.offer_class_group.apply(offer_class_group_transform)
test.ride_type_desc.unique()
test.head()

Unnamed: 0,weekday_key,hour_key,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,5,0,5021,55.763302,37.593368,55.75823,37.613689,17.445,24.367,1,0,
1,5,7,5817,55.75547,37.648689,55.741544,37.622868,,,0,0,
2,5,1,3870,55.619002,37.59614,55.615923,37.607872,,,1,1,
3,5,13,5607,55.620905,37.60655,55.614517,37.591161,,,1,0,
4,5,13,3786,55.893228,37.673552,55.888084,37.662087,,,1,0,


In [12]:
new_stas = stas[:]

for column_first in tqdm_notebook(stas):
    for column_second in tqdm_notebook(stas):
        if column_first != column_second:
            if column_first + column_second not in train.columns:
                new_col = column_first + "_" + column_second
                train[new_col] = train[column_first].apply(lambda x: str(x) + "_")
                train[new_col] = train[new_col] + train[column_second].apply(lambda x: str(x))
                
                test[new_col] = test[column_first].apply(lambda x: str(x) + "_")
                test[new_col] = test[new_col] + test[column_second].apply(lambda x: str(x))
                new_stas.append(new_col)
                
stas = new_stas




In [13]:
train.shape

(892557, 31)

In [14]:
test.shape

(237813, 32)

In [15]:
test = semenov_prep_test(train, y, test, features=stas)
test.head(10)

fitting: 100%|██████████| 25/25 [00:02<00:00, 11.99it/s]
merging: 100%|██████████| 25/25 [00:03<00:00,  5.69it/s]


Unnamed: 0,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,driver_response,stas_offer_class_group_hour_key,stas_hour_key_offer_class_group,...,stas_driver_gk_hour_key,stas_offer_class_group_ride_type_desc,stas_weekday_key_offer_class_group,stas_hour_key,stas_ride_type_desc_hour_key,stas_weekday_key_driver_gk,stas_hour_key_weekday_key,stas_ride_type_desc_driver_gk,stas_driver_gk_weekday_key,stas_ride_type_desc_weekday_key
0,5021,55.763302,37.593368,55.75823,37.613689,17.445,24.367,0.740231,0.659312,0.659312,...,0.633461,0.774405,0.755247,0.613477,0.596119,0.780077,0.595442,0.859638,0.780077,0.706146
1,5817,55.75547,37.648689,55.741544,37.622868,0.740231,0.740231,0.740231,0.698971,0.698971,...,0.808429,0.695638,0.660008,0.738038,0.736077,0.856705,0.70152,0.925236,0.856705,0.706146
2,3870,55.619002,37.59614,55.615923,37.607872,0.740231,0.740231,0.740231,0.730623,0.730623,...,0.856322,0.819079,0.755247,0.670344,0.768657,0.879572,0.651374,0.948046,0.879572,0.759931
3,5607,55.620905,37.60655,55.614517,37.591161,0.740231,0.740231,0.740231,0.850277,0.850277,...,0.91341,0.774405,0.755247,0.811248,0.80376,0.860026,0.778389,0.885219,0.860026,0.706146
4,3786,55.893228,37.673552,55.888084,37.662087,0.740231,0.740231,0.740231,0.850277,0.850277,...,0.918822,0.774405,0.755247,0.811248,0.80376,0.713689,0.778389,0.874404,0.713689,0.706146
5,1439,55.898767,37.668912,55.888084,37.662087,0.740231,0.740231,0.740231,0.850277,0.850277,...,0.850052,0.774405,0.755247,0.811248,0.80376,0.892529,0.778389,0.899304,0.892529,0.706146
6,1602,55.604987,37.522931,55.607802,37.546758,0.740231,0.740231,0.740231,0.808296,0.808296,...,0.943528,0.774405,0.755247,0.763861,0.760114,0.93815,0.721508,0.983965,0.93815,0.706146
7,6099,55.611884,37.538272,55.607802,37.546758,0.740231,0.740231,0.740231,0.808296,0.808296,...,0.842912,0.774405,0.755247,0.763861,0.760114,0.788046,0.721508,0.873563,0.788046,0.706146
8,6074,55.764961,37.564141,55.764557,37.600896,0.740231,0.740231,0.740231,0.850277,0.850277,...,0.891763,0.774405,0.755247,0.811248,0.80376,0.975022,0.778389,0.980682,0.975022,0.706146
9,3433,55.699544,37.501539,55.703083,37.517197,0.740231,0.740231,0.740231,0.85089,0.85089,...,0.927842,0.774405,0.755247,0.811215,0.802812,0.928161,0.786972,0.982715,0.928161,0.706146


In [16]:
param = {}
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'auc'
param['max_depth'] = 5
param['eta'] = 0.05
param['subsample'] = 0.6
param['colsample_bytree'] = 0.6
param['nthread'] = 15
param['alpha'] = 5
# param['lambda_bias'] = 0.1
param['lambda'] = 5
# param['min_child_weight'] = 5


# early_stopping_rounds = int(1000 / param['eta'] ** 0.5)

In [17]:
test.head()

Unnamed: 0,driver_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,driver_response,stas_offer_class_group_hour_key,stas_hour_key_offer_class_group,...,stas_driver_gk_hour_key,stas_offer_class_group_ride_type_desc,stas_weekday_key_offer_class_group,stas_hour_key,stas_ride_type_desc_hour_key,stas_weekday_key_driver_gk,stas_hour_key_weekday_key,stas_ride_type_desc_driver_gk,stas_driver_gk_weekday_key,stas_ride_type_desc_weekday_key
0,5021,55.763302,37.593368,55.75823,37.613689,17.445,24.367,0.740231,0.659312,0.659312,...,0.633461,0.774405,0.755247,0.613477,0.596119,0.780077,0.595442,0.859638,0.780077,0.706146
1,5817,55.75547,37.648689,55.741544,37.622868,0.740231,0.740231,0.740231,0.698971,0.698971,...,0.808429,0.695638,0.660008,0.738038,0.736077,0.856705,0.70152,0.925236,0.856705,0.706146
2,3870,55.619002,37.59614,55.615923,37.607872,0.740231,0.740231,0.740231,0.730623,0.730623,...,0.856322,0.819079,0.755247,0.670344,0.768657,0.879572,0.651374,0.948046,0.879572,0.759931
3,5607,55.620905,37.60655,55.614517,37.591161,0.740231,0.740231,0.740231,0.850277,0.850277,...,0.91341,0.774405,0.755247,0.811248,0.80376,0.860026,0.778389,0.885219,0.860026,0.706146
4,3786,55.893228,37.673552,55.888084,37.662087,0.740231,0.740231,0.740231,0.850277,0.850277,...,0.918822,0.774405,0.755247,0.811248,0.80376,0.713689,0.778389,0.874404,0.713689,0.706146


In [18]:
bsts = []
train_predictions = np.zeros(len(y))
test_predictions = []

for itr, ite in cross_val.split(train, y):    
    X_train, y_train, X_test, y_test = data_prep(train.iloc[itr], y.iloc[itr],
                                                 train.iloc[ite], y.iloc[ite],
                                                 stas=stas)
    
    Xdatatrain = xgboost.DMatrix(data=X_train, label=y_train)
    Xdataval = xgboost.DMatrix(data=X_test, label=y_test)
    watchlist = [(Xdatatrain, 'train'), (Xdataval, 'eval')]
    bst = xgboost.train(list(param.items()), Xdatatrain, 500, 
                        evals=watchlist, verbose_eval=50)
    
    Xdatatest = xgboost.DMatrix(data=test.drop("driver_response", axis=1))
    
    bsts.append(bst)
    train_predictions[ite] = bst.predict(Xdataval)
    test_predictions.append(bst.predict(Xdatatest))

fitting: 100%|██████████| 25/25 [00:01<00:00, 14.47it/s]
merging: 100%|██████████| 25/25 [00:03<00:00,  5.91it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 17.37it/s]
merging: 100%|██████████| 25/25 [00:03<00:00,  5.92it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 15.64it/s]
merging: 100%|██████████| 25/25 [00:04<00:00,  5.76it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 16.85it/s]
merging: 100%|██████████| 25/25 [00:04<00:00,  5.87it/s]
fitting: 100%|██████████| 25/25 [00:01<00:00, 17.42it/s]
merging: 100%|██████████| 25/25 [00:04<00:00,  4.89it/s]


[0]	train-auc:0.852073	eval-auc:0.85347
[50]	train-auc:0.862788	eval-auc:0.863239
[100]	train-auc:0.866302	eval-auc:0.866433
[150]	train-auc:0.867992	eval-auc:0.867851
[200]	train-auc:0.869234	eval-auc:0.868797
[250]	train-auc:0.870276	eval-auc:0.869521
[300]	train-auc:0.871168	eval-auc:0.870088
[350]	train-auc:0.872054	eval-auc:0.870635
[400]	train-auc:0.872738	eval-auc:0.870982


KeyboardInterrupt: 

In [37]:
preds = np.mean(test_predictions, axis=0)

In [38]:
len(preds)

237813

In [39]:
len(test)

237813

In [40]:
sample_submission = pd.read_csv("McK_SubmissionFormat.csv")
sample_submission.head()

Unnamed: 0,offer_gk,driver_response
0,152446,
1,281031,
2,779964,
3,16720,
4,492087,


In [41]:
sample_submission.driver_response = preds

In [47]:
sample_submission.to_csv("first_submit.csv", index=None)

In [48]:
ss = pd.read_csv("first_submit.csv")
ss.head()

Unnamed: 0,offer_gk,driver_response
0,152446,0.860416
1,281031,0.849151
2,779964,0.929536
3,16720,0.893501
4,492087,0.909175
