In [None]:
#~/Library/Jupyter/nbextensions/snippets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from tqdm import tqdm_notebook, tqdm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error as mae, mean_squared_error as mse
import xgboost
from xgboost import DMatrix
import catboost
from sklearn.model_selection import StratifiedKFold
from semenov import *
import gc
from sklearn.model_selection import KFold

In [None]:
# Load the training set; na_values=-1 tells pandas to read -1 entries as missing (NaN).
train = pd.read_csv("CAX_TrainingData_McK.csv", na_values=-1)
# Preview the first rows as the cell output.
train.head()

In [None]:
# Load the test set with the same missing-value convention as the training set (-1 -> NaN).
test = pd.read_csv("CAX_TestData_McK.csv", na_values=-1)
# Preview the first two rows as the cell output.
test.head(2)

In [None]:
def ride_type_desc_transform(x):
    """Map a ride type label to an integer code.

    Returns 0 for "private", 1 for "business" and 2 for any other value
    (including missing values).
    """
    if x == "private":
        return 0
    if x == "business":
        return 1
    # Anything that is neither of the two known labels falls into one bucket.
    return 2
    
def offer_class_group_transform(x):
    """Map an offer class label to a binary code: 0 for "Economy", 1 otherwise."""
    return 0 if x == "Economy" else 1
    
# Split the target off the training frame; the test frame carries a column of
# the same name, which is removed and discarded.
y = train.pop("driver_response")
test.pop("driver_response")

# Remove the offer/order identifier columns from both frames.
for frame in (train, test):
    frame.drop(["offer_gk", "order_gk"], axis=1, inplace=True)

# Replace the two string-valued columns by their integer codes.
for frame in (test, train):
    frame.ride_type_desc = frame.ride_type_desc.apply(ride_type_desc_transform)
    frame.offer_class_group = frame.offer_class_group.apply(offer_class_group_transform)

In [None]:
# Default value of the `C` argument handed to SemenovEncoding by
# semenov_prep / semenov_prep_test.
# NOTE(review): presumably a smoothing/regularisation strength - confirm in the `semenov` module.
STAS_CONST = 5

def semenov_prep(X_train, y_train, X_val, features, C=STAS_CONST,
                 n_splits=4, random_state=32578):
    """Apply SemenovEncoding to a train/validation pair.

    The validation frame is transformed by an encoder fitted on all of
    ``X_train``. The training frame is encoded out-of-fold: it is split into
    ``n_splits`` shuffled folds and each fold is transformed by an encoder
    fitted on the remaining folds only (presumably to keep a row's own target
    out of its encoded features).

    Parameters
    ----------
    X_train, y_train : training features and target; both are indexed
        positionally with ``.iloc``.
    X_val : frame to transform with the encoder fitted on the full training data.
    features : column names forwarded to ``SemenovEncoding.fit``.
    C : forwarded to the ``SemenovEncoding`` constructor.
    n_splits : number of folds for the out-of-fold encoding of ``X_train``.
    random_state : seed for the shuffled ``KFold`` split.

    Returns
    -------
    (X_train_new, X_val) : the out-of-fold encoded training frame (same index
        as ``X_train``, same columns as the transformed ``X_val``) and the
        transformed validation frame.
    """
    se = SemenovEncoding(C=C)
    se.fit(X_train, y_train, features=features)
    X_val = se.transform(X_val)

    # Split X_train into `n_splits` pieces and compute the encoded features
    # for each piece from an encoder that was fitted on the other pieces.
    X_train_new = pd.DataFrame(index=X_train.index, columns=X_val.columns, dtype=np.float64)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for itr, ite in kf.split(X_train):
        se = SemenovEncoding(C=C)
        se.fit(X_train.iloc[itr], y_train.iloc[itr], features=features)
        X_train_new.iloc[ite] = se.transform(X_train.iloc[ite])

    gc.collect()
    return X_train_new, X_val

def semenov_prep_test(X, y, X_test, features, C=STAS_CONST):
    """Fit SemenovEncoding on (X, y) and return the transformed test frame.

    After the transform, every column named in ``features`` that is present in
    the result is dropped in place, except "driver_gk", which is always kept.
    """
    encoder = SemenovEncoding(C=C)
    encoder.fit(X, y, features=features)
    encoded_test = encoder.transform(X_test)

    # Columns to discard: the encoded feature names found in the frame, minus "driver_gk".
    discard = (set(features) & set(encoded_test.columns)) - {"driver_gk"}
    encoded_test.drop(list(discard), axis=1, inplace=True)

    gc.collect()
    return encoded_test
    
def data_prep(X_train, y_train, X_val, y_val, stas=None):
    """Encode one train/validation split and drop the source feature columns.

    Parameters
    ----------
    X_train, y_train : training features and target for this split.
    X_val, y_val : validation features and target for this split.
    stas : column names passed to ``semenov_prep`` as ``features``; ``None``
        (the default) is treated as an empty list.

    Returns
    -------
    (X_train, y_train, X_val, y_val) : the encoded feature frames with every
        column named in ``stas`` removed except "driver_gk"; the targets are
        returned unchanged.
    """
    # A None sentinel replaces the former mutable `[]` default so that no list
    # object is shared between calls.
    if stas is None:
        stas = []

    X_train, X_val = semenov_prep(X_train, y_train, X_val, features=stas)

    # "driver_gk" is kept in the frames even though it is one of the encoded features.
    droppable = set(stas) - {"driver_gk"}
    for frame in (X_train, X_val):
        frame.drop(list(droppable & set(frame.columns)), axis=1, inplace=True)

    gc.collect()
    return X_train, y_train, X_val, y_val

In [None]:
# 5-fold stratified splitter (shuffled, fixed seed) used for the cross-validated training loop.
cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Columns passed as `features` to the SemenovEncoding helpers; they are also the
# base columns from which the pairwise interaction features are built.
stas = ["weekday_key", "hour_key", "driver_gk",
        "offer_class_group", "ride_type_desc"]

In [None]:
# Build pairwise interaction features: for every ordered pair of distinct
# columns in `stas`, add a string column "<first>_<second>" whose values are
# "<first value>_<second value>", in both `train` and `test`. The new column
# names are appended to the feature list used for encoding.
new_stas = stas[:]

for column_first in tqdm_notebook(stas):
    for column_second in tqdm_notebook(stas):
        if column_first == column_second:
            continue

        new_col = column_first + "_" + column_second

        # The previous guard tested `column_first + column_second` (without the
        # underscore), a name that is never created, so it could never skip
        # anything. Check the real column name instead, per frame.
        for frame in (train, test):
            if new_col not in frame.columns:
                frame[new_col] = frame[column_first].apply(lambda x: str(x) + "_")
                frame[new_col] = frame[new_col] + frame[column_second].apply(lambda x: str(x))

        # Register the feature even when the column already existed, so the
        # feature list stays complete; avoid duplicate entries.
        if new_col not in new_stas:
            new_stas.append(new_col)

stas = new_stas

In [None]:
# Fit the encoder on the full training set and apply it to the test set.
# `test` is rebound to the transformed frame, so re-running this cell would
# feed the already-transformed frame back into the encoder.
test = semenov_prep_test(train, y, test, features=stas)
# Preview the encoded test frame as the cell output.
test.head(10)

In [None]:
# Booster parameters handed to xgboost.train; key order is kept as declared.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 5,
    'eta': 0.05,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'nthread': 15,
    'alpha': 5,
    # 'lambda_bias': 0.1,
    'lambda': 5,
    # 'min_child_weight': 5,
}

# early_stopping_rounds = int(1000 / param['eta'] ** 0.5)

In [None]:
# Cross-validated training: one booster per fold.
bsts = []                                # trained boosters, one per fold
train_predictions = np.zeros(len(y))     # out-of-fold predictions for the training rows
test_predictions = []                    # per-fold predictions on the test set

# The test matrix does not depend on the fold, so it is built once up front
# instead of being rebuilt inside every iteration.
Xdatatest = xgboost.DMatrix(test)

for itr, ite in cross_val.split(train, y):
    # X_test / y_test are this fold's held-out rows of `train`, not the real test set.
    X_train, y_train, X_test, y_test = data_prep(train.iloc[itr], y.iloc[itr],
                                                 train.iloc[ite], y.iloc[ite],
                                                 stas=stas)

    Xdatatrain = xgboost.DMatrix(data=X_train, label=y_train)
    Xdataval = xgboost.DMatrix(data=X_test, label=y_test)
    watchlist = [(Xdatatrain, 'train'), (Xdataval, 'eval')]
    # 500 boosting rounds; the evaluation metric is reported every 50 rounds.
    bst = xgboost.train(list(param.items()), Xdatatrain, 500,
                        evals=watchlist, verbose_eval=50)

    bsts.append(bst)
    train_predictions[ite] = bst.predict(Xdataval)
    test_predictions.append(bst.predict(Xdatatest))

In [None]:
# Average the per-fold test predictions into one score per test row.
preds = np.mean(test_predictions, axis=0)
sample_submission = pd.read_csv("McK_SubmissionFormat.csv")
# Item assignment always writes a real column. Attribute assignment would only
# set a plain Python attribute (and the CSV would carry no predictions) if the
# template happened to lack a "driver_response" column.
sample_submission["driver_response"] = preds
# index=False is the documented way to omit the row index from the file.
sample_submission.to_csv("second_submit.csv", index=False)