In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import hmean
from tqdm import tqdm_notebook

from util_data import DataSet



In [2]:
def target_encode(trn_series,
                  tst_series,
                  target,
                  min_samples_leaf=1,
                  smoothing=1):
    """Replace a categorical series by a smoothed mean of the target.

    For every category seen in ``trn_series`` the per-category target mean is
    blended with the global target mean (the prior). The blend weight is a
    sigmoid of ``(count - min_samples_leaf) / smoothing``, so categories with
    many rows lean on their own mean and rare categories lean on the prior.

    Parameters
    ----------
    trn_series : pd.Series
        Training categorical values; must have the same length as ``target``.
    tst_series : pd.Series
        Test categorical values; must carry the same ``name`` as ``trn_series``.
    target : pd.Series
        Target values for the training rows.
    min_samples_leaf : int
        Category count at which the blend weight equals 0.5.
    smoothing : int
        Divisor controlling how sharply the weight moves around that count.

    Returns
    -------
    (pd.Series, pd.Series)
        Encoded train and test series, both named ``<name>_mean`` and carrying
        the index of their respective input. Categories without a match in
        the training statistics are filled with the prior.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    key = trn_series.name
    joined = pd.concat([trn_series, target], axis=1)

    # Per-category mean and row count of the target
    stats = joined.groupby(by=key)[target.name].agg(["mean", "count"])

    # Sigmoid weight: the larger the count, the less the prior matters
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight

    # Lookup table: one row per category with its blended value in 'average'
    lookup = (
        stats
        .drop(["mean", "count"], axis=1)
        .reset_index()
        .rename(columns={'index': target.name, target.name: 'average'})
    )

    def _encode(series):
        # Left-merge keeps one output row per input row, in input order
        merged = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left')
        encoded = merged['average'].rename(key + '_mean').fillna(prior)
        # The merge resets the index; restore the caller's index
        encoded.index = series.index
        return encoded

    return _encode(trn_series), _encode(tst_series)

In [3]:
# Columns dropped from both train and test before modelling.
correlated_features = [
    "membership_expire_date_last",
    "transaction_date_last",
]
# Further columns to drop alongside the ones above; currently none.
lacunar_features = list()


In [4]:
data = DataSet()

# Stack the training and validation sets into one training frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same result (original row indices are kept) on every version.
# NOTE(review): assumes both getters return DataFrames - confirm in util_data.
train = pd.concat([data.get_training_set(), data.get_validation_set()])
test = data.get_testing_set()

# Columns that are target-encoded (and then dropped) before fitting the model.
categorical_features = ["city","bd","gender", 'registered_via', 'is_auto_renew_median', 'is_auto_renew_last','plan_list_price_mean', 'plan_list_price_last']

In [5]:
# Pairs of column names (f1, f2) to fuse into one label-encoded feature.
# Empty for now, so the loop below does nothing.
combs = [
]

# Names of the generated combination columns. Initialised here because no
# earlier cell defines it: without this, the first non-empty `combs` entry
# would raise NameError at the append below.
train_features = []

for f1, f2 in combs:
    name1 = f1 + "_plus_" + f2
    # Concatenate the string forms of both columns, e.g. "3_male"
    train[name1] = train[f1].apply(str) + "_" + train[f2].apply(str)
    test[name1] = test[f1].apply(str) + "_" + test[f2].apply(str)
    # Label-encode over train + test so both share one integer mapping
    lbl = LabelEncoder()
    lbl.fit(list(train[name1].values) + list(test[name1].values))
    train[name1] = lbl.transform(list(train[name1].values))
    test[name1] = lbl.transform(list(test[name1].values))
    train_features.append(name1)

## Test

In [6]:
# NOTE(review): neither constant is referenced anywhere in the visible
# notebook; the classifier below hard-codes its own learning rate - confirm
# which value is intended.
MAX_ROUNDS = 400
LEARNING_RATE = 0.07

# Hyper-parameters handed to XGBoost; everything else stays at library defaults.
xgb_params = dict(
    learning_rate=0.02,  # author's note on this value: "use 0.002"
    max_depth=7,
    objective='binary:logistic',
)
rfc = XGBClassifier(**xgb_params)


In [7]:
def _prepare_frame(frame):
    """Apply the column clean-up shared by the train and test frames."""
    frame = frame.drop(correlated_features, axis=1).drop(lacunar_features, axis=1)
    # "nan" strings become "0"; the first space-separated token is then
    # parsed as an integer.
    frame["TimeSinceReg"] = (
        frame["TimeSinceReg"]
        .replace("nan", "0")
        .apply(lambda chaine: int(chaine.split(" ")[0]))
    )
    return frame.drop(["date_avg_6mo", "date_avg_1mo"], axis=1)


train = _prepare_frame(train)
test = _prepare_frame(test)

# Give test the same columns, in the same order, as train (minus the label)
cols = train.columns.tolist()
cols.remove("is_churn")
test = test[cols]

# Remaining missing values are replaced by 0
train = train.fillna(0)
test = test.fillna(0)

# Feature matrices exclude the user id (and the label for train)
X_train = train.drop(["msno", "is_churn"], axis=1)
X_test = test.drop(["msno"], axis=1)
Y_train = train["is_churn"]

Unnamed: 0,TimeSinceReg,actual_amount_paid_last,actual_amount_paid_mean,bd,city,count_1mo,count_3mo,count_6mo,gender,is_auto_renew_last,...,payment_method_id_last,payment_method_id_mean,payment_plan_days_last,payment_plan_days_mean,plan_list_price_last,plan_list_price_mean,registered_via,total_secs_avg_1mo,total_secs_avg_3mo,total_secs_avg_6mo
238,220,149.0,131.857143,0.0,1.0,25.0,59.0,75.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,149.0,131.857143,9.0,4336.155080,3381.520627,3263.899467
239,206,129.0,129.000000,0.0,1.0,7.0,18.0,44.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,129.0,129.000000,9.0,9998.188571,9158.663778,5566.194523
259,88,149.0,74.500000,0.0,1.0,3.0,4.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,149.0,149.000000,13.0,2700.657333,3327.177750,0.000000
266,0,129.0,129.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,129.0,129.000000,0.0,0.000000,0.000000,0.000000
268,0,100.0,100.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,100.0,100.000000,0.0,0.000000,0.000000,0.000000
269,0,100.0,100.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,100.0,100.000000,0.0,0.000000,0.000000,0.000000
271,0,129.0,129.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,129.0,129.000000,0.0,0.000000,0.000000,0.000000
274,0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,149.0,149.000000,0.0,0.000000,0.000000,0.000000
275,0,100.0,100.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,100.0,100.000000,0.0,0.000000,0.000000,0.000000
280,0,129.0,129.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,30.0,30.000000,129.0,129.000000,0.0,0.000000,0.000000,0.000000


In [None]:
# Work on copies so the encoded columns are not written back into
# X_train / X_test; the cell can then be re-run from the same inputs.
local_train = X_train.copy()
local_test = X_test.copy()

# Add a smoothed target-mean column "<feature>_avg" for every categorical
# feature, computed from the training rows and applied to both frames.
for f in tqdm_notebook(categorical_features):
    local_train[f + "_avg"], local_test[f + "_avg"] = target_encode(
        trn_series=local_train[f],
        tst_series=local_test[f],
        target=Y_train,
        min_samples_leaf=200,
        smoothing=10,
    )

# The raw categorical columns are replaced by their encoded versions
local_train = local_train.drop(categorical_features, axis=1)
local_test = local_test.drop(categorical_features, axis=1)

fitrfc = rfc.fit(local_train, Y_train, verbose=True)
# predict_proba takes no `axis` argument (passing one raises TypeError);
# column 1 of its output is the probability of the positive class.
Y_preds = fitrfc.predict_proba(local_test)[:, 1]

# Release the temporary copies
del local_train, local_test





In [None]:
# Keep probabilities strictly inside (0, 1) before writing them out
PROBA_EPS = 1e-15
test['is_churn'] = np.clip(Y_preds, PROBA_EPS, 1 - PROBA_EPS)

In [None]:
# Write the user id and predicted churn probability, without the index column
submission_columns = ['msno', 'is_churn']
test[submission_columns].to_csv('submission.csv', index=False)