In [43]:
from sklearn.model_selection import ShuffleSplit
import gc
import numpy as np
import pandas as pd
import xgboost as xgb

train_df = pd.read_csv(
    './data/train.csv',
    dtype={
        'msno': str,
        'is_churn': bool,
    })
members_df = pd.read_csv(
    './data/members.csv',
    dtype={
     'registered_via' : np.uint8,
     'gender' : 'category',
    },
)
train_df = pd.merge(train_df, members_df, how='left', on='msno')
del members_df
train_df.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,expiration_date
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,female,9.0,20050406.0,20170907.0
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,male,9.0,20050407.0,20170321.0
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,female,9.0,20051016.0,20170203.0
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,female,9.0,20051102.0,20170926.0
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,male,9.0,20051228.0,20170927.0


In [47]:
transactions_df = pd.read_csv(
    './data/transactions.csv',
    dtype={
        'payment_method': np.uint8,
        'payment_plan_days': np.uint8,
        'plan_list_price': np.uint8,
        'actual_amount_paid': np.uint8,
        'is_auto_renew': np.uint8,
        'is_cancel': np.bool,
    },
)
transactions_df = pd.merge(train_df, transactions_df, how='left', on='msno')
grouped = transactions_df.copy().groupby('msno')

stats_df = grouped.agg({
    'msno': {'total_order': 'count'},
    'plan_list_price': {'plan_net_worth': 'sum'},
    'actual_amount_paid': {
        'mean_payment_each_transaction': 'mean',
        'total_actual_payment': 'sum',
    },
    'is_cancel': {'cancel_times': lambda x: sum(x == 1)},
})
stats_df.columns = stats_df.columns.droplevel(0)
stats_df.reset_index(inplace=True)
train_df = pd.merge(train_df, stats_df, how='left', on='msno')

del transactions_df, stats_df

# In the description the bd column is said to be in a very wide range 
#So I decided to clip it just to store it as a smaller type
train_df['bd'].clip(0,100) 
train_df['bd'].fillna(0,inplace=True)
train_df['bd'].astype(np.uint8,inplace=True)

train_df.head()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,expiration_date,plan_net_worth_x,cancel_times_x,...,plan_net_worth_y,cancel_times_y,total_actual_payment_y,mean_payment_each_transaction_y,total_order_y,plan_net_worth,cancel_times,total_actual_payment,mean_payment_each_transaction,total_order
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,female,9.0,20050406.0,20170907.0,149.0,0,...,149.0,0,149.0,74.5,2,149.0,0,149.0,74.5,2
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,male,9.0,20050407.0,20170321.0,3309.0,2,...,3309.0,2,3458.0,150.347826,23,3309.0,2,3458.0,150.347826,23
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,female,9.0,20051016.0,20170203.0,1492.0,1,...,1492.0,1,1492.0,149.2,10,1492.0,1,1492.0,149.2,10
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,female,9.0,20051102.0,20170926.0,252.0,0,...,252.0,0,252.0,126.0,2,252.0,0,252.0,126.0,2
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,male,9.0,20051228.0,20170927.0,1272.0,0,...,1272.0,0,1272.0,159.0,8,1272.0,0,1272.0,159.0,8


In [53]:
del train_df['cancel_times_x'], train_df['cancel_times_y'], train_df['plan_net_worth_x'], train_df['plan_net_worth_y'], train_df['total_actual_payment_x'], train_df['total_actual_payment_y'], train_df['mean_payment_each_transaction_x'], train_df['mean_payment_each_transaction_y']

In [55]:
del train_df['total_order_x'], train_df['total_order_y']
train_df.columns

Index(['msno', 'is_churn', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date', 'plan_net_worth',
       'cancel_times', 'total_actual_payment', 'mean_payment_each_transaction',
       'total_order'],
      dtype='object')

In [60]:
# dTrain = xgb.DMatrix(train_df)
params = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2

Y = train_df.is_churn
del train_df['msno']
del train_df['is_churn']
# print ('running cross validation')
# # do cross validation, this will print result out as
# # [iteration]  metric_name:mean_value+std_value
# # std_value is standard deviation of the metric
# xgb.cv(param, dtrain, num_round, nfold=5,
#        metrics={'error'}, seed = 0,
#        callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

In [65]:
train_df.describe()
del train_df['gender']

In [69]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train_df = train_df.fillna(0)

In [71]:
lr.fit(train_df, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
lr.coef_

array([[  3.76399629e-07,   9.50288542e-07,  -1.21944718e-07,
         -7.37131402e-06,   7.28340117e-06,  -3.10552042e-04,
          7.63793333e-08,  -3.24074521e-04,  -1.49364506e-05,
         -2.94370845e-06]])