In [39]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [103]:
# Read the three competition CSV files (train, test, sample submission) in one pass.
data_df, test_df, sample_submition = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/testove-int20h-uklon-churn/train.csv',
        '/kaggle/input/testove-int20h-uklon-churn/test.csv',
        '/kaggle/input/testove-int20h-uklon-churn/Samle_Submission.csv',
    )
)

In [104]:
# Replace every missing value with the sentinel -1 in both frames.
data_df, test_df = (frame.fillna(value=-1) for frame in (data_df, test_df))

In [105]:
# Model features: every column of the training frame except the
# identifier, the label and the week column.
columns_for_model = data_df.columns.drop(['Id', 'target', 'Week'])
columns_for_model

In [107]:
# Pivot the training frame from N_WEEKS consecutive rows per Id to a single
# wide row per Id: the feature values of those rows are laid side by side.
N_WEEKS = 4  # rows per Id in the input (presumably one per week -- TODO confirm)
n_features = len(columns_for_model)  # replaces the previously hard-coded 49

train_ids = data_df['Id'].values

# The reshape below is purely positional, so it is only valid when the rows of
# each Id are contiguous and every Id has exactly N_WEEKS rows. Fail loudly
# instead of silently mixing drivers together.
assert len(train_ids) % N_WEEKS == 0, 'row count is not a multiple of N_WEEKS'
assert (train_ids.reshape(-1, N_WEEKS) == train_ids[::N_WEEKS, None]).all(), \
    'rows are not grouped into consecutive blocks of N_WEEKS per Id'

one_data_df = pd.DataFrame(
    data_df[columns_for_model].values.reshape(-1, n_features * N_WEEKS)
)
# Id and target are taken from the first row of every block.
one_data_df['Id'] = train_ids[::N_WEEKS]
one_data_df['target'] = data_df['target'].values[::N_WEEKS]

data_df = one_data_df

In [110]:
# Display the test frame as it stands at this point in the notebook.
test_df

In [113]:
# Pivot the test frame from N_WEEKS consecutive rows per Id to a single wide
# row per Id, using the same layout as the training frame.
N_WEEKS = 4  # rows per Id in the input (presumably one per week -- TODO confirm)
n_features = len(columns_for_model)  # replaces the previously hard-coded 49

test_ids = test_df['Id'].values

# The reshape below is purely positional, so it is only valid when the rows of
# each Id are contiguous and every Id has exactly N_WEEKS rows.
assert len(test_ids) % N_WEEKS == 0, 'row count is not a multiple of N_WEEKS'
assert (test_ids.reshape(-1, N_WEEKS) == test_ids[::N_WEEKS, None]).all(), \
    'rows are not grouped into consecutive blocks of N_WEEKS per Id'

one_test_df = pd.DataFrame(
    test_df[columns_for_model].values.reshape(-1, n_features * N_WEEKS)
)
# Only the Id is carried over; no 'target' column is copied for the test frame.
one_test_df['Id'] = test_ids[::N_WEEKS]

test_df = one_test_df

In [115]:
from sklearn.model_selection import KFold

N_FOLDS = 5
# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise a ValueError if a seed is given without shuffling.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Folds are assigned per unique Id so that all rows of an Id share one fold.
unique_drivers_df = pd.DataFrame(data_df.Id.unique(), columns=['id'])

# Fill a plain array and attach it once: the previous chained
# `df['fold'].iloc[idx] = fold` assignment triggers SettingWithCopyWarning and
# does not modify the frame at all under pandas copy-on-write.
fold_by_position = np.full(len(unique_drivers_df), -1, dtype=int)
for fold, (train_index, test_index) in enumerate(kf.split(unique_drivers_df)):
    fold_by_position[test_index] = fold
unique_drivers_df['fold'] = fold_by_position

# Look up each row's fold through its Id (map is a direct lookup, unlike
# value-replacement with a large dict).
data_df['fold'] = data_df['Id'].map(unique_drivers_df.set_index('id')['fold'])
data_df['fold'].plot()

In [116]:
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [117]:
from sklearn.metrics import auc, roc_auc_score

In [132]:
# Feature columns are selected by name rather than by position
# (previously `data_df.columns[:-3]`), so the selection stays correct if
# this cell is re-run after further columns have been added to data_df.
columns_for_model = data_df.columns.drop(['Id', 'target', 'fold'])

params = {
        'objective': 'cross_entropy',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
    }

param_stopping = {'early_stopping': 100,
                    'num_boost_round':1000
                       }

# Out-of-fold LightGBM training: for every fold, fit on the other folds,
# score the held-out fold, and predict the test frame.
metrics = []                 # per-fold validation AUC
cross_validation_list = []   # per-fold frames with Id / target / out-of-fold pred
for fold in range(N_FOLDS):

    train_df = data_df[data_df.fold != fold]
    # Explicit copy: a 'pred' column is added below, and writing into a
    # boolean-mask slice of data_df raises SettingWithCopyWarning.
    valid_df = data_df[data_df.fold == fold].copy()

    lgb_train_dataset = lgb.Dataset(train_df[columns_for_model], train_df.target)
    lgb_valid_dataset = lgb.Dataset(valid_df[columns_for_model], valid_df.target, reference=lgb_train_dataset)

    lgb_model = lgb.train(params,
                    lgb_train_dataset,
                    num_boost_round=param_stopping['num_boost_round'],
                    valid_sets=[lgb_valid_dataset],
                    callbacks=[lgb.early_stopping(stopping_rounds=param_stopping['early_stopping'])])

    valid_df['pred'] = lgb_model.predict(valid_df[columns_for_model])
    fold_metrics = roc_auc_score(valid_df['target'].values, valid_df['pred'].values)
    metrics.append(fold_metrics)
    cross_validation_list.append(valid_df[['Id', 'target', 'pred']])
    print('fold',fold,'auc', fold_metrics)

    # One test-prediction column per fold model.
    test_df['pred_lgb_'+str(fold)] = lgb_model.predict(test_df[columns_for_model])

cross_validation_df = pd.concat(cross_validation_list)
# Snapshot of the LightGBM out-of-fold predictions; `cross_validation_df`
# itself is overwritten by the XGBoost cell.
lgb_cross_validation_groupby_df = cross_validation_df.copy()

In [134]:
# Display the LightGBM out-of-fold predictions (columns Id, target, pred).
lgb_cross_validation_groupby_df

In [135]:

# ROC AUC of the LightGBM out-of-fold predictions over all folds together.
roc_auc_score(
    lgb_cross_validation_groupby_df['target'],
    lgb_cross_validation_groupby_df['pred'],
)

In [123]:
import xgboost as xgb

In [126]:
# Out-of-fold XGBoost training. Reuses `columns_for_model` as defined by the
# LightGBM cell.

param = { 'objective': 'binary:logistic', 'eval_metric':'auc', 'learning_rate':0.05}

param_stopping = {'early_stopping': 100,
                    'num_boost_round':1000
                       }

# The test matrix does not depend on the fold, so it is built once instead of
# once per iteration. -1 is the fill value used for missing data earlier in
# the notebook, hence missing=-1.
xgb_test_dataset = xgb.DMatrix(test_df[columns_for_model], missing=-1)

metrics = []                 # per-fold validation AUC
cross_validation_list = []   # per-fold frames with Id / target / out-of-fold pred
for fold in range(N_FOLDS):

    train_df = data_df[data_df.fold != fold]
    # Explicit copy: a 'pred' column is added below, and writing into a
    # boolean-mask slice of data_df raises SettingWithCopyWarning.
    valid_df = data_df[data_df.fold == fold].copy()

    xgb_train_dataset = xgb.DMatrix(train_df[columns_for_model], train_df.target, missing=-1)
    xgb_valid_dataset = xgb.DMatrix(valid_df[columns_for_model], valid_df.target, missing=-1)

    # XGBoost uses the last entry of the evaluation list ('eval') for early stopping.
    evallist = [(xgb_train_dataset, 'train'), (xgb_valid_dataset, 'eval')]

    xgb_model = xgb.train(param, xgb_train_dataset, param_stopping['num_boost_round'], evallist,
                          early_stopping_rounds=param_stopping['early_stopping'])

    # xgb.train returns the booster from the LAST round, not the best one, so
    # restrict prediction to the trees up to the best early-stopping round.
    best_rounds = (0, xgb_model.best_iteration + 1)

    valid_df['pred'] = xgb_model.predict(xgb_valid_dataset, iteration_range=best_rounds)
    fold_metrics = roc_auc_score(valid_df['target'].values, valid_df['pred'].values)
    metrics.append(fold_metrics)
    cross_validation_list.append(valid_df[['Id', 'target', 'pred']])
    print('fold',fold,'auc', fold_metrics)

    # One test-prediction column per fold model.
    test_df['pred_xgb_'+str(fold)] = xgb_model.predict(xgb_test_dataset, iteration_range=best_rounds)

cross_validation_df = pd.concat(cross_validation_list)

In [130]:
# Keep an independent snapshot of the XGBoost out-of-fold predictions.
xgb_cross_validation_df = cross_validation_df.copy(deep=True)

In [129]:

# ROC AUC of the XGBoost out-of-fold predictions.
# (score noted from an earlier run: 0.9552711085119306)
# Uses `xgb_cross_validation_df` and its 'pred' column: the previously
# referenced `xgb_cross_validation_groupby_df` / 'mean_pred' are never created
# by any cell, so the old line failed with NameError on a fresh kernel.
roc_auc_score(xgb_cross_validation_df.target, xgb_cross_validation_df.pred)

In [136]:
# Display the XGBoost out-of-fold predictions. The name used here before,
# `xgb_cross_validation_groupby_df`, is never defined in the notebook.
xgb_cross_validation_df

In [137]:
# Combine the out-of-fold predictions of both models in one frame.
# Starts from `xgb_cross_validation_df` (columns Id, target, pred); the
# previously referenced `xgb_cross_validation_groupby_df` with a 'mean_pred'
# column is never created by any cell. rename() returns a new frame, so the
# XGBoost snapshot itself is left untouched.
all_cross_validation_groupby_df = xgb_cross_validation_df.rename(columns={'pred': 'xgb_mean_pred'})
# Column assignment aligns on the row index, which both frames inherit from data_df.
all_cross_validation_groupby_df['lgb_mean_pred'] = lgb_cross_validation_groupby_df['pred']

In [138]:
#all_cross_validation_groupby_df.to_csv('lgbm_xgbm_folds20.csv', index=False)

In [139]:
# Display every column from the third one onward (positional slice).
all_cross_validation_groupby_df.iloc[:,2:]

In [140]:
# Simple blend: row-wise mean of the two models' out-of-fold predictions.
# The columns are named explicitly; the former positional slice `iloc[:, 2:]`
# also picked up 'mean_all' itself whenever this cell was run a second time.
all_cross_validation_groupby_df['mean_all'] = (
    all_cross_validation_groupby_df[['xgb_mean_pred', 'lgb_mean_pred']].mean(axis=1)
)

In [141]:
#all_cross_validation_groupby_df.mean_all[all_cross_validation_groupby_df.Id.isin(data_df[data_df.V1 == -1].Id.unique())] = 1

In [142]:
# Print the ROC AUC of every column from the third one onward.
# (score noted from an earlier run: 0.9580021646626796)
true_labels = all_cross_validation_groupby_df.target
for col in all_cross_validation_groupby_df.columns[2:]:
    print(col)
    column_auc = roc_auc_score(true_labels, all_cross_validation_groupby_df[col])
    print(column_auc)

In [144]:
# Display the test frame, which by now also holds the per-fold prediction columns.
test_df

In [147]:
# Names of the per-fold XGBoost prediction columns in the test frame.
# str() is needed because some column labels are not strings.
[column for column in test_df.columns if 'pred_xgb' in str(column)]

In [148]:
# Submission working frame: Id plus the per-fold XGBoost test predictions.
# .copy() makes it independent of test_df, so columns added to sub_df later
# do not trigger SettingWithCopyWarning.
sub_df = test_df[['Id']+[col for col in test_df if ('pred_xgb' in str(col))]].copy()

In [155]:
# Final prediction = row-wise mean of the per-fold XGBoost test predictions.
# assign() builds a new frame instead of writing a column into `sub_df`
# in place, which avoids SettingWithCopyWarning when sub_df is a slice.
sub_df = sub_df.assign(
    Predicted=test_df[[col for col in test_df if ('pred_xgb' in str(col))]].mean(axis=1)
)

In [158]:
# Write the Id / Predicted pairs to the submission CSV (no index column).
submission_columns = ['Id', 'Predicted']
sub_df[submission_columns].to_csv('final'+'_sub.csv', index=False)

In [None]:
# One prediction per Id: average the per-fold XGBoost predictions over the
# fold columns and over all rows of that Id.
# str(col) is required because the feature columns have integer labels; the
# previous `'pred_xgb' in col` raised TypeError on them. The column list is
# also computed once instead of once per group.
xgb_pred_columns = [col for col in sub_df.columns if 'pred_xgb' in str(col)]

unique_sub_df = (
    sub_df.groupby('Id')[xgb_pred_columns]
    .mean()            # per Id, mean of each fold column
    .mean(axis=1)      # then mean across the fold columns
    .rename('Predicted')
    .reset_index()
)

unique_sub_df

In [154]:
# Histogram of the per-Id median target in the training data ...
train_target_per_id = data_df.groupby('Id')['target'].median()
plt.title('train')
plt.hist(train_target_per_id)
plt.show()

# ... next to a histogram of the per-Id predictions for the test data.
test_prediction_per_id = unique_sub_df['Predicted']
plt.title('test')
plt.hist(test_prediction_per_id)
plt.show()


In [None]:
# Earlier variant, kept for reference (averaged a `mean_model` column per Id):
# sample_submition = pd.DataFrame(test_df.groupby('Id').apply(lambda row: row.mean_model.mean()), columns=['Predicted'])
# sample_submition = sample_submition.reset_index()

# Rebind `sample_submition` to the per-Id prediction frame; this replaces the
# sample-submission frame read from CSV at the top of the notebook.
sample_submition = unique_sub_df
sample_submition

In [None]:
# Prefix shared by the three files written below.
# NOTE(review): the name mentions "knn" and "folds10", while the visible cells
# train LightGBM/XGBoost with N_FOLDS = 5 -- confirm the intended name.
EXPERIMENT_NAME = 'lgb_knn_folds10'

# NOTE(review): `columns_for_model[1:]` leaves the first feature column in the
# exported test file -- presumably a leftover; verify.
feature_columns_to_drop = columns_for_model[1:]
test_predictions_df = test_df.drop(columns=feature_columns_to_drop)

# Submission, out-of-fold predictions and test predictions, all without index.
sample_submition.to_csv(EXPERIMENT_NAME+'_sub.csv', index=False)
cross_validation_df.to_csv(EXPERIMENT_NAME+'_cv.csv', index=False)
test_predictions_df.to_csv(EXPERIMENT_NAME+'_test.csv', index=False)