In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
data_df = pd.read_csv('/kaggle/input/testove-int20h-uklon-churn/train.csv')
test_df = pd.read_csv('/kaggle/input/testove-int20h-uklon-churn/test.csv')
sample_submition = pd.read_csv('/kaggle/input/testove-int20h-uklon-churn/Samle_Submission.csv')

In [3]:
data_df.info()

In [4]:
for col in ['Week']:
    data_df[col] = data_df[col].astype('category')

In [5]:
from sklearn.impute import KNNImputer

In [None]:
imputer = KNNImputer(missing_values=np.nan)
features_trans = imputer.fit_transform(data_df.drop(columns=['Id', 'target']))

features_trans = pd.DataFrame(features_trans, columns = data_df.columns[1:-1])
features_trans['Id'] = data_df['Id']
features_trans['target'] = data_df['target']
data_df = features_trans



features_trans_test = imputer.transform(test_df.drop(columns=['Id']))

features_trans_test = pd.DataFrame(features_trans_test, columns = test_df.columns[1:])
features_trans_test['Id'] = test_df['Id']
test_df = features_trans_test

In [None]:
# for col in data_df.columns:
#     plt.title(col)
#     plt.hist(data_df[col].dropna().values.reshape(-1,1))
#     plt.show()

In [None]:
#data_df = data_df[~data_df.V1.isna()].reset_index(drop=True)
#data_df.groupby(['Id']).apply(lambda row: len(row['Week'].unique())).unique()

#100*data_df.isna().sum()/data_df.shape[0]


# for col in data_df.columns:
#     plt.title(col)
#     plt.hist(data_df[col].dropna().values.reshape(-1,1))
#     plt.show()


#print('Unique drivers id', len(data_df.Id.unique()))

In [None]:
from sklearn.model_selection import KFold

N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, random_state=42)

unique_drivers_df = pd.DataFrame(data_df.Id.unique(), columns=['id'])
unique_drivers_df['fold'] = -1
for fold, (train_index, test_index) in enumerate(kf.split(unique_drivers_df)):
    unique_drivers_df['fold'].iloc[test_index] = fold
    
    
data_df['fold'] = data_df['Id'].replace(unique_drivers_df.set_index('id').to_dict()['fold'])
data_df['fold'].plot()

In [None]:
!pip install pytorch_tabnet

In [None]:
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.metrics import auc, roc_auc_score

from pytorch_tabnet.pretraining import TabNetPretrainer
import torch

In [None]:
columns_for_model = data_df.drop(columns=['Id', 'target', 'fold']).columns

metrics = []
cross_validation_list = []
for fold in range(N_FOLDS):
    
    train_df = data_df[data_df.fold != fold]
    valid_df = data_df[data_df.fold == fold]
    
    
    unsupervised_model_no_preproc = TabNetPretrainer(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-1),
        mask_type='entmax', # "sparsemax",
        #n_shared_decoder=1, # nb shared glu for decoding
        #n_indep_decoder=1, # nb independent glu for decoding
    )

    unsupervised_model_no_preproc.fit(
        train_df[columns_for_model].values,
        eval_set=[valid_df[columns_for_model].values],
        max_epochs=50 , patience=30,
        batch_size=256, virtual_batch_size=128,
        num_workers=0,
        drop_last=False,
        pretraining_ratio=0.8,

    )

    clf1_nopreproc = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                           optimizer_params=dict(lr=2e-2),
                           scheduler_params={"step_size":50, # how to use learning rate scheduler
                                             "gamma":0.9},
                           scheduler_fn=torch.optim.lr_scheduler.StepLR,
                           mask_type='entmax' # "sparsemax"
                          )

    clf1_nopreproc.fit(
        train_df[columns_for_model].values, train_df.target.values,
        eval_set=[(train_df[columns_for_model].values, train_df.target.values), (valid_df[columns_for_model].values, valid_df.target.values)],
        eval_name=['train', 'valid'],
        eval_metric=['auc'],
        max_epochs=1000 , patience=50,
        batch_size=256, virtual_batch_size=128,
        num_workers=0,
        weights=1,
        drop_last=False,
        from_unsupervised=unsupervised_model_no_preproc
    )



    valid_df['pred'] = clf1_nopreproc.predict_proba(valid_df[columns_for_model].values)[:,1]
    fold_metrics = roc_auc_score(valid_df['target'].values, valid_df['pred'].values)
    metrics.append(fold_metrics)
    cross_validation_list.append(valid_df[['Id', 'Week', 'target', 'pred']])
    print('fold',fold,'auc', fold_metrics)

    test_df['pred_nn_'+str(fold)] = clf1_nopreproc.predict_proba(test_df[columns_for_model].values)[:,1]

cross_validation_df = pd.concat(cross_validation_list)
nn_cross_validation_groupby_df = cross_validation_df.groupby(['Id', 'target']).apply(lambda row: row.pred.mean()).reset_index().rename(columns={0:'mean_pred'})

In [23]:
#0.9298453282828283
roc_auc_score(nn_cross_validation_groupby_df.target, nn_cross_validation_groupby_df.mean_pred)

In [18]:
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [19]:
columns_for_model = data_df.drop(columns=['Id', 'target', 'fold']).columns

params = {
        #'boosting_type': 'gbdt',
        'objective': 'cross_entropy',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        #'verbose': 0
    }

param_stopping = {'early_stopping': 100,
                    'num_boost_round':1000
                       }

metrics = []
cross_validation_list = []
for fold in range(N_FOLDS):
    
    train_df = data_df[data_df.fold != fold]
    valid_df = data_df[data_df.fold == fold]

    lgb_train_dataset = lgb.Dataset(train_df[columns_for_model], train_df.target)
    lgb_valid_dataset = lgb.Dataset(valid_df[columns_for_model], valid_df.target, reference=lgb_train_dataset)


    lgb_model = lgb.train(params,
                    lgb_train_dataset,
                    num_boost_round=param_stopping['num_boost_round'],
                    valid_sets=lgb_valid_dataset,
                    callbacks=[lgb.early_stopping(stopping_rounds=param_stopping['early_stopping'])])

    valid_df['pred'] = lgb_model.predict(valid_df[columns_for_model])
    fold_metrics = roc_auc_score(valid_df['target'].values, valid_df['pred'].values)
    metrics.append(fold_metrics)
    cross_validation_list.append(valid_df[['Id', 'Week', 'target', 'pred']])
    print('fold',fold,'auc', fold_metrics)
    
    test_df['pred_lgb_'+str(fold)] = lgb_model.predict(test_df[columns_for_model])
    
cross_validation_df = pd.concat(cross_validation_list)
lgb_cross_validation_groupby_df = cross_validation_df.groupby(['Id', 'target']).apply(lambda row: row.pred.mean()).reset_index().rename(columns={0:'mean_pred'})

In [20]:

roc_auc_score(lgb_cross_validation_groupby_df.target, lgb_cross_validation_groupby_df.mean_pred)

In [21]:
import xgboost as xgb

In [13]:
columns_for_model = data_df.drop(columns=['Id', 'target', 'fold']).columns

param = { 'objective': 'binary:logistic', 'eval_metric':'auc', 'learning_rate':0.05}

param_stopping = {'early_stopping': 100,
                    'num_boost_round':1000
                       }

metrics = []
cross_validation_list = []
for fold in range(N_FOLDS):
    
    train_df = data_df[data_df.fold != fold]
    valid_df = data_df[data_df.fold == fold]

    xgb_train_dataset = xgb.DMatrix(train_df[columns_for_model], train_df.target, missing=-1)
    xgb_valid_dataset = xgb.DMatrix(valid_df[columns_for_model], valid_df.target, missing=-1)
    xgb_test_dataset = xgb.DMatrix(test_df[columns_for_model], missing=-1)
    
    
    evallist = [(xgb_train_dataset, 'train'), (xgb_valid_dataset, 'eval')]
    
    
    xgb_model = xgb.train(param, xgb_train_dataset, param_stopping['num_boost_round'], evallist, early_stopping_rounds=param_stopping['early_stopping'])
    

    valid_df['pred'] = xgb_model.predict(xgb_valid_dataset)
    fold_metrics = roc_auc_score(valid_df['target'].values, valid_df['pred'].values)
    metrics.append(fold_metrics)
    cross_validation_list.append(valid_df[['Id', 'Week', 'target', 'pred']])
    print('fold',fold,'auc', fold_metrics)
    
    test_df['pred_xgb_'+str(fold)] = xgb_model.predict(xgb_test_dataset)
    
cross_validation_df = pd.concat(cross_validation_list)
xgb_cross_validation_groupby_df = cross_validation_df.groupby(['Id', 'target']).apply(lambda row: row.pred.mean()).reset_index().rename(columns={0:'mean_pred'})

In [14]:

#0.9552711085119306
roc_auc_score(xgb_cross_validation_groupby_df.target, xgb_cross_validation_groupby_df.mean_pred)

In [15]:
all_cross_validation_groupby_df = xgb_cross_validation_groupby_df.copy()
all_cross_validation_groupby_df = all_cross_validation_groupby_df.rename(columns={'mean_pred':'xgb_mean_pred'})
all_cross_validation_groupby_df['lgb_mean_pred'] = lgb_cross_validation_groupby_df['mean_pred']

In [26]:
all_cross_validation_groupby_df.to_csv('lgbm_xgbm_folds20.csv', index=False)

In [17]:
all_cross_validation_groupby_df['mean_all'] = all_cross_validation_groupby_df.iloc[:,2:].mean(1)

In [18]:
#all_cross_validation_groupby_df.mean_all[all_cross_validation_groupby_df.Id.isin(data_df[data_df.V1 == -1].Id.unique())] = 1

In [19]:
#0.9566322363894253
for col in all_cross_validation_groupby_df.iloc[:,2:]:
    print(col)
    print(roc_auc_score(all_cross_validation_groupby_df.target, all_cross_validation_groupby_df[col]))

In [20]:
sub_df = test_df[['Id', 'Week']+[col for col in test_df if ('pred' in col)]]

In [21]:
unique_sub_df = sub_df.groupby(['Id']).apply(lambda row: 
                                 row[[col for col in test_df if ('pred' in col)]].mean(1).mean()).reset_index().rename(columns={0:'Predicted'}
                                                                                                                      )

unique_sub_df

In [22]:
plt.title('train')
plt.hist(data_df.groupby('Id').apply(lambda row: row.target.median()))
plt.show()
plt.title('test')
plt.hist(unique_sub_df.Predicted)
plt.show()


In [23]:
# sample_submition = pd.DataFrame(test_df.groupby('Id').apply(lambda row: row.mean_model.mean()), columns=['Predicted'])
# sample_submition = sample_submition.reset_index()

sample_submition = unique_sub_df
sample_submition

In [24]:
EXPERIMENT_NAME = 'lgb_xgb_np_nan_folds20'

sample_submition.to_csv(EXPERIMENT_NAME+'_sub.csv', index=False)
cross_validation_df.to_csv(EXPERIMENT_NAME+'_cv.csv', index=False)
test_df.drop(columns=columns_for_model[1:]).to_csv(EXPERIMENT_NAME+'_test.csv', index=False)