In [None]:
#loading libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import xgboost as xgb
from collections import Counter
from tqdm.notebook import tqdm as tn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

sns.set(style='white', context='notebook', palette='deep')

## LOAD DATA

In [None]:
# Read the client tables, the invoice tables and the sample submission.
train, train_inv, test, test_inv, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/recognizance-1/client_train.csv",
        "../input/recognizance-1/train_invoice.csv",
        "../input/recognizance-1/client_test.csv",
        "../input/recognizance-1/test_invoice.csv",
        "../input/recognizance-1/sample_submission.csv",
    )
)
# Client ids of the test table, taken straight from the loaded file.
IDtest = test["client_id"]

In [None]:
# Report the (rows, columns) shape of every loaded table.
for label, frame in (
    ('length of train csv - ', train),
    ('length of test csv - ', test),
    ('length of train info csv - ', train_inv),
    ('length of test info csv - ', test_inv),
):
    print(label, frame.shape)

## FEATURE ENGINEERING AND PREPROCESSING

In [None]:
def feature_change(cl, inv, ref_year=2021):
    """Derive row-level features for the client and invoice tables.

    Both inputs are copied first, so the frames passed in are left untouched
    and the call can be repeated safely. (Mapping ``counter_type`` a second
    time on an already-mapped column would turn every value into NaN.)

    Parameters
    ----------
    cl : pandas.DataFrame
        Client table. Reads ``client_catg``, ``disrict``, ``region`` and
        ``creation_date``.
    inv : pandas.DataFrame
        Invoice table. Reads ``counter_type``, ``counter_statue``,
        ``invoice_date``, ``old_index`` and ``new_index``.
    ref_year : int, default 2021
        Reference year used in the ``coop_time`` formula.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New client and invoice frames carrying the converted / added columns.
    """
    cl = cl.copy()
    inv = inv.copy()

    for col in ('client_catg', 'disrict', 'region'):
        cl[col] = cl[col].astype('category')
    # Coarse region bucket: 100 for codes below 100, 300 for codes above 300, else 200.
    cl['region_group'] = cl['region'].apply(lambda x: 100 if x < 100 else 300 if x > 300 else 200)
    # NOTE(review): parsed without dayfirst, unlike invoice_date below —
    # confirm the raw date format of both columns.
    cl['creation_date'] = pd.to_datetime(cl['creation_date'])

    # (ref_year - creation year) * 12 - creation month
    cl['coop_time'] = (ref_year - cl['creation_date'].dt.year) * 12 - cl['creation_date'].dt.month

    inv['counter_type'] = inv['counter_type'].map({"ELEC": 1, "GAZ": 0})
    # Normalise the mixed int/str status codes; values missing from the mapping become NaN.
    inv['counter_statue'] = inv['counter_statue'].map({0:0,1:1,2:2,3:3,4:4,5:5,769:5,'0':0,'5':5,'1':1,'4':4,'A':0,618:5,269375:5,46:5,420:5})

    inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
    inv['invoice_month'] = inv['invoice_date'].dt.month
    inv['invoice_year'] = inv['invoice_date'].dt.year
    # dayofweek is 0 (Monday) .. 6 (Sunday), so `// 5 == 1` is true on Saturday
    # and Sunday: despite its name the flag is 1.0 on weekends. The column
    # name is kept because the aggregation step reads it.
    inv['is_weekday'] = (inv['invoice_date'].dt.dayofweek // 5 == 1).astype(float)
    inv['delta_index'] = inv['new_index'] - inv['old_index']  # index change recorded on the invoice

    return cl, inv

In [None]:
# Run the row-level feature engineering on the train tables and on the test tables.
client_train1, invoice_train1 = feature_change(train, train_inv)
client_test1, invoice_test1 = feature_change(test, test_inv)

In [None]:
# Aggregate invoice columns per client_id and merge them onto the client table.
def agg_feature(invoice, client_df, agg_stat):
    """Build one feature row per client from its invoices.

    Parameters
    ----------
    invoice : pandas.DataFrame
        Invoice table. Must contain ``client_id``, ``invoice_date`` (datetime),
        ``is_weekday`` and every column named in ``agg_stat``. Not modified.
    client_df : pandas.DataFrame
        Client table. Must contain ``client_id`` and ``coop_time``.
    agg_stat : list of str
        Invoice columns to summarise with mean / std / min / max.

    Returns
    -------
    pandas.DataFrame
        ``client_df`` left-joined with, in order: ``is_weekday_mean`` /
        ``is_weekday_std``, ``transactions_count``, the ``<col>_<stat>``
        aggregates (including ``delta_time``), and
        ``invoice_per_cooperation`` (= transactions_count / coop_time).
    """
    # Work on a fresh frame with unique row labels so the caller's table is
    # left alone and the label-based alignment below is unambiguous.
    invoice = invoice.reset_index(drop=True)

    # Days since the same client's previous invoice. The diff is computed on a
    # sorted view but keeps the original row labels; assigning it back aligns
    # on those labels, so each gap ends up on the invoice row it belongs to.
    # (Resetting the index of the diff here would scatter the gaps across
    # unrelated rows whenever the table is not already sorted.)
    gap_days = (
        invoice.sort_values(['client_id', 'invoice_date'])
        .groupby('client_id')['invoice_date']
        .diff()
        .dt.days
    )
    invoice['delta_time'] = gap_days

    per_client = invoice.groupby('client_id')

    agg_trans = per_client[list(agg_stat) + ['delta_time']].agg(['mean', 'std', 'min', 'max'])
    # Flatten the (column, stat) MultiIndex into "<column>_<stat>" names.
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans = agg_trans.reset_index()

    counts = per_client.size().reset_index(name='transactions_count')
    agg_trans = pd.merge(counts, agg_trans, on='client_id', how='left')

    weekday_avg = per_client[['is_weekday']].agg(['mean', 'std'])
    weekday_avg.columns = ['_'.join(col).strip() for col in weekday_avg.columns.values]
    weekday_avg = weekday_avg.reset_index()
    client_df = pd.merge(client_df, weekday_avg, on='client_id', how='left')

    full_df = pd.merge(client_df, agg_trans, on='client_id', how='left')

    full_df['invoice_per_cooperation'] = full_df['transactions_count'] / full_df['coop_time']

    return full_df

In [None]:
# Invoice columns that agg_feature summarises per client with mean/std/min/max.
agg_stat_columns = [
 'tarif_type',
 'counter_number',
 'counter_statue',
 'counter_code',
 'reading_remarque',
 'consommation_level_1',
 'consommation_level_2',
 'consommation_level_3',
 'consommation_level_4',
 'old_index',
 'new_index',
 'months_number',
 'counter_type',
 'invoice_month',
 'invoice_year',
 'delta_index'
]

# One aggregated feature row per client, for the train and the test tables.
train_df1 = agg_feature(invoice_train1, client_train1, agg_stat_columns)
test_df1 = agg_feature(invoice_test1, client_test1, agg_stat_columns)
#data1 = agg_feature(invoice_data1, client_data1, agg_stat_columns)

In [None]:
def new_features(df, cols=None):
    """Add ratio / spread features derived from the per-client aggregates.

    For every base column ``c`` the frame must already hold ``c_max``,
    ``c_min``, ``c_mean`` and ``c_std``; three columns are added:

    * ``c_range``    = ``c_max - c_min``
    * ``c_max_mean`` = ``c_max / c_mean``
    * ``c_std_mean`` = ``c_mean / c_std``

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated feature frame. Modified in place.
    cols : iterable of str, optional
        Base column names. Defaults to the module-level ``agg_stat_columns``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    if cols is None:
        cols = agg_stat_columns

    for col in cols:
        col_max, col_min = df[col+'_max'], df[col+'_min']
        col_mean, col_std = df[col+'_mean'], df[col+'_std']

        df[col+'_range'] = col_max - col_min
        df[col+'_max_mean'] = col_max / col_mean
        # NOTE(review): the name reads "std / mean" but the value is mean / std;
        # kept as is so the trained feature set does not change — confirm intent.
        df[col+'_std_mean'] = col_mean / col_std
    return df

In [None]:
# Add the range / ratio features on top of the per-client aggregates.
train_df2 = new_features(train_df1)
test_df2 = new_features(test_df1)
#data2 = new_features(data1)

In [None]:
def drop(df):
    """Remove the ``client_id`` and ``creation_date`` columns from ``df``.

    The frame is modified in place and the same object is returned.
    """
    df.drop(columns=['client_id', 'creation_date'], inplace=True)
    return df

In [None]:
# drop() works in place and returns its argument, so train_df / test_df are
# the same objects as train_df2 / test_df2 (now without the dropped columns).
train_df = drop(train_df2)
test_df = drop(test_df2)
#data_df = drop(data2)

In [None]:
# Separate the label column from the model inputs.
y_train = train_df2['target']
x_train = train_df2.drop(columns='target')
# Names of the training columns at this stage.
feature_name = list(x_train.columns)

In [None]:
# Aggregated features excluded from both the training and the test matrices.
drop_col = [
    'reading_remarque_max',
    'counter_statue_min',
    'counter_type_min',
    'counter_type_max',
    'counter_type_range',
    'tarif_type_max',
    'delta_index_min',
    'consommation_level_4_mean',
]

x_train = x_train.drop(columns=drop_col)
test_df = test_df.drop(columns=drop_col)

## Hyperparameter selection

Optuna was used to select the best hyperparameters for **LightGBM (Light Gradient Boosting Machine)**.

In [None]:
from optuna import Trial
import gc
import optuna
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

#category_cols = ['disrict', 'client_catg', 'region', 'region_group']

def objective(trial:Trial):
    """Optuna objective: score one trial's LightGBM configuration.

    Delegates to ``fitLGBM`` on the notebook-level ``x_train`` / ``y_train``
    and returns the score it reports, averaged over the fitted models
    (a single model here).
    """
    gc.collect()

    model, cv_score = fitLGBM(trial, x_train, y_train)
    fitted_models = [model]

    gc.collect()

    # With one fitted model the average is simply that model's score.
    return cv_score / len(fitted_models)

In [None]:
def fitLGBM(trial,X, y):
    """Cross-validate one Optuna-sampled LightGBM configuration.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyperparameters.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Binary target, in the same row order as ``X``.

    Returns
    -------
    tuple
        ``(model, mean_auc)``: the classifier as left after the last fold's
        fit, and the mean ROC AUC over the 4 stratified folds.
    """
    params = {
        # Lower bound is 1: a 0-round model has no trees to evaluate.
        'n_estimators': trial.suggest_int('n_estimators', 1, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'max_depth': trial.suggest_int('max_depth', 2, 128),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.15),
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.001, 0.1),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        # suggest_int takes integer bounds (the previous lower bound was 0.1).
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'verbosity': -1,
        'random_state': 2021,
    }
    stkfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=2021)
    model = LGBMClassifier(**params)

    # The splitter yields positional indices. Select target rows by position
    # too, so a non-default index on `y` cannot pick the wrong rows; plain
    # arrays are still indexed directly.
    y_rows = y.iloc if hasattr(y, 'iloc') else y

    fold_aucs = []
    for tdx, vdx in stkfold.split(X, y):
        X_tr, X_val = X.iloc[tdx], X.iloc[vdx]
        y_tr, y_val = y_rows[tdx], y_rows[vdx]
        # NOTE(review): fit-time early_stopping_rounds / verbose belong to the
        # older LightGBM sklearn API; newer releases expect callbacks — confirm
        # the installed version before upgrading.
        model.fit(X_tr, y_tr,
                  eval_set=[(X_tr, y_tr), (X_val, y_val)],
                  early_stopping_rounds=30, verbose=False)
        preds = model.predict_proba(X_val)
        false_positive_rate, true_positive_rate, _ = roc_curve(y_val, preds[:, 1])
        fold_auc = auc(false_positive_rate, true_positive_rate)
        print (fold_auc)
        fold_aucs.append(fold_auc)

    return model, np.mean(fold_aucs)

In [None]:
#study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
#study.optimize(objective,None , timeout=60*60*2)

## Training and prediction

Used **stratified K-fold cross-validation** on a single LightGBM model to obtain the score we achieved on the private leaderboard.

In [None]:


# LightGBM classifier with hard-coded hyperparameters
# (presumably taken from the Optuna search above — TODO confirm).
# NOTE(review): bagging_freq / bagging_seed are set but no bagging_fraction is
# passed; per the LightGBM docs bagging only takes effect when
# bagging_fraction < 1.0 — confirm this is intended.
model = LGBMClassifier(random_state=2021, n_estimators=5000,num_leaves=454, max_depth=60, min_child_samples= 200,
                       learning_rate=0.006910869038433314, min_split_gain=0.00667926424629105,
                       feature_fraction=0.3764303138879782, bagging_freq=9,bagging_seed = 9, boosting_type = "gbdt", metric = 'auc')
                   #   reg_alpha = 0.3,reg_lambda = 0.3)

# 5-fold stratified splitter, shuffled with a fixed seed.
stkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)

def calc(X, y, model, cv, X_test=None):
    """Cross-validate ``model`` and average its test-set predictions over folds.

    For each fold the model is refitted on the training part, its ROC AUC on
    the validation part is printed, and class-1 probabilities are predicted
    for ``X_test``. The mean AUC over folds is printed at the end.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Binary target, in the same row order as ``X``.
    model : estimator
        Classifier exposing ``fit(..., eval_set=..., early_stopping_rounds=...,
        verbose=...)`` and ``predict_proba``.
    cv : cross-validation splitter
        Object whose ``split(X, y)`` yields ``(train_idx, valid_idx)`` pairs.
    X_test : pandas.DataFrame, optional
        Features to predict on. Defaults to the notebook-level ``test_df``.

    Returns
    -------
    pandas.DataFrame
        Single column ``target``: the fold-averaged class-1 probability for
        every row of ``X_test``.
    """
    if X_test is None:
        X_test = test_df

    # The splitter yields positional indices. Select target rows by position
    # too, so a non-default index on `y` cannot pick the wrong rows; plain
    # arrays are still indexed directly.
    y_rows = y.iloc if hasattr(y, 'iloc') else y

    fold_aucs = []
    fold_test_probs = {}

    for i, (tdx, vdx) in tn(enumerate(cv.split(X, y))):
        X_tr, X_val = X.iloc[tdx], X.iloc[vdx]
        y_tr, y_val = y_rows[tdx], y_rows[vdx]
        model.fit(X_tr, y_tr,
                  eval_set=[(X_tr, y_tr), (X_val, y_val)],
                  early_stopping_rounds=300, verbose=0)

        valid_probs = model.predict_proba(X_val)
        # These are predictions on the test rows, not out-of-fold predictions.
        fold_test_probs['fold_%i' % i] = model.predict_proba(X_test)[:, 1]

        false_positive_rate, true_positive_rate, _ = roc_curve(y_val, valid_probs[:, 1])
        fold_auc = auc(false_positive_rate, true_positive_rate)
        print (fold_auc)
        fold_aucs.append(fold_auc)

    print('ROC AUC:', round(np.mean(fold_aucs), 6))

    # Average the per-fold test predictions row-wise.
    probs = pd.DataFrame({'target': pd.DataFrame(fold_test_probs).mean(axis=1)})
    return probs

In [None]:
%%time
# NOTE(review): x1_train / y1_train are not defined anywhere in the visible
# notebook (only x_train / y_train are) — this cell fails on a fresh kernel
# unless they come from a cell that is not shown. Confirm the intended inputs.
probs2 = calc(x1_train, y1_train, model, stkfold)

In [None]:
# submission
# Build the submission file as a weighted blend of three prediction frames
# (weights 0.08 + 0.07 + 0.85 = 1.0).
# NOTE(review): `probs` and `probs1` are not defined anywhere in the visible
# notebook (only `probs2` is) — confirm where they are produced.
# NOTE(review): ids come from `sample`, while predictions follow the row order
# of the test features — assumes both are in the same client order; verify.
submission = pd.DataFrame({
        "client_id": sample["client_id"],
        "target": 0.08*probs['target'] + 0.07*probs2['target']+0.85*probs1['target']
    })
submission.to_csv('LGBM_with_stacks.csv', index=False)