### Import Libraries

In [62]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
import warnings
warnings.filterwarnings("ignore")

### Load Data

In [63]:
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
df_sub=pd.read_csv('sample_submission.csv')
df_org=pd.read_csv('Churn_Modelling.csv')

In [64]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scale_cols = ['Age','CreditScore', 'Balance','EstimatedSalary']
for col in scale_cols:
    df_train[col+"_scaled"]=scaler.fit_transform(df_train[col].values.reshape(-1,1)) # (-1,1) is for reshaping the array to 2D
    df_test[col+"_scaled"]=scaler.transform(df_test[col].values.reshape(-1,1))

In [65]:
df_train.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
CustomerId,15674932,15749177,15694510,15741417,15766172
Surname,Okwudilichukwu,Okwudiliolisa,Hsueh,Kao,Chiemenam
CreditScore,668,627,678,581,716
Geography,France,France,France,France,Spain
Gender,Male,Male,Male,Male,Male
Age,33.0,33.0,40.0,34.0,33.0
Tenure,3,1,10,2,5
Balance,0.0,0.0,0.0,148882.54,0.0
NumOfProducts,2,2,2,1,2


### Feature Engineering

In [66]:
def get_Feats(df):
    df["IsSenior"]=df["Age"].apply(lambda x:1 if x>=60 else 0)
    df["IsActive_by_CC"]=df["HasCrCard"]*df["IsActiveMember"]
    df["Products_per_Tenure"]=df["Tenure"]/df["NumOfProducts"]
    df["Age_cat"]=np.round(df["Age"]/20).astype('int').astype('category')
    df['Sur_Geo_Gend_Sal'] = df['Surname']+df['Geography']+df['Gender']+np.round(df.EstimatedSalary).astype('str')
    return df

In [67]:
df_train=get_Feats(df_train)
df_test=get_Feats(df_test)

features=df_train.columns.drop('id','Exited').drop(scale_cols)

features

Index(['CustomerId', 'Surname', 'Geography', 'Gender', 'Tenure',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Exited', 'Age_scaled',
       'CreditScore_scaled', 'Balance_scaled', 'EstimatedSalary_scaled',
       'IsSenior', 'IsActive_by_CC', 'Products_per_Tenure', 'Age_cat',
       'Sur_Geo_Gend_Sal'],
      dtype='object')

In [68]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,165034.0,82516.5,47641.3565,0.0,41258.25,82516.5,123774.8,165033.0
CustomerId,165034.0,15692010.0,71397.816791,15565701.0,15633140.0,15690170.0,15756820.0,15815690.0
CreditScore,165034.0,656.4544,80.10334,350.0,597.0,659.0,710.0,850.0
Age,165034.0,38.12589,8.867205,18.0,32.0,37.0,42.0,92.0
Tenure,165034.0,5.020353,2.806159,0.0,3.0,5.0,7.0,10.0
Balance,165034.0,55478.09,62817.663278,0.0,0.0,0.0,119939.5,250898.09
NumOfProducts,165034.0,1.554455,0.547154,1.0,1.0,2.0,2.0,4.0
HasCrCard,165034.0,0.7539537,0.430707,0.0,1.0,1.0,1.0,1.0
IsActiveMember,165034.0,0.4977702,0.499997,0.0,0.0,0.0,1.0,1.0
EstimatedSalary,165034.0,112574.8,50292.865585,11.58,74637.57,117948.0,155152.5,199992.48


In [75]:
X=df_train[features]
y=df_train['Exited']

cat_features=np.where(X.dtypes!=np.float64)[0]
cat_features

array([ 0,  1,  2,  3,  4,  5,  8, 13, 16, 17], dtype=int64)

### Training