# Import

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Data Exploration
- What is a credit score?
- Remove the name column

In [2]:
# Location of the competition files inside the Kaggle runtime
input_dir = '/kaggle/input/playground-series-s4e1'

# Columns kept from the training file (everything used downstream plus the target)
train_columns = [
    'id', 'CustomerId', 'CreditScore', 'Geography', 'Gender',
    'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'EstimatedSalary', 'Exited',
]

train_set = pd.read_csv(input_dir + '/train.csv')
test_set = pd.read_csv(input_dir + '/test.csv')
train_set = train_set[train_columns]


# One-hot encoding and StandardScaler
- When to use StandardScaler and when to use MinMaxScaler?
    - Feature is not roughly bell-shaped (non-normal): MinMaxScaler
    - Feature is roughly bell-shaped (normal): StandardScaler
- One-hot encode Geography and Gender

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [4]:
def one_hot(frame, column, expected_columns=None):
    """Return 0/1 indicator columns for ``frame[column]``.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding the categorical column.
    column : str
        Name of the column to encode; also used as the dummy-column prefix.
    expected_columns : sequence of str, optional
        When given, the result is re-indexed to exactly these columns:
        missing categories are added as all-zero columns and categories
        not listed are dropped. Use it to force the test encoding to match
        the train encoding.

    Returns
    -------
    pd.DataFrame
        Integer indicator columns sharing ``frame``'s index.
    """
    dummies = pd.get_dummies(frame[column], prefix=column).astype(int)
    if expected_columns is not None:
        dummies = dummies.reindex(columns=expected_columns, fill_value=0)
    return dummies


# --- Train: one-hot encode 'Gender' and 'Geography' -------------------------
encoded_gender = one_hot(train_set, 'Gender')
encoded_geography = one_hot(train_set, 'Geography')

# Remember the train layout so the test set gets identical columns
gender_columns = encoded_gender.columns
geography_columns = encoded_geography.columns

# Append the indicator columns and drop the raw categorical columns
train_set = pd.concat([train_set, encoded_gender, encoded_geography], axis=1)
train_set = train_set.drop(columns=['Gender', 'Geography'])

# --- Test: same encoding, aligned to the train columns ----------------------
# Encoding the test set independently would silently produce a different
# column set whenever a category is absent from (or only present in) test.
encoded_gender = one_hot(test_set, 'Gender', gender_columns)
encoded_geography = one_hot(test_set, 'Geography', geography_columns)

test_set = pd.concat([test_set, encoded_gender, encoded_geography], axis=1)
test_set = test_set.drop(columns=['Gender', 'Geography'])



In [5]:
from sklearn.preprocessing import StandardScaler

# Numeric features that get standardised (zero mean, unit variance)
features_to_scale = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']

# Initialize the StandardScaler
scaler = StandardScaler()

# --- Train: learn the scaling statistics and apply them ---------------------
# Work on a copy so the unscaled train_set stays available
scaled_train_set = train_set.copy()
scaled_features = scaler.fit_transform(train_set[features_to_scale])

# Keep the source index so the assignment below lines up row-for-row even if
# the frame does not carry a default RangeIndex
scaled_features_df = pd.DataFrame(
    scaled_features, columns=features_to_scale, index=train_set.index
)
scaled_train_set[features_to_scale] = scaled_features_df

# --- Test: reuse the statistics learned on train ----------------------------
# The scaler must NOT be re-fitted here: calling fit_transform on the test set
# would scale it with its own mean/std, so the same raw value would map to a
# different scaled value than the one the models were trained on.
scaled_test_set = test_set.copy()
scaled_features = scaler.transform(test_set[features_to_scale])

scaled_features_df = pd.DataFrame(
    scaled_features, columns=features_to_scale, index=test_set.index
)
scaled_test_set[features_to_scale] = scaled_features_df


# Baseline Modeling with XGBoost

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Baseline: a default XGBoost classifier trained on the scaled, one-hot
# encoded training frame and scored on a 20% hold-out split.

# Model inputs (id / CustomerId are left out) and the target column
feature_columns = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Gender_Female', 'Gender_Male',
    'Geography_France', 'Geography_Germany', 'Geography_Spain',
]
X = scaled_train_set[feature_columns]
y = scaled_train_set['Exited']

# 80/20 split with a fixed seed so the hold-out set is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# DataFrame copies of both splits
Xtrain, Xtest = X_train.copy(), X_test.copy()

# Fit XGBoost with its default hyper-parameters
model = XGBClassifier()
model.fit(X_train, y_train)

# Score the hold-out predictions
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.87


# Ensemble stacking 

In [7]:
from sklearn.model_selection import KFold
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Shared settings for the stacking cells
SEED = 0    # random_state handed to every base model, for reproducibility
NFOLDS = 5  # number of folds used for the out-of-fold predictions

# Row counts of the train / hold-out splits
ntrain, ntest = X_train.shape[0], X_test.shape[0]

# Shuffled K-fold splitter with a fixed seed
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)


# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Thin wrapper that gives every base classifier the same small API.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``.
    params : dict or None, default None
        Extra keyword arguments for the estimator constructor. The dict is
        copied before ``random_state`` is added, so the caller's object is
        left untouched. ``None`` means "no extra arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy first: writing random_state into the caller's dict would leak
        # the seed into every other user of that dict, and ``None`` (the
        # declared default) cannot be subscripted at all.
        settings = dict(params) if params is not None else {}
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on ``(x, y)`` and print the estimator's ``feature_importances_``."""
        print(self.clf.fit(x, y).feature_importances_)

In [9]:
def get_oof(clf, x_train, y_train, x_test):
    """Build out-of-fold predictions for one base model.

    Uses the module-level ``kf`` splitter. (A later cell redefines
    ``get_oof`` with an explicit ``kf`` parameter; that definition replaces
    this one once it has run.)

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like
        Training features and labels; converted with ``np.asarray`` so they
        can be indexed by the integer positions the splitter yields.
    x_test : array-like
        Rows to predict with every fold's model.

    Returns
    -------
    (oof_train, oof_test) : tuple of np.ndarray
        ``oof_train`` has shape ``(len(x_train), 1)``: each row is predicted
        by the fold model that did not see it. ``oof_test`` has shape
        ``(len(x_test), 1)``: the mean of the per-fold predictions.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Size the buffers from the actual inputs and splitter. The globals
    # ntrain / ntest describe the hold-out split and need not match x_test.
    n_folds = kf.get_n_splits()
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_skf = np.empty((n_folds, x_test.shape[0]))

    # A KFold object is not iterable by itself; .split() yields the
    # (train_index, test_index) pairs.
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.train(x_train[train_index], y_train[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [10]:
# Constructor arguments for the five base classifiers used in the stack.

# Random Forest parameters
# NOTE: 'warm_start' is deliberately left at its default (False). The same
# estimator object is re-fitted once per fold; with warm_start=True and an
# unchanged n_estimators, scikit-learn keeps the trees from the first fit and
# adds none, so later folds would be "predicted" by a model that had already
# seen those rows.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}

# Extra Trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

In [12]:
# Wrap each of the five base models in a SklearnHelper, all sharing SEED
rf = SklearnHelper(RandomForestClassifier, SEED, rf_params)
et = SklearnHelper(ExtraTreesClassifier, SEED, et_params)
ada = SklearnHelper(AdaBoostClassifier, SEED, ada_params)
gb = SklearnHelper(GradientBoostingClassifier, SEED, gb_params)
svc = SklearnHelper(SVC, SEED, svc_params)

In [13]:
def get_oof(clf, x_train, y_train, x_test):
    """Return out-of-fold train predictions and fold-averaged test predictions.

    Splits come from the module-level ``kf``. (A later cell defines a
    ``get_oof`` variant that takes ``kf`` as an argument and supersedes this
    one once executed.)

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like
        Training features / labels; coerced with ``np.asarray`` so positional
        fold indices can be applied to them.
    x_test : array-like
        Rows scored by each fold's model.

    Returns
    -------
    tuple of np.ndarray
        ``(oof_train, oof_test)`` as column vectors of shape
        ``(len(x_train), 1)`` and ``(len(x_test), 1)``.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Derive sizes from the arguments and the splitter; the ntrain / ntest
    # globals are computed from the hold-out split and may not match x_test.
    fold_count = kf.get_n_splits()
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_skf = np.empty((fold_count, x_test.shape[0]))

    # KFold itself cannot be iterated; index pairs come from .split()
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.train(x_train[train_index], y_train[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [14]:
# Convert the split to plain NumPy arrays so fold indices can be applied
# positionally. np.asarray is a no-op on arrays, so this cell can be re-run
# safely (``.values`` would fail on the second run), and it avoids the
# deprecated ``Series.ravel``.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train).ravel()

In [16]:
def get_oof(clf, x_train, y_train, x_test, kf):
    """Build out-of-fold predictions for one base model.

    Parameters
    ----------
    clf : object
        Model exposing ``fit(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like
        Training features and labels; converted with ``np.asarray`` so the
        positional indices produced by ``kf`` can be applied to them.
    x_test : array-like
        Rows to predict with every fold's model.
    kf : splitter
        Object exposing ``split(X)`` (yielding ``(train_index, test_index)``
        pairs) and ``get_n_splits()``.

    Returns
    -------
    (oof_train, oof_test) : tuple of np.ndarray
        ``oof_train`` is 1-D with one entry per training row, predicted by
        the fold model that did not train on that row. ``oof_test`` is 1-D
        with one entry per test row: the mean of the per-fold predictions.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    # Ask the splitter that was passed in for its fold count instead of
    # reading the NFOLDS global: a mismatch would either overflow the buffer
    # or leave uninitialised rows that get averaged into the result.
    n_folds = kf.get_n_splits()

    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((n_folds, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train, oof_test


In [24]:
# Keep only the model input columns, in the order the base models were trained on
test_feature_columns = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Gender_Female', 'Gender_Male',
    'Geography_France', 'Geography_Germany', 'Geography_Spain',
]
scaled_test_set = scaled_test_set[test_feature_columns]

In [26]:
# Plain NumPy array for the fold loop; np.asarray leaves an existing array
# untouched, so re-running this cell does not fail the way ``.values`` would.
scaled_test_set = np.asarray(scaled_test_set)

In [29]:
# Out-of-fold train predictions and fold-averaged test predictions for each
# base model. These become the input features of the second-level model.
# Order: Extra Trees, Random Forest, AdaBoost, Gradient Boost, Support Vector Classifier
base_learners = [et, rf, ada, gb, svc]
oof_pairs = [
    get_oof(learner, X_train, y_train, scaled_test_set, kf)
    for learner in base_learners
]
(
    (et_oof_train, et_oof_test),
    (rf_oof_train, rf_oof_test),
    (ada_oof_train, ada_oof_test),
    (gb_oof_train, gb_oof_test),
    (svc_oof_train, svc_oof_test),
) = oof_pairs

print("Training is complete")

Training is complete


In [31]:
# Turn every prediction vector into a single-column 2-D array so the
# vectors can be placed side by side as features.
et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train = (
    column.reshape(-1, 1)
    for column in (et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train)
)

et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test = (
    column.reshape(-1, 1)
    for column in (et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test)
)

In [32]:
# Second-level feature matrices: one column per base model, same column
# order for train and test.
train_meta_columns = [et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train]
test_meta_columns = [et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test]

x_train = np.concatenate(train_meta_columns, axis=1)
x_test = np.concatenate(test_meta_columns, axis=1)

In [33]:
# Second-level model: XGBoost trained on the base models' out-of-fold
# predictions, then used to label the stacked test features.
gbm_settings = dict(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm = XGBClassifier(**gbm_settings)
gbm = gbm.fit(x_train, y_train)

predictions = gbm.predict(x_test)

In [47]:
# Pair every test id with its predicted label. ``predictions`` is in the
# same row order as test_set because the stacked test features were built
# from it without reordering.
submission = test_set[['id']].copy()
submission['Exited'] = predictions

# index=False: without it pandas writes the row index as an extra unnamed
# first column, so the file would no longer be just ``id,Exited``.
submission.to_csv('submission.csv', index=False)

In [46]:
# Display the two-column submission frame (id, Exited) as a visual check
submission

Unnamed: 0,id,Exited
0,165034,0
1,165035,1
2,165036,0
3,165037,0
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,0
110021,275055,0
