# Credit Card Default Risk - Prediction

## Import Libraries and Data

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, plot_roc_curve, f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras

In [None]:
# Import the warnings module and silence every warning for the rest of the
# notebook.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [None]:
train_data = pd.read_csv('../input/credit-card-default-dataset/dataset/train.csv')
train_data.head()

In [None]:
test_data = pd.read_csv('../input/credit-card-default-dataset/dataset/test.csv')
test_data.head()

### Inspect data and treat data

In [None]:
# check shape and length
print(train_data.shape)
print(len(train_data))

In [None]:
print(test_data.shape)
print(len(test_data))

In [None]:
train_data.info()

In [None]:
# check for null values
train_data.isna().mean()*100

In [None]:
train_data.isna().sum()

Check for data imbalance

In [None]:
# check data imbalance
train_data.credit_card_default.value_counts(normalize=True)*100

In [None]:
train_data.head()

In [None]:
# Print the value frequencies of every low-cardinality column (at most 20
# distinct non-null values), i.e. the columns that look categorical.
cols = train_data.columns

for col_name in cols:
    frequencies = train_data[col_name].value_counts()
    if len(frequencies) > 20:
        continue
    print("Column Name:", col_name)
    print(frequencies)
    print("*"*50)

Since the missing values mostly amount to less than 2% per column, let us impute all of them statistically.
<br>
That is, fill NA with the MODE for a categorical variable and with the MEDIAN for a numerical variable.

In [None]:
train_data.isna().sum().sort_values(ascending=False).head(10)

In [None]:
train_data[train_data['gender'] == 'XNA']

In [None]:
# Recode the placeholder gender 'XNA' as 'M' (the original author judged from
# the customer's name that the record is male), then show the new counts.
# NOTE(review): the test-set section of this notebook recodes 'XNA' to 'F'
# instead — confirm the two treatments are meant to differ.
train_data['gender'] = train_data['gender'].replace('XNA', 'M')
train_data['gender'].value_counts()

impute <i>no_of_children</i> using mode of the data

In [None]:
# Fill missing no_of_children with the mode of the column.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: the in-place form works on an
# intermediate object and is not guaranteed to update train_data (it does not
# under pandas Copy-on-Write).
train_data['no_of_children'] = train_data['no_of_children'].fillna(
    train_data['no_of_children'].mode()[0]
)

impute <i>owns_car</i> using mode of the data

In [None]:
# Fill missing owns_car with the mode of the column.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: the in-place form works on an
# intermediate object and is not guaranteed to update train_data (it does not
# under pandas Copy-on-Write).
train_data['owns_car'] = train_data['owns_car'].fillna(
    train_data['owns_car'].mode()[0]
)

impute <i>no_of_days_employed</i> using Median based on <i>occupation_type</i> of the data

In [None]:
# impute 'no_of_days_employed' with median based on 'occupation_type'
occp_days_employed = train_data.groupby(by='occupation_type')['no_of_days_employed'].median()
occp_days_employed

In [None]:
occp_days_employed['Accountants']

In [None]:
# Fill each missing 'no_of_days_employed' with the median of the row's
# 'occupation_type' group. Rows whose occupation has no group median
# (e.g. a missing occupation_type) stay NaN.
train_data['no_of_days_employed'] = train_data['no_of_days_employed'].fillna(
    train_data['occupation_type'].map(occp_days_employed)
)

In [None]:
train_data.no_of_days_employed.isna().sum()

In [None]:
train_data.head()

In [None]:
# Lets create bins for net_yearly_income
# boxplot of net_yearly_income

train_data.net_yearly_income.plot.box()
plt.show()

In [None]:
train_data.net_yearly_income.quantile([0.25, 0.5, 0.75, 0.99, 1.00])

In [None]:
train_data[train_data.net_yearly_income > train_data.net_yearly_income.quantile(0.99)]

In [None]:
# Lets drop the data above 99%ile 
train_data = train_data[train_data.net_yearly_income <= train_data.net_yearly_income.quantile(0.99)]
train_data.head()

In [None]:
train_data.net_yearly_income.plot.box()
plt.show()

In [None]:
train_data.net_yearly_income.quantile([0.25, 0.5, 0.75, 0.99, 1.00])

In [None]:
# binning of net_yearly_income into categories
def income_binning(x):
    """Return the income-category label for a net yearly income ``x``.

    Upper bounds are inclusive: up to 125824.505 is 'Low', up to 170968.03 is
    'Below Avg', up to 237768.92 is 'Above Avg'. Everything else is 'High',
    including NaN, which fails every comparison.
    The bounds are presumably the training-set quartiles — confirm.
    """
    upper_bounds = (
        (125824.5050, 'Low'),
        (170968.0300, 'Below Avg'),
        (237768.9200, 'Above Avg'),
    )
    for bound, label in upper_bounds:
        if x <= bound:
            return label
    return 'High'


In [None]:
train_data['income_categories'] = train_data.net_yearly_income.apply(income_binning)

In [None]:
train_data.head()

In [None]:
# 

impute <i>yearly_debt_payments</i> using Median based on <i>credit_card_default</i> of the data

In [None]:
# impute 'yearly_debt_payments' with median based on 'credit_card_default'
yearDebt_credit = train_data.groupby(by='credit_card_default')['yearly_debt_payments'].median()
yearDebt_credit

In [None]:
# Fill each missing 'yearly_debt_payments' with the median of the row's
# 'credit_card_default' class.
# NOTE(review): this imputes a feature from the target label, which leaks the
# target into the features; the test-set section uses the plain column median
# instead — confirm this is intended.
train_data['yearly_debt_payments'] = train_data['yearly_debt_payments'].fillna(
    train_data['credit_card_default'].map(yearDebt_credit)
)

In [None]:
train_data['yearly_debt_payments'].isna().sum()

Let's check any more null values pending in features

In [None]:
train_data.isna().sum().sort_values(ascending=False).head(10)

impute <i>migrant_worker</i> using Mode of the data

In [None]:
train_data['migrant_worker'].mode()[0]

In [None]:
# Fill missing migrant_worker with the mode of the column.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: the in-place form works on an
# intermediate object and is not guaranteed to update train_data (it does not
# under pandas Copy-on-Write).
train_data['migrant_worker'] = train_data['migrant_worker'].fillna(
    train_data['migrant_worker'].mode()[0]
)

# check null values count
train_data['migrant_worker'].isna().sum()

impute <i>total_family_members</i> using Mode of the data

In [None]:
train_data['total_family_members'].mode()[0]

In [None]:
# Fill missing total_family_members with the mode of the column.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: the in-place form works on an
# intermediate object and is not guaranteed to update train_data (it does not
# under pandas Copy-on-Write).
train_data['total_family_members'] = train_data['total_family_members'].fillna(
    train_data['total_family_members'].mode()[0]
)

# check null values count
train_data['total_family_members'].isna().sum()

impute <i>credit_score</i> using Median based on <i>credit_card_default</i> of the data

In [None]:
# check medians of credit_score wrt credit_card_default
cred_score = train_data.groupby(by='credit_card_default')['credit_score'].median()
cred_score

In [None]:
# Fill each missing 'credit_score' with the median of the row's
# 'credit_card_default' class.
# NOTE(review): this imputes a feature from the target label, which leaks the
# target into the features; the test-set section uses the plain column median
# instead — confirm this is intended.
train_data['credit_score'] = train_data['credit_score'].fillna(
    train_data['credit_card_default'].map(cred_score)
)

# count the null values left in 'credit_score'
train_data['credit_score'].isna().sum()

**Check for Null values in the Data Set**

In [None]:
train_data.isna().mean()*100

## EDA

In [None]:
train_data.head(10)

In [None]:
# Plot the credit_card_default class ratio as a pie chart.
# Slices are ordered from the rarest class to the most common one. The labels
# are derived from the class values themselves rather than hard-coded in
# count order, so they stay correct whichever class is the minority.
default_counts = train_data.credit_card_default.value_counts(ascending=True)
total_default_values = default_counts.values
# assumes credit_card_default is coded 1 = defaulter, 0 = non-defaulter
# (consistent with the 1/0 predictions produced later) — TODO confirm
default_labels = ['Defaulter' if cls == 1 else 'Not Defaulter' for cls in default_counts.index]

plt.figure()
plt.pie(total_default_values, labels=default_labels, autopct='%1.1f%%', shadow=True)
plt.title('Defaulter Ratio', fontsize=18)
plt.show()

In [None]:
# visualize AGE feature - distplot
plt.figure()
sns.distplot(train_data['age'])
plt.show()

In [None]:
# Box plot of credit_score split by credit_card_default.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while older releases accept the keywords as well.
plt.figure(figsize=[7,6])
sns.boxplot(x=train_data['credit_card_default'], y=train_data['credit_score'])
plt.show()

In [None]:
# Box plot of credit_limit_used(%) split by credit_card_default.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while older releases accept the keywords as well.
plt.figure(figsize=[7,6])
sns.boxplot(x=train_data['credit_card_default'], y=train_data['credit_limit_used(%)'])
plt.show()

In [None]:
# barplots for occupation types wrt credit_default
plt.figure(figsize=[18,8])
sns.barplot(x=train_data['occupation_type'], y=train_data['net_yearly_income'], hue=train_data['credit_card_default'])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count of customers per gender, split by credit_card_default.
# The gender column is passed as x= explicitly: in seaborn 0.12+ the first
# positional parameter is `data`, not `x`.
plt.figure(figsize=[7,6])
sns.countplot(x=train_data['gender'], hue=train_data['credit_card_default'])
plt.show()

**Check outliers for continuous valued features**

In [None]:
train_data.head()

In [None]:
# Box plot of every continuous variable, one per cell of a 2x3 grid.
continuous_features = [
    'net_yearly_income',
    'no_of_days_employed',
    'yearly_debt_payments',
    'credit_limit',
    'credit_score',
]

plt.figure(figsize=[16, 12])
for position, feature in enumerate(continuous_features, start=1):
    plt.subplot(2, 3, position)
    train_data[feature].plot.box()
    plt.title(f'{feature} Box Plot')
plt.show()

Check and treat outliers for 'net_yearly_income'

In [None]:
# net_yearly_income distribution plot
sns.distplot(train_data['net_yearly_income'])
plt.show()

In [None]:
train_data['net_yearly_income'].quantile([0.25, 0.5, 0.75, 0.99, 0.998, 1.0])

In [None]:
train_data[train_data['net_yearly_income']>train_data['net_yearly_income'].quantile(0.998)]

Remove the outliers that lie above the 99.8% quantile of net_yearly_income

In [None]:
train_data = train_data[train_data['net_yearly_income']<=train_data['net_yearly_income'].quantile(0.998)]

In [None]:
# net_yearly_income distribution plot
sns.distplot(train_data['net_yearly_income'])
plt.show()

In [None]:
# Box plot of every continuous variable, one per cell of a 2x3 grid
# (re-drawn after the net_yearly_income filter above).
continuous_features = [
    'net_yearly_income',
    'no_of_days_employed',
    'yearly_debt_payments',
    'credit_limit',
    'credit_score',
]

plt.figure(figsize=[16, 12])
for position, feature in enumerate(continuous_features, start=1):
    plt.subplot(2, 3, position)
    train_data[feature].plot.box()
    plt.title(f'{feature} Box Plot')
plt.show()

Check and treat outliers for 'no_of_days_employed'

In [None]:
# no_of_days_employed distribution plot
sns.distplot(train_data['no_of_days_employed'])
plt.show()

In [None]:
train_data['no_of_days_employed'].quantile([0.25, 0.5, 0.75, 0.8185, 0.99, 1.0])

The no_of_days_employed feature above contains values like 365252 days, which is around 1000 years and makes no sense.
<br>
So let us remove those outliers as well.

In [None]:
# check data greater than 81.85%ile of no_of_days_employed feature
new_trainTEST_data = train_data[train_data['no_of_days_employed']>train_data['no_of_days_employed'].quantile(0.8185)]
new_trainTEST_data

In [None]:
new_trainTEST_data.credit_card_default.value_counts(normalize=True)*100

#### Check for anomalies in the new data set we have taken out

In [None]:
# Print the value frequencies of every low-cardinality column (at most 20
# distinct non-null values) in the split-off rows held in new_trainTEST_data.
cols = train_data.columns

for col_name in cols:
    frequencies = new_trainTEST_data[col_name].value_counts()
    if len(frequencies) > 20:
        continue
    print("Column Name:", col_name)
    print(frequencies)
    print("*"*50)

In [None]:
new_trainTEST_data_final = train_data[train_data['no_of_days_employed']<=train_data['no_of_days_employed'].quantile(0.8185)]

In [None]:
new_trainTEST_data_final.credit_card_default.value_counts(normalize=True)*100

**Let us remove those outliers (roughly 18% of the rows) from our initial training data and proceed with the rows at or below the 81.85th percentile, as the data imbalance percentage doesn't change even after dropping those data points**

In [None]:
# Keep only the rows whose no_of_days_employed is at or below the 81.85th
# percentile of that column.
days_employed_cutoff = train_data['no_of_days_employed'].quantile(0.8185)
train_data = train_data[train_data['no_of_days_employed'] <= days_employed_cutoff]

In [None]:
# check the shape of remaining data
train_data.shape

In [None]:
# Box plot of every continuous variable, one per cell of a 2x3 grid
# (re-drawn after the no_of_days_employed filter above).
continuous_features = [
    'net_yearly_income',
    'no_of_days_employed',
    'yearly_debt_payments',
    'credit_limit',
    'credit_score',
]

plt.figure(figsize=[15, 10])
for position, feature in enumerate(continuous_features, start=1):
    plt.subplot(2, 3, position)
    train_data[feature].plot.box()
    plt.title(f'{feature} Box Plot')
plt.show()

In [None]:
train_data.head()

### Data Preparation

In [None]:
# dropping text columns that aren't necessary like 'customer_id' and 'name'
train_data.drop(['customer_id', 'name'], axis=1, inplace=True)
train_data.head()

**Encode Categorical Column labels with value between 0 and n_classes-1**

In [None]:
# One LabelEncoder instance is re-fitted for every categorical column, so
# after the loop `lm` only remembers the classes of the last column it saw.
# NOTE(review): the per-column train mappings are therefore not kept anywhere;
# anything that must encode new data consistently with the training set would
# need one fitted encoder per column.
lm = LabelEncoder()

# get categorical columns (object dtype)
cat_cols = train_data.select_dtypes(include=['object']).columns

# replace each categorical column with integer codes 0..n_classes-1
for i in cat_cols:
    train_data[i] = lm.fit_transform(train_data[i])

train_data.head()

## Train-Test Split

In [None]:
# Separate the target from the features, then hold out 30% of the rows for
# testing, stratified on the target so both parts keep the class ratio.
y = train_data['credit_card_default']
X = train_data.drop(columns=['credit_card_default'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=100
)

print("X=",X_train.shape, X_test.shape)
print("y=",y_train.shape, y_test.shape)

### Feature Scaling

In [None]:
# Min-max scale every feature. The scaler is fitted on the training split
# only and then applied to both splits, so the test split does not influence
# the learned ranges.
scaler = MinMaxScaler()

cols = X_train.columns
scaler.fit(X_train[cols])
X_train[cols] = scaler.transform(X_train[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
X_train.head()

#### Data Imbalance Handling

Using SMOTE method, we can balance the data w.r.t. credit_card_default variable and proceed further

In [None]:
# initialize SMOTE method
sm = SMOTE(random_state=42)
X_train,y_train = sm.fit_resample(X_train,y_train)

In [None]:
print("Dimension of X_train_sm Shape:", X_train.shape)
print("Dimension of y_train_sm Shape:", y_train.shape)

In [None]:
X_train = pd.DataFrame(data=X_train, columns=cols)
print("X_train.shape:",X_train.shape)
y_train = pd.Series(y_train)
print("y_train.shape:",y_train.shape)
X_train.head()

## Logistic Regression

In [None]:
# initialize LogisticRegression and fit the model
logreg = LogisticRegression(solver='liblinear', random_state=42)
logreg.fit(X_train, y_train)

In [None]:
# evaluation function
def evaluation(model):
    """Print train/test metrics for a fitted classifier and plot its test ROC curve.

    Works on the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``.
    For each split it prints the confusion matrix, the classification report,
    the accuracy and the macro-averaged F1 score (both scaled to percent).

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and accepted by
        ``plot_roc_curve``.
    """
    # Predict both splits up front, before anything is printed.
    splits = (
        ("TRAIN", y_train, model.predict(X_train)),
        ("TEST", y_test, model.predict(X_test)),
    )

    separator = "="*50
    print(separator)
    for split_name, actual, predicted in splits:
        print(confusion_matrix(actual, predicted))
        print(classification_report(actual, predicted))
        print(f"Accuracy of {split_name} data:", 100*accuracy_score(actual, predicted))
        print(f"F1_Score of {split_name} data:", 100*f1_score(actual, predicted, average="macro"))
        print(separator)

    # Plot ROC_AUC Curve on the held-out split.
    # NOTE(review): plot_roc_curve was removed in scikit-learn 1.2;
    # RocCurveDisplay.from_estimator is its replacement — confirm the
    # scikit-learn version this notebook is pinned to.
    plot_roc_curve(model, X_test, y_test)
    plt.title('ROC_AUC Curve', fontsize=16)
    plt.show()

**Prediction**

In [None]:
# prediction
evaluation(logreg)

## Random Forest

**Using Hyper-parameter Tuning**

In [None]:
params = {
    'max_depth': [10, 16, 20, 30],
    'min_samples_leaf': [10, 30, 50, 80, 100],
    'max_features': [14, 15, 16],
    'n_estimators': [30, 50, 100, 200]}

rfm_basic = RandomForestClassifier(random_state=42, oob_score=True)

grid_search = GridSearchCV(estimator=rfm_basic, param_grid=params,
                          cv=5, n_jobs=-1, verbose=1, scoring="accuracy")

In [None]:
%%time
grid_search.fit(X_train, y_train)

In [None]:
rfm_best = grid_search.best_estimator_
rfm_best

In [None]:
# prediction and model evaluation
evaluation(rfm_best)

## XGBoost Model

**Using Hyper-Parameter Tuning**

In [None]:
# # hyperparameters
# xgb_model = XGBClassifier(random_state=42, n_jobs=-1)

# parameters = {'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
#               'max_depth': [8, 10, 16, 20, 30],
#               'min_child_weight': [10, 30, 50, 80, 100],
#               'n_estimators': [50, 100, 150, 200, 300]}

# # scorer = metrics.make_scorer(metrics.accuracy_score,
# #                              greater_is_better=True,
# #                              needs_proba=True,
# #                              needs_threshold=False)
# clf_xgb = GridSearchCV(estimator=xgb_model,
#                                        param_grid=parameters,
#                                        n_jobs=-1,
#                                        cv=3,
#                                        scoring='accuracy',
#                                        refit=True)

# clf_xgb.fit(X_train, y_train)

In [None]:
# print(clf_xgb.best_params_)
# print(clf_xgb.best_score_)
# print(clf_xgb.best_estimator_)

In [None]:
# #prediction and evaluation
# xgb_best_model = clf_xgb.best_estimator_
# evaluation(xgb_best_model)

## Using ANN

In [None]:
# ANN Architecture: three ReLU hidden layers (17, 15 and 10 units) followed by
# a single sigmoid unit that outputs the probability of the positive class.
# The input width is read from the training features instead of being
# hard-coded to 17, so the network still builds if the feature set changes.
n_features = X_train.shape[1]

model = keras.Sequential([
    keras.layers.Dense(17, input_shape=(n_features,), activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [None]:
epochs=20
history = model.fit(X_train, y_train,epochs=epochs,
         validation_data=(X_test, y_test))

In [None]:
# Per-epoch training and validation curves recorded by model.fit.
training_log = history.history
acc, val_acc = training_log['accuracy'], training_log['val_accuracy']
loss, val_loss = training_log['loss'], training_log['val_loss']

epochs_range = range(epochs)

# One panel per metric: (training curve, validation curve, name, legend corner)
panels = (
    (acc, val_acc, 'Accuracy', 'lower right'),
    (loss, val_loss, 'Loss', 'upper right'),
)

plt.figure(figsize=(12, 7))
for position, (train_curve, val_curve, metric_name, legend_loc) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label=f'Training {metric_name}')
    plt.plot(epochs_range, val_curve, label=f'Validation {metric_name}')
    plt.legend(loc=legend_loc)
    plt.title(f'Training and Validation {metric_name}')
plt.show()

In [None]:
model.evaluate(X_test, y_test)

In [None]:
y_pred_test = model.predict(X_test)
y_pred_test[:5]

In [None]:
# converting 2D array to 1D array
y_pred_test = np.reshape(y_pred_test, len(y_pred_test))
y_pred_test[:10]

In [None]:
# create a dataframe for the probabilities
new_evaluation_df = pd.DataFrame({'CustIDs': y_test.index, 'Defaulters': y_test.values, 'Default_Prob':y_pred_test})
new_evaluation_df.head()

In [None]:
# roc_auc curve
def draw_roc( actual, probs ):
    """Plot and show the ROC curve of ``probs`` against the labels ``actual``.

    The legend carries the area under the curve, and a dashed diagonal marks
    the chance line. Always returns None.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area_under_curve = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos_rate, true_pos_rate,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return None

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( new_evaluation_df.Defaulters, new_evaluation_df.Default_Prob, drop_intermediate = False )

In [None]:
draw_roc(new_evaluation_df.Defaulters, new_evaluation_df.Default_Prob)

In [None]:
# Create one 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9.
# Each column is keyed by the cutoff value itself (a float) and holds 1 where
# Default_Prob is strictly greater than the cutoff, else 0.
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    new_evaluation_df[i]= new_evaluation_df.Default_Prob.map(lambda x: 1 if x > i else 0)
new_evaluation_df.head()

In [None]:
# Accuracy, sensitivity and specificity at each coarse probability cutoff.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])

# Confusion-matrix layout (rows = actual, columns = predicted):
#   [0,0] true negatives    [0,1] false positives
#   [1,0] false negatives   [1,1] true positives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for cutoff in num:
    cm1 = metrics.confusion_matrix(new_evaluation_df.Defaulters, new_evaluation_df[cutoff])
    (tn, fp), (fn, tp) = cm1
    accuracy = (tn + tp) / cm1.sum()

    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_df.loc[cutoff] = [cutoff, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

Let's add finer probability cutoffs between 0.6 and 0.7 (in steps of 0.01)

In [None]:
# Create one 0/1 prediction column per cutoff 0.61, 0.62, ..., 0.69, keyed by
# the float cutoff; 1 where Default_Prob is strictly greater than the cutoff.
# The 0.6 and 0.7 endpoints are not created here — presumably they already
# exist from the coarser 0.0-0.9 sweep run earlier; verify before re-ordering
# cells.
numbers = [float(x)/100 for x in range(61,70)]
for i in numbers:
    new_evaluation_df[i]= new_evaluation_df.Default_Prob.map(lambda x: 1 if x > i else 0)
new_evaluation_df.head()

In [None]:
# Accuracy, sensitivity and specificity at each fine-grained probability cutoff.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])

# Confusion-matrix layout (rows = actual, columns = predicted):
#   [0,0] true negatives    [0,1] false positives
#   [1,0] false negatives   [1,1] true positives

num = [0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70]
for cutoff in num:
    cm1 = metrics.confusion_matrix(new_evaluation_df.Defaulters, new_evaluation_df[cutoff])
    (tn, fp), (fn, tp) = cm1
    accuracy = (tn + tp) / cm1.sum()

    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_df.loc[cutoff] = [cutoff, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

From the above graph the cutoff point could be around **prob=0.675**

In [None]:
# setting cutoff point at 0.675
new_evaluation_df['0.675']= new_evaluation_df.Default_Prob.map(lambda x: 1 if x > 0.675 else 0)
new_evaluation_df.head()

## Predict on Actual TEST Data

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
test_data.isna().mean()*100

In [None]:
test_data.isna().sum().sort_values(ascending=False).head(10)

impute <i>no_of_children</i> with mode of the data

impute <i>owns_car</i> using mode of the data

impute <i>no_of_days_employed</i> using median of the data

In [None]:
# Impute the test data. Each filled column is assigned back rather than
# calling fillna(inplace=True) on the column selection: the in-place form
# works on an intermediate object and is not guaranteed to update test_data
# (it does not under pandas Copy-on-Write).
# NOTE(review): the fill values are computed from the test data itself, not
# from the training data — confirm this is intended.

# fill no_of_children and owns_car with the mode of the column
for column in ('no_of_children', 'owns_car'):
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

# fill no_of_days_employed with the median of the column
test_data['no_of_days_employed'] = test_data['no_of_days_employed'].fillna(
    test_data['no_of_days_employed'].median()
)

impute <i>total_family_members</i> with mode, <i>migrant_worker</i> with mode, <i>yearly_debt_payments</i> with median and <i>credit_score</i> with median

In [None]:
# Fill the remaining gaps in the test data: mode for the discrete columns,
# median for the continuous ones. Each filled column is assigned back rather
# than calling fillna(inplace=True) on the column selection: the in-place form
# works on an intermediate object and is not guaranteed to update test_data
# (it does not under pandas Copy-on-Write).
for column in ('total_family_members', 'migrant_worker'):
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

for column in ('yearly_debt_payments', 'credit_score'):
    test_data[column] = test_data[column].fillna(test_data[column].median())

In [None]:
# check for null values
test_data.isna().sum()

In [None]:
test_data.head()

In [None]:
# Print the value frequencies of every low-cardinality test column (at most
# 20 distinct non-null values), i.e. the columns that look categorical.
cols = test_data.columns

for col_name in cols:
    frequencies = test_data[col_name].value_counts()
    if len(frequencies) > 20:
        continue
    print("Column Name:", col_name)
    print(frequencies)
    print("*"*50)

In [None]:
test_data[test_data['gender'] == 'XNA']

In [None]:
# Recode the placeholder gender 'XNA' as 'F' (stated by the original author to
# be the mode of the feature), then show the new counts.
# NOTE(review): the training-set section of this notebook recodes 'XNA' to 'M'
# instead — confirm the two treatments are meant to differ.
test_data['gender'] = test_data['gender'].replace('XNA', 'F')
test_data['gender'].value_counts()

In [None]:
# keep the customer_id column in a new dataframe before dropping it from the original dataframe
final_df = pd.DataFrame(test_data.customer_id)

In [None]:
test_data.drop(['customer_id', 'name'], axis=1, inplace=True)
test_data.head()

In [None]:
# binning of net_yearly_income into categories
def income_binning(x):
    """Return the income-category label for a net yearly income ``x``.

    Upper bounds are inclusive: up to 125824.505 is 'Low', up to 170968.03 is
    'Below Avg', up to 237768.92 is 'Above Avg'. Everything else is 'High',
    including NaN, which fails every comparison.
    This repeats the definition used for the training data, so both data sets
    are binned with the same bounds.
    """
    upper_bounds = (
        (125824.5050, 'Low'),
        (170968.0300, 'Below Avg'),
        (237768.9200, 'Above Avg'),
    )
    for bound, label in upper_bounds:
        if x <= bound:
            return label
    return 'High'


In [None]:
test_data['income_categories'] = test_data.net_yearly_income.apply(income_binning)

In [None]:
test_data.head()

### Data Preparation

#### Label Encoding

In [None]:
# get categorical columns (object dtype) of the test data
cat_cols_test = test_data.select_dtypes(include=['object']).columns

# Replace each categorical column with integer codes.
# NOTE(review): the encoder is re-fitted on the test values here, so the
# integer given to a category matches the training encoding only if the
# column holds the same set of categories in both data sets — confirm, or
# reuse encoders fitted on the training data.
for i in cat_cols_test:
    test_data[i] = lm.fit_transform(test_data[i])

test_data.head()

In [None]:
test_data.head()

In [None]:
cols_test = test_data.columns
cols_test

#### Feature Scaling

In [None]:
cols_test = test_data.columns

test_data[cols_test] = scaler.transform(test_data[cols_test])
test_data.head()

#### Logistic Regression Prediction

In [None]:
predicted_target = logreg.predict(test_data)
predicted_target[:10]

In [None]:
final_df['credit_card_default'] = predicted_target

In [None]:
final_df.head()

In [None]:
# # save it to file and submit
# final_df.to_csv('LogRegg_submission.csv', index=False)
# print('Output Saved to CSV File!!')

#### Random Forest Prediction

In [None]:
random_final_df = pd.DataFrame(final_df['customer_id'])

random_predicted_target = rfm_best.predict(test_data)
random_predicted_target[:10]

In [None]:
random_final_df['credit_card_default'] = random_predicted_target

In [None]:
random_final_df.head()

In [None]:
# # save it to file and submit
# random_final_df.to_csv('RandomForest_submission.csv', index=False)
# print('Output Saved to CSV File!!')

## Here we found that the outputs generated by LogReg and RandomForest are the same.

#### XGBoost Prediction

In [None]:
# xgboost_final_df = pd.DataFrame(final_df['customer_id'])

# xgboost_predicted_target = xgb_best_model.predict(test_data)
# xgboost_predicted_target[:10]

#### Artificial Neural Networks Prediction

In [None]:
ann_final_df = pd.DataFrame(final_df['customer_id'])

ann_predicted_probs = model.predict(test_data)

# converting 2D array to 1D array
ann_predicted_probs = np.reshape(ann_predicted_probs, len(ann_predicted_probs))
ann_predicted_probs[:10]

In [None]:
ann_final_df['ann_probs'] = ann_predicted_probs

# setting optimal cutoff of 0.675
ann_final_df['credit_card_default']= ann_final_df.ann_probs.map(lambda x: 1 if x > 0.675 else 0)

# drop 'ann_probs' column
ann_final_df.drop('ann_probs', axis=1, inplace=True)
ann_final_df.head()

In [None]:
# # save it to file and submit
# ann_final_df.to_csv('ANeuralNets_submission.csv', index=False)
# print('Output Saved to CSV File!!')