In [104]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import plotly.offline as py 
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [105]:
df_credit = pd.read_csv("dataset/credit_data.csv")

In [106]:
df_credit

Unnamed: 0,Customer_ID,Marital_Status,SSN,Occupation,Num_Bank_Accounts,Num_Credit_Cards,Credit_Utilization_Ratio,Outstanding_Debt,Monthly_Balance,Credit_Score,...,Credit_Inquiries,Total_EMI,Min_Amount_Paid,Payment_Behaviour,Collateral_Type,Collateral_Value,Interest_Rate,Due_Date_Delay,Delayed_Payments,Credit_Limit_Change
0,1,Married,821-00-0265,Scientist,3,4,22.537593,809.98,244.565317,Good,...,4.0,49.574949,No,Low_spent_Small_value_payments,Vehicle,118726,3,3,8_,11.27
1,2,Separated,031-35-0942,Architect,10,8,24.713861,3571.7,345.743101,Poor,...,9.0,60.964772,Yes,Low_spent_Small_value_payments,Stocks,146222,29,33,25,18.31
2,3,Married,004-07-5839,Teacher,2,4,32.933856,605.03,356.078109,Good,...,2.0,18.816215,No,Low_spent_Small_value_payments,Stocks,191951,6,3,4,5.42
3,4,Single,419-82-4747,Lawyer,6,8,28.265031,1360.45,273.319589,Poor,...,9.0,100.069628,Yes,Low_spent_Small_value_payments,Stocks,169782,27,59,15,8.76
4,5,Single,411-59-6345,Musician,3,3,32.445302,728.23,280.293057,Good,...,5.0,91.925669,No,Low_spent_Small_value_payments,No Collateral,164196,7,14,9,10.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Divorced,328-52-8554,Entrepreneur,8,5,26.484560,699.69,146.645743,Standard,...,5.0,93.506721,Yes,Low_spent_Small_value_payments,No Collateral,0,6,29,13,7.72
96,97,Single,518-01-2262,Scientist,9,9,38.337449,3589.56,423.981577,Standard,...,15.0,171.345560,NM,High_spent_Small_value_payments,Vehicle,174066,18,23,15,17.24
97,98,Married,465-93-9571,_______,5,6,35.742540,1194.28,214.903420,Good,...,2.0,56.311421,No,Low_spent_Small_value_payments,Vehicle,54303,12,11,2,4.18
98,99,Divorced,860-36-1556,Writer,3,4,37.536348,404.51,479.824034,Standard,...,2.0,414.656013,Yes,High_spent_Small_value_payments,Property,87196,15,3,12,10.1


In [107]:
df_credit.columns

Index(['Customer_ID', 'Marital_Status', 'SSN', 'Occupation',
       'Num_Bank_Accounts', 'Num_Credit_Cards', 'Credit_Utilization_Ratio',
       'Outstanding_Debt', 'Monthly_Balance', 'Credit_Score', 'Credit_Mix',
       'Num_Loans', 'Loan_Types', 'Credit_Inquiries', 'Total_EMI',
       'Min_Amount_Paid', 'Payment_Behaviour', 'Collateral_Type',
       'Collateral_Value', 'Interest_Rate', 'Due_Date_Delay',
       'Delayed_Payments', 'Credit_Limit_Change'],
      dtype='object')

In [108]:
df_credit.drop(["Customer_ID", "SSN", "Due_Date_Delay", "Marital_Status", "Occupation", "Num_Bank_Accounts", "Monthly_Balance", "Collateral_Value", 
                "Interest_Rate", "Due_Date_Delay", "Credit_Limit_Change", "Loan_Types"], axis=1, inplace=True)

In [109]:
df_credit.columns

Index(['Num_Credit_Cards', 'Credit_Utilization_Ratio', 'Outstanding_Debt',
       'Credit_Score', 'Credit_Mix', 'Num_Loans', 'Credit_Inquiries',
       'Total_EMI', 'Min_Amount_Paid', 'Payment_Behaviour', 'Collateral_Type',
       'Delayed_Payments'],
      dtype='object')

In [110]:
# Encoding Credit_Score
credit_score_mapping = {
    'Poor': 0,
    'Standard': 1,
    'Good': 2
}
df_credit['Credit_Score'] = df_credit['Credit_Score'].map(credit_score_mapping)

# Encoding Credit_Mix
credit_mix_mapping = {
    'Poor': 0,
    'Standard': 1,
    'Good': 2
}
df_credit['Credit_Mix'] = df_credit['Credit_Mix'].map(credit_mix_mapping)

# Encoding Payment_Behaviour
payment_behaviour_mapping = {
    'Low_spent_Small_value_payments': 1,
    'Low_spent_Medium_value_payments': 2,
    'Low_spent_Large_value_payments': 3,
    'High_spent_Small_value_payments': 4,
    'High_spent_Medium_value_payments': 5,
    'High_spent_Large_value_payments': 6,
    '!@9#%8': 0  # Unknown/Invalid
}
df_credit['Payment_Behaviour'] = df_credit['Payment_Behaviour'].map(payment_behaviour_mapping)

# Encoding Collateral_Type
collateral_type_mapping = {
    'No Collateral': 0,
    'Vehicle': 1,
    'Stocks': 2,
    'Bonds': 3,
    'Property': 4
}
df_credit['Collateral_Type'] = df_credit['Collateral_Type'].map(collateral_type_mapping)

min_amount_paid_mapping = {
	'Yes': 1,
	'No': 0
}

df_credit['Min_Amount_Paid'] = df_credit['Min_Amount_Paid'].map(min_amount_paid_mapping)

# Display the first few rows to verify encoding
df_credit.head()

Unnamed: 0,Num_Credit_Cards,Credit_Utilization_Ratio,Outstanding_Debt,Credit_Score,Credit_Mix,Num_Loans,Credit_Inquiries,Total_EMI,Min_Amount_Paid,Payment_Behaviour,Collateral_Type,Delayed_Payments
0,4,22.537593,809.98,2,2.0,4,4.0,49.574949,0.0,1,1,8_
1,8,24.713861,3571.7,0,,5,9.0,60.964772,1.0,1,2,25
2,4,32.933856,605.03,2,2.0,1,2.0,18.816215,0.0,1,2,4
3,8,28.265031,1360.45,0,,3,9.0,100.069628,1.0,1,2,15
4,3,32.445302,728.23,2,2.0,4,5.0,91.925669,0.0,1,0,9


In [111]:
df_credit.fillna(1, inplace=True)

In [112]:
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Num_Credit_Cards          100 non-null    int64  
 1   Credit_Utilization_Ratio  100 non-null    float64
 2   Outstanding_Debt          100 non-null    object 
 3   Credit_Score              100 non-null    int64  
 4   Credit_Mix                100 non-null    float64
 5   Num_Loans                 100 non-null    object 
 6   Credit_Inquiries          100 non-null    float64
 7   Total_EMI                 100 non-null    float64
 8   Min_Amount_Paid           100 non-null    float64
 9   Payment_Behaviour         100 non-null    int64  
 10  Collateral_Type           100 non-null    int64  
 11  Delayed_Payments          100 non-null    object 
dtypes: float64(5), int64(4), object(3)
memory usage: 9.5+ KB


In [113]:
df_credit.to_csv("dataset/df_credit_encoded.csv")

In [None]:
# Get counts for each credit score category
good_counts = df_credit['Credit_Score'].value_counts()['Good']
standard_counts = df_credit['Credit_Score'].value_counts()['Standard']
poor_counts = df_credit['Credit_Score'].value_counts()['Poor']

# Create the traces for the bar chart
trace0 = go.Bar(x=['Good'], y=[good_counts], name='Good Credit Score')
trace1 = go.Bar(x=['Standard'], y=[standard_counts], name='Standard Credit Score')
trace2 = go.Bar(x=['Poor'], y=[poor_counts], name='Poor Credit Score')

# Create the layout for the plot
layout = go.Layout(
    title='Credit Score Distribution',
    xaxis=dict(title='Credit Score Category'),
    yaxis=dict(title='Count')
)

# Add traces to data list
data = [trace0, trace1, trace2]

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Plot the figure
py.iplot(fig, filename='credit-score-distribution')

In [None]:
df_good = df_credit.loc[df_credit["Risk"] == 'good']['Age'].values.tolist()
df_bad = df_credit.loc[df_credit["Risk"] == 'bad']['Age'].values.tolist()
df_age = df_credit['Age'].values.tolist()

trace0 = go.Histogram(
    x=df_good,
    histnorm='probability',
    name="Good Credit"
)
trace1 = go.Histogram(
    x=df_bad,
    histnorm='probability',
    name="Bad Credit"
)


fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                          subplot_titles=('Good','Bad', 'General Distribuition'))

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)


fig['layout'].update(showlegend=True, title='Age Distribuition', bargap=0.05)
py.iplot(fig, filename='custom-sized-subplot-with-subplot-titles')

In [8]:
interval = (18, 25, 35, 60, 120)

cats = ['Student', 'Young', 'Adult', 'Senior']
df_credit["Age_cat"] = pd.cut(df_credit.Age, interval, labels=cats)


df_good = df_credit[df_credit["Risk"] == 'good']
df_bad = df_credit[df_credit["Risk"] == 'bad']

In [None]:
trace0 = go.Box(
    y=df_good["Credit amount"],
    x=df_good["Age_cat"],
    name='Good credit',
    marker=dict(
        color='#3D9970'
    )
)
trace1 = go.Box(
    y=df_bad['Credit amount'],
    x=df_bad['Age_cat'],
    name='Bad credit',
    marker=dict(
        color='#FF4136'
    )
)
data = [trace0, trace1]

layout = go.Layout(
    yaxis=dict(
        title='Credit Amount (US Dollar)',
        zeroline=False
    ),
    xaxis=dict(
        title='Age Categorical'
    ),
    boxmode='group'
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')

In [None]:
trace0 = go.Bar(
    x = df_credit[df_credit["Risk"]== 'good']["Housing"].value_counts().index.values,
    y = df_credit[df_credit["Risk"]== 'good']["Housing"].value_counts().values,
    name='Good credit'
)

trace1 = go.Bar(
    x = df_credit[df_credit["Risk"]== 'bad']["Housing"].value_counts().index.values,
    y = df_credit[df_credit["Risk"]== 'bad']["Housing"].value_counts().values,
    name="Bad Credit"
)

data = [trace0, trace1]

layout = go.Layout(
    title='Housing Distribuition'
)


fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='Housing-Grouped')

In [None]:
fig = {
    "data": [
        {
            "type": 'violin',
            "x": df_good['Housing'],
            "y": df_good['Credit amount'],
            "legendgroup": 'Good Credit',
            "scalegroup": 'No',
            "name": 'Good Credit',
            "side": 'negative',
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            },
            "line": {
                "color": 'blue'
            }
        },
        {
            "type": 'violin',
            "x": df_bad['Housing'],
            "y": df_bad['Credit amount'],
            "legendgroup": 'Bad Credit',
            "scalegroup": 'No',
            "name": 'Bad Credit',
            "side": 'positive',
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            },
            "line": {
                "color": 'green'
            }
        }
    ],
    "layout" : {
        "yaxis": {
            "zeroline": False,
        },
        "violingap": 0,
        "violinmode": "overlay"
    }
}


py.iplot(fig, filename = 'violin/split', validate = False)

In [None]:
trace0 = go.Bar(
    x = df_credit[df_credit["Risk"]== 'good']["Sex"].value_counts().index.values,
    y = df_credit[df_credit["Risk"]== 'good']["Sex"].value_counts().values,
    name='Good credit'
)

trace1 = go.Bar(
    x = df_credit[df_credit["Risk"]== 'bad']["Sex"].value_counts().index.values,
    y = df_credit[df_credit["Risk"]== 'bad']["Sex"].value_counts().values,
    name="Bad Credit"
)

trace2 = go.Box(
    x = df_credit[df_credit["Risk"]== 'good']["Sex"],
    y = df_credit[df_credit["Risk"]== 'good']["Credit amount"],
    name=trace0.name
)

trace3 = go.Box(
    x = df_credit[df_credit["Risk"]== 'bad']["Sex"],
    y = df_credit[df_credit["Risk"]== 'bad']["Credit amount"],
    name=trace1.name
)

data = [trace0, trace1, trace2,trace3]


fig = tls.make_subplots(rows=1, cols=2, 
                        subplot_titles=('Sex Count', 'Credit Amount by Sex'))

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig.append_trace(trace3, 1, 2)

fig['layout'].update(height=400, width=800, title='Sex Distribuition', boxmode='group')
py.iplot(fig, filename='sex-subplot')

In [None]:
trace0 = go.Bar(
    x = df_credit[df_credit["Risk"]== 'good']["Job"].value_counts().index.values,
    y = df_credit[df_credit["Risk"]== 'good']["Job"].value_counts().values,
    name='Good credit Distribuition'
)
trace1 = go.Bar(
    x = df_credit[df_credit["Risk"]== 'bad']["Job"].value_counts().index.values,
    y = df_credit[df_credit["Risk"]== 'bad']["Job"].value_counts().values,
    name="Bad Credit Distribuition"
)

data = [trace0, trace1]

layout = go.Layout(
    title='Job Distribuition'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='grouped-bar')

In [None]:
trace0 = go.Box(
    x=df_good["Job"],
    y=df_good["Credit amount"],
    name='Good credit'
)

trace1 = go.Box(
    x=df_bad['Job'],
    y=df_bad['Credit amount'],
    name='Bad credit'
)
    
data = [trace0, trace1]

layout = go.Layout(
    yaxis=dict(
        title='Credit Amount distribuition by Job'
    ),
    boxmode='group'
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')

In [None]:
fig = {
    "data": [
        {
            "type": 'violin',
            "x": df_good['Job'],
            "y": df_good['Age'],
            "legendgroup": 'Good Credit',
            "scalegroup": 'No',
            "name": 'Good Credit',
            "side": 'negative',
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            },
            "line": {
                "color": 'blue'
            }
        },
        {
            "type": 'violin',
            "x": df_bad['Job'],
            "y": df_bad['Age'],
            "legendgroup": 'Bad Credit',
            "scalegroup": 'No',
            "name": 'Bad Credit',
            "side": 'positive',
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            },
            "line": {
                "color": 'green'
            }
        }
    ],
    "layout" : {
        "yaxis": {
            "zeroline": False,
        },
        "violingap": 0,
        "violinmode": "overlay"
    }
}


py.iplot(fig, filename = 'Age-Housing', validate = False)

In [None]:
import plotly.figure_factory as ff

import numpy as np

# Add histogram data
x1 = np.log(df_good['Credit amount']) 
x2 = np.log(df_bad["Credit amount"])

# Group data together
hist_data = [x1, x2]

group_labels = ['Good Credit', 'Bad Credit']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')

In [None]:
print("Purpose : ",df_credit.Purpose.unique())
print("Sex : ",df_credit.Sex.unique())
print("Housing : ",df_credit.Housing.unique())
print("Saving accounts : ",df_credit['Saving accounts'].unique())
print("Risk : ",df_credit['Risk'].unique())
print("Checking account : ",df_credit['Checking account'].unique())
print("Aget_cat : ",df_credit['Age_cat'].unique())

In [19]:
def one_hot_encoder(df, nan_as_category = False):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category, drop_first=True)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [None]:
df_credit

In [23]:
df_credit['Saving accounts'] = df_credit['Saving accounts'].fillna('no_inf')
df_credit['Checking account'] = df_credit['Checking account'].fillna('no_inf')

#Purpose to Dummies Variable
df_credit = df_credit.merge(pd.get_dummies(df_credit.Purpose, drop_first=True, prefix='Purpose'), left_index=True, right_index=True)
#Sex feature in dummies
df_credit = df_credit.merge(pd.get_dummies(df_credit.Sex, drop_first=True, prefix='Sex'), left_index=True, right_index=True)
# Housing get dummies
df_credit = df_credit.merge(pd.get_dummies(df_credit.Housing, drop_first=True, prefix='Housing'), left_index=True, right_index=True)
# Housing get Saving Accounts
df_credit = df_credit.merge(pd.get_dummies(df_credit["Saving accounts"], drop_first=True, prefix='Savings'), left_index=True, right_index=True)
# Housing get Risk
df_credit = df_credit.merge(pd.get_dummies(df_credit.Risk, prefix='Risk'), left_index=True, right_index=True)
# Housing get Checking Account
df_credit = df_credit.merge(pd.get_dummies(df_credit["Checking account"], drop_first=True, prefix='Check'), left_index=True, right_index=True)
# Housing get Age categorical
df_credit = df_credit.merge(pd.get_dummies(df_credit["Age_cat"], drop_first=True, prefix='Age_cat'), left_index=True, right_index=True)

In [None]:
df_credit

In [25]:
#Excluding the missing columns
del df_credit["Saving accounts"]
del df_credit["Checking account"]
del df_credit["Purpose"]
del df_credit["Sex"]
del df_credit["Housing"]
del df_credit["Age_cat"]
del df_credit["Risk"]
del df_credit['Risk_good']

In [None]:
plt.figure(figsize=(14,12))
sns.heatmap(df_credit.astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True,  linecolor='white', annot=True)
plt.show()

In [27]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score # to split the data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [None]:
df_credit

In [29]:
df_credit['Credit amount'] = np.log(df_credit['Credit amount'])

In [None]:
df_credit['Credit amount']

In [31]:
#Creating the X and y variables
X = df_credit.drop('Risk_bad',axis=1).values
y = df_credit["Risk_bad"].values

# Spliting X and y into train and test version
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [None]:


models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVM', SVC(gamma='auto')))
models.append(('XGB', XGBClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'recall'

for name, model in models:
        kfold = KFold(n_splits=10)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        
# boxplot algorithm comparison
fig = plt.figure(figsize=(11,6))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#Seting the Hyper Parameters
param_grid = {"max_depth": [3,5, 7, 10,None],
              "n_estimators":[3,5,10,25,50,150],
              "max_features": [4,7,15,20]}

#Creating the classifier
model = RandomForestClassifier(random_state=2)

grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring='recall', verbose=4)
grid_search.fit(X_train, y_train)

In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
rf = RandomForestClassifier(max_depth=None, max_features=10, n_estimators=15, random_state=2)

#trainning with the best params
rf.fit(X_train, y_train)

In [None]:
#Testing the model 
#Predicting using our  model
y_pred = rf.predict(X_test)

# Verificaar os resultados obtidos
print(accuracy_score(y_test,y_pred))
print("\n")
print(confusion_matrix(y_test, y_pred))
print("\n")
print(fbeta_score(y_test, y_pred, beta=2))

Model 2:

In [40]:
from sklearn.utils import resample
from sklearn.metrics import roc_curve

In [41]:
# Criando o classificador logreg
GNB = GaussianNB()

# Fitting with train data
model = GNB.fit(X_train, y_train)

In [None]:
# Printing the Training Score
print("Training score data: ")
print(model.score(X_train, y_train))


In [None]:
y_pred = model.predict(X_test)

print(accuracy_score(y_test,y_pred))
print("\n")
print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))