Importing libraries and functions

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
from sklearn.linear_model import LogisticRegression

Importing Dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
dataset = pd.read_excel('/content/a_Dataset_CreditScoring.xlsx')

Data preparation

In [8]:
# shows count of rows and columns
dataset.shape

(3000, 30)

In [9]:
# shows first rows in code
dataset.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [10]:
# dropping custom ID column from the dataset
dataset = dataset.drop('ID' , axis=1)
dataset.shape

(3000, 29)

In [11]:
# explore missing values
dataset.isna().sum()

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [12]:
# filling missing values with means
dataset =dataset.fillna(dataset.mean())

In [13]:
# explore missing values post missing values fix
dataset.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [17]:
# # count of good loans(0) and bad loans(1)
# dataset['TARGET'].value_counts()

0    2500
1     500
Name: TARGET, dtype: int64

In [20]:
# # data summary across 0 & 1
# dataset.groupby('TARGET').mean()

Unnamed: 0_level_0,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.3224,0.7664,0.1492,2.9424,3.174638,3.2896,173.002,11.65,0.2844,1.832,...,3.986711,0.628177,0.544963,0.6044,0.6624,2.2236,0.502376,1.18,0.8648,0.556867
1,1.968,1.31,0.174,3.938,2.775459,4.882,155.672,12.992,0.228,1.768,...,4.53387,0.748185,0.385173,1.334,1.576,4.014,0.465127,2.554,2.086,0.600978


Train Test split

In [21]:
y = dataset.iloc[: , 0].values
x = dataset.iloc[: , 1:28].values

In [22]:
# splitting dataset into training and test in ratio (80 : 20)
x_train , x_test , y_train , y_test = train_test_split( x, y, test_size = 0.2 , random_state=0)

In [24]:
sc =StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

Risk Model Building

In [29]:
classifier = LogisticRegression()
classifier.fit(x_train, y_train) # get w , b hyporic parameters
y_predict = classifier.predict(x_test)

Model Performance

In [30]:
print(confusion_matrix( y_test ,y_predict))  #performance if is good loans or bad loans

[[482  16]
 [ 87  15]]


In [31]:
print(accuracy_score(y_test, y_predict))

0.8283333333333334


Writng output file

In [32]:
predictions = classifier.predict_proba(x_test)
predictions

array([[0.04230096, 0.95769904],
       [0.93683282, 0.06316718],
       [0.70459693, 0.29540307],
       ...,
       [0.9724934 , 0.0275066 ],
       [0.44332485, 0.55667515],
       [0.86810657, 0.13189343]])

In [36]:
# writing model output file

df_prediction_prob = pd.DataFrame(predictions ,columns = ['prob_0','prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(x_test), columns = ['predicted_TARGET'])
df_test_dataset = pd.DataFrame(y_test , columns = ['Actual Outcome'])

dfx = pd.concat([df_test_dataset ,df_prediction_prob ,df_prediction_target] ,axis=1)
dfx.to_csv("/content/c1_Model_Prediction.xlsx", sep=',', encoding='UTF-8')
dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.042301,0.957699,1
1,0,0.936833,0.063167,0
2,0,0.704597,0.295403,0
3,0,0.908269,0.091731,0
4,0,0.866217,0.133783,0
