### Importing libraries & functions





In [12]:
import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Raise pandas' display limits so the wide frames in this notebook are printed
# without truncated rows or columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

### Importing dataset

In [2]:
# Read the credit-scoring workbook into a DataFrame; the path is relative to
# the notebook's working directory.
dataset = pd.read_excel(io="a_Dataset_CreditScoring.xlsx")

### Data preparation

In [3]:
# Dimensions of the frame, displayed as (number of rows, number of columns).
n_rows, n_columns = dataset.shape
(n_rows, n_columns)

(3000, 30)

In [13]:
# Preview the first five rows of the dataset.
dataset.head(n=5)

Unnamed: 0,TARGET,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,TLCnt24,TLCnt,TLSum,TLMaxSum,TLSatCnt,TLDel60Cnt,TLBadCnt24,TL75UtilCnt,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1.0,3.0,3.0,0.0,4.0,0.0,5.0,117.0,27.0,0.0,0.0,0.0,5.0,19410.0,21147.0,16.0,2.0,1.0,3.0,3.0,0.9179,0.2083,2.0,3.0,7.0,0.2083,4.0,4.0,0.0
1,1.0,15.0,9.0,0.0,3.0,1.0,3.0,14.0,14.0,0.0,0.0,1.0,1.0,16776.0,20971.0,1.0,0.0,0.0,1.0,1.0,0.8,0.0,0.0,0.0,0.0,1.0,12.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,5.0,1.0,354.0,7.0,0.0,2.0,10.0,19.0,16903.0,47593.0,21.0,1.0,1.0,4.0,5.0,0.3552,0.6538,0.0,1.0,1.0,0.7308,1.0,1.0,0.5263
3,1.0,8.0,5.0,0.0,6.0,1.0,10.0,16.0,4.0,0.0,2.0,4.0,3.0,18339.0,20093.0,1.0,1.0,0.0,2.0,3.0,0.9127,0.25,1.0,1.0,1.0,0.75,7.0,1.0,1.3333
4,1.0,3.0,1.0,0.0,9.0,0.0,8.0,130.0,52.0,0.0,0.0,0.0,1.0,2327.0,1860.0,3.0,4.0,1.0,1.0,1.0,1.2511,0.0,0.0,1.0,4.0,0.1429,3.0,1.0,0.0


In [5]:
# Drop the customer ID column so the identifier is not carried into modelling.
# errors='ignore' keeps this cell re-runnable: `dataset` is rebound here, so a
# second execution would otherwise raise KeyError because 'ID' is already gone.
dataset = dataset.drop(columns='ID', errors='ignore')
dataset.shape

(3000, 29)

In [7]:
# Number of missing entries in each column.
missing_per_column = dataset.isna().sum()
missing_per_column

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [8]:
# Replace every missing entry with the mean of its own column.
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/test split further down, so held-out rows contribute to the imputed
# values — confirm this is acceptable for the evaluation.
column_means = dataset.mean()
dataset = dataset.fillna(value=column_means)

In [9]:
# Re-count missing entries per column to confirm the imputation left none.
remaining_missing = dataset.isna().sum()
remaining_missing

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [10]:
# Class balance of the label: good loans (0) versus bad loans (1).
target_column = dataset['TARGET']
target_column.value_counts()

0.0    2500
1.0     500
Name: TARGET, dtype: int64

In [14]:
# Mean of every feature, computed separately for TARGET == 0 and TARGET == 1.
by_target = dataset.groupby(by='TARGET')
by_target.mean()

Unnamed: 0_level_0,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,TLCnt24,TLCnt,TLSum,TLMaxSum,TLSatCnt,TLDel60Cnt,TLBadCnt24,TL75UtilCnt,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
0.0,1.3224,0.7664,0.1492,2.9424,3.174638,3.2896,173.002,11.65,0.2844,1.832,3.8968,7.985104,20062.274824,31825.941529,13.822409,1.2884,0.47,3.015435,3.986711,0.628177,0.544963,0.6044,0.6624,2.2236,0.502376,1.18,0.8648,0.556867
1.0,1.968,1.31,0.174,3.938,2.775459,4.882,155.672,12.992,0.228,1.768,3.81,7.351759,20595.19953,28105.696411,11.958047,2.69,1.052,3.65292,4.53387,0.748185,0.385173,1.334,1.576,4.014,0.465127,2.554,2.086,0.600978


### Train Test Split

In [15]:
# Select the label and the features by column name rather than by position, so
# the result does not depend on the column order or on a hard-coded frame width.
y = dataset['TARGET'].values
# Every remaining column is a feature. 'ID' is excluded as well in case the
# identifier column is still present; errors='ignore' tolerates its absence.
X = dataset.drop(columns=['TARGET', 'ID'], errors='ignore').values

In [16]:
# Hold out 20% of the rows as the test set (80:20 split). The fixed
# random_state makes the split repeatable, and stratifying on y keeps the
# class proportions the same in both parts.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [17]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
# NOTE: X_train and X_test are overwritten with their scaled versions, so
# re-running this cell alone operates on already-scaled arrays.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [19]:
# Persist the fitted scaler so the same normalisation coefficients can be
# reloaded and applied at prediction time.
joblib.dump(sc, 'f2_Normalisation_CreditScoring')

['f2_Normalisation_CreditScoring']

### Risk Model building

In [20]:
# Train a logistic regression with the library's default settings on the
# scaled training data, then predict a class label for every test row.
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [21]:
# Persist the trained logistic regression classifier so it can be reloaded
# for later predictions.
joblib.dump(value=classifier, filename='f1_Classifier_CreditScoring')

['f1_Classifier_CreditScoring']

### Model performance

In [22]:
# Confusion matrix on the test set: rows are the actual classes, columns are
# the predicted classes.
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(test_confusion)

[[487  13]
 [ 87  13]]


In [23]:
# Share of test rows whose predicted label matches the actual label.
# NOTE(review): in the recorded run the test set holds 500 good and 100 bad
# loans, so always predicting "good" would also score 500/600 — read this
# figure together with the confusion matrix, not on its own.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

0.8333333333333334


### Writing output file

In [24]:
# Class probabilities for every test row, one column per class.
predictions = classifier.predict_proba(X=X_test)
predictions

array([[0.61644691, 0.38355309],
       [0.9885656 , 0.0114344 ],
       [0.87069686, 0.12930314],
       ...,
       [0.94450568, 0.05549432],
       [0.46756903, 0.53243097],
       [0.94014209, 0.05985791]])

In [27]:
# Assemble the model output: actual label, both class probabilities and the
# predicted label, one row per test observation, then write it to CSV.

# Each piece gets a default integer index, so the column-wise concat below
# lines the rows up by position.
df_test_dataset = pd.DataFrame(data=y_test, columns=['Actual Outcome'])
df_prediction_prob = pd.DataFrame(data=predictions, columns=['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame(data=classifier.predict(X_test),
                                    columns=['predicted_TARGET'])

output_parts = [df_test_dataset, df_prediction_prob, df_prediction_target]
dfx = pd.concat(output_parts, axis='columns')

# The row index is written as the first, unnamed CSV column.
dfx.to_csv(path_or_buf="c1_Model_Prediction.csv", sep=',', encoding='UTF-8')

dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1.0,0.616447,0.383553,0.0
1,0.0,0.988566,0.011434,0.0
2,1.0,0.870697,0.129303,0.0
3,0.0,0.953963,0.046037,0.0
4,1.0,0.726633,0.273367,0.0
