In [116]:
pwd

'C:\\Users\\anhyn'

In [39]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [40]:
# Import dataset
data_df=pd.read_csv('ATB_EC_MM_sample_regression.csv')

In [41]:
# Preview data
data_df.head()

Unnamed: 0,PartyCode,CreditStatusCode,AccountStatus,HardshipCount,DNPCount,DelinqAlert,BadDebtor
0,100799312,2,1,0,7,0,0
1,101219354,2,1,1,4,1,1
2,100758128,15,1,3,0,1,1
3,100921719,14,1,3,2,1,1
4,100017556,7,1,3,2,0,0


In [42]:
#dropping customer ID column
data_df=data_df.drop('PartyCode',axis=1)
data_df.shape

(2260, 6)

In [43]:
#1-2 explore missing values
data_df.isna().sum()

CreditStatusCode    0
AccountStatus       0
HardshipCount       0
DNPCount            0
DelinqAlert         0
BadDebtor           0
dtype: int64

In [44]:
# filling missing values with mean
#data_df=data_df.fillna(data_df.mean())

In [45]:
# explore missing values post missing value fix
#data_df.isna().sum()

In [51]:
# Train Test Split - define x and y
y = data_df.pop('BadDebtor').values
x = data_df.values

In [52]:
# Splitting dataset into training and test (80:20)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [53]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [54]:
#Risk Model Building
classifier = LogisticRegression(solver='newton-cg')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [55]:
#Model performance
print(confusion_matrix(y_test,y_pred))

[[305   6]
 [  3 138]]


In [56]:
print(accuracy_score(y_test, y_pred))

0.9800884955752213


In [57]:
#Writing output file
predictions = classifier.predict_proba(x_test)
predictions

array([[9.64523760e-01, 3.54762399e-02],
       [9.99934924e-01, 6.50764959e-05],
       [9.98909799e-01, 1.09020127e-03],
       [9.64443182e-01, 3.55568178e-02],
       [9.98904663e-01, 1.09533666e-03],
       [9.55749020e-01, 4.42509796e-02],
       [9.99884950e-01, 1.15049964e-04],
       [9.98424436e-01, 1.57556381e-03],
       [9.98904663e-01, 1.09533666e-03],
       [9.64523760e-01, 3.54762399e-02],
       [9.98917457e-01, 1.08254325e-03],
       [9.98912357e-01, 1.08764260e-03],
       [4.17851960e-04, 9.99582148e-01],
       [9.98095754e-01, 1.90424552e-03],
       [9.70353486e-01, 2.96465143e-02],
       [7.30244477e-02, 9.26975552e-01],
       [9.99652609e-01, 3.47390773e-04],
       [9.99758015e-01, 2.41984997e-04],
       [3.90314755e-01, 6.09685245e-01],
       [9.98456701e-01, 1.54329867e-03],
       [2.65379203e-01, 7.34620797e-01],
       [9.64604162e-01, 3.53958380e-02],
       [1.29032041e-03, 9.98709680e-01],
       [9.98693978e-01, 1.30602249e-03],
       [7.334354

In [63]:
#Writing model output
# Get the predicted probabilities for the positive class (class 1)
predictions_prob = classifier.predict_proba(x_test)[:,1]

# Create a data frame with the predicted probabilities
df_prediction_prob = pd.DataFrame(predictions_prob, columns = ['prob_1'])

# Create a data frame with the predicted class labels
df_prediction_target = pd.DataFrame(classifier.predict(x_test), columns = ['predicted_TARGET'])

# Create a data frame with the actual class labels
df_test_data_df = pd.DataFrame(y_test,columns= ['Actual Outcome'])

# Concatenate the data frames horizontally (axis=1)
dfx = pd.concat([df_test_data_df, df_prediction_prob, df_prediction_target], axis=1)

# Display the first 5 rows of the resulting data frame
dfx.head(20)


Unnamed: 0,Actual Outcome,prob_1,predicted_TARGET
0,0,0.035476,0
1,0,6.5e-05,0
2,0,0.00109,0
3,0,0.035557,0
4,0,0.001095,0
5,0,0.044251,0
6,0,0.000115,0
7,0,0.001576,0
8,0,0.001095,0
9,0,0.035476,0
