In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
import pickle

In [2]:
datatset = pd.read_csv('../data/merged_data.csv')

In [3]:
datatset.head()

Unnamed: 0,ID,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_HOUSING_TYPE,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,APPROVED
0,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,0,1
1,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-1,1
2,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-2,1
3,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-3,1
4,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-4,1


In [4]:
datatset = datatset.drop('ID', axis=1)

In [5]:
train_set, test_set = train_test_split(datatset, test_size=0.25, random_state=42)

In [6]:
train_set, val_set = train_test_split(train_set, test_size=0.2, random_state=42)

In [7]:
scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()
onehot_encoder = OneHotEncoder()

In [8]:
datatset['NAME_INCOME_TYPE'].unique()

array(['Working', 'Commercial associate', 'State servant', 'Student',
       'Pensioner'], dtype=object)

In [9]:
datatset['NAME_EDUCATION_TYPE'].unique()

array(['Secondary / secondary special', 'Higher education',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [10]:
datatset['NAME_HOUSING_TYPE'].unique()

array(['House / apartment', 'Rented apartment', 'Municipal apartment',
       'With parents', 'Co-op apartment', 'Office apartment'],
      dtype=object)

In [11]:
datatset['OCCUPATION_TYPE'].unique()

array(['Security staff', 'Sales staff', 'Accountants', 'Laborers',
       'Managers', 'Drivers', 'Core staff', 'High skill tech staff',
       'Cleaning staff', 'Private service staff', 'Cooking staff',
       'Low-skill Laborers', 'Medicine staff', 'Secretaries',
       'Waiters/barmen staff', 'HR staff', 'Realty agents', 'IT staff'],
      dtype=object)

In [12]:
train_set_encoded = pd.get_dummies(train_set, columns=['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], dtype=int)
val_set_encoded = pd.get_dummies(val_set, columns=['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], dtype=int)
test_set_encoded = pd.get_dummies(test_set, columns=['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], dtype=int)

In [13]:
train_set_encoded['FLAG_OWN_CAR'] = train_set_encoded['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
train_set_encoded['FLAG_OWN_REALTY'] = train_set_encoded['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

In [14]:
val_set_encoded['FLAG_OWN_CAR'] = val_set_encoded['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
val_set_encoded['FLAG_OWN_REALTY'] = val_set_encoded['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

In [15]:
test_set_encoded['FLAG_OWN_CAR'] = test_set_encoded['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
test_set_encoded['FLAG_OWN_REALTY'] = test_set_encoded['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

In [16]:
train_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']] = scaler.fit_transform(train_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']])

In [17]:
val_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']] = scaler.fit_transform(val_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']])

In [18]:
test_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']] = scaler.fit_transform(test_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']])

In [19]:
train_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']] = ordinal_encoder.fit_transform(train_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']])

In [20]:
val_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']] = ordinal_encoder.fit_transform(val_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']])

In [21]:
test_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']] = ordinal_encoder.fit_transform(test_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']])

In [22]:
rf_classifier = RandomForestClassifier(criterion="log_loss", n_estimators=100, max_depth=8)
rf_classifier.fit(train_set_encoded.drop(columns=['APPROVED'], axis=1), train_set_encoded[['APPROVED']])

  return fit_method(estimator, *args, **kwargs)


In [23]:
accuracy_score(val_set_encoded[['APPROVED']], rf_classifier.predict(val_set_encoded.drop(columns=['APPROVED'], axis=1)))

0.6287662740235586

In [24]:
filename = '../output_model/model.sav'
pickle.dump(rf_classifier, open(filename, 'wb'))

In [33]:
ann = Sequential()
ann.add(tf.keras.layers.Dense(units=128,activation="relu", input_dim = 45))
ann.add(tf.keras.layers.Dense(units=64,activation="relu"))
ann.add(tf.keras.layers.Dense(units=32,activation="relu"))
ann.add(tf.keras.layers.Dropout(rate=0.2))
ann.add(tf.keras.layers.Dense(units=16,activation="relu"))
ann.add(tf.keras.layers.Dropout(rate=0.2))
ann.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))
ann.compile(optimizer="adam",loss="binary_crossentropy",metrics=['accuracy'])
ann.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 128)               5888      
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_7 (Dense)             (None, 16)                528       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_8 (Dense)             (None, 1)                

In [34]:
ann.fit(x=train_set_encoded.drop(columns=['APPROVED'], axis=1), y=train_set_encoded[['APPROVED']], validation_data=(val_set_encoded.drop(columns=['APPROVED'], axis=1), val_set_encoded[['APPROVED']]), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1c6050c3ca0>

In [29]:
result = ann.predict(test_set_encoded.drop(columns=['APPROVED'], axis=1))



In [30]:
predict_rs = [0 if i < 0.5 else 1 for i in result]

In [31]:
val_set_encoded[['APPROVED']]

Unnamed: 0,APPROVED
526282,1
123051,1
110643,0
39773,0
438724,1
...,...
487377,0
321207,0
337332,0
194877,1


In [32]:
accuracy_score(test_set_encoded[['APPROVED']], predict_rs)

0.6806728315614841