In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_score, confusion_matrix, precision_recall_curve, auc, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
import copy

In [2]:
application = pd.read_csv('application_record.csv')
credit_records = pd.read_csv('credit_record.csv')

In [3]:
application.shape

(438557, 18)

In [4]:
application["CODE_GENDER"].unique()

array(['M', 'F'], dtype=object)

In [5]:
application["FLAG_OWN_CAR"].unique()

array(['Y', 'N'], dtype=object)

In [6]:
application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [7]:
application.describe()

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS
count,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0
mean,6022176.0,0.42739,187524.3,-15997.904649,60563.675328,1.0,0.206133,0.287771,0.108207,2.194465
std,571637.0,0.724882,110086.9,4185.030007,138767.799647,0.0,0.404527,0.452724,0.310642,0.897207
min,5008804.0,0.0,26100.0,-25201.0,-17531.0,1.0,0.0,0.0,0.0,1.0
25%,5609375.0,0.0,121500.0,-19483.0,-3103.0,1.0,0.0,0.0,0.0,2.0
50%,6047745.0,0.0,160780.5,-15630.0,-1467.0,1.0,0.0,0.0,0.0,2.0
75%,6456971.0,1.0,225000.0,-12514.0,-371.0,1.0,0.0,1.0,0.0,3.0
max,7999952.0,19.0,6750000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,20.0


In [8]:
application.nunique()

ID                     438510
CODE_GENDER                 2
FLAG_OWN_CAR                2
FLAG_OWN_REALTY             2
CNT_CHILDREN               12
AMT_INCOME_TOTAL          866
NAME_INCOME_TYPE            5
NAME_EDUCATION_TYPE         5
NAME_FAMILY_STATUS          5
NAME_HOUSING_TYPE           6
DAYS_BIRTH              16379
DAYS_EMPLOYED            9406
FLAG_MOBIL                  1
FLAG_WORK_PHONE             2
FLAG_PHONE                  2
FLAG_EMAIL                  2
OCCUPATION_TYPE            18
CNT_FAM_MEMBERS            13
dtype: int64

In [9]:
occupation_nulls = application['OCCUPATION_TYPE'].isnull()

X_train = application[~occupation_nulls].drop('OCCUPATION_TYPE', axis=1)
y_train = application[~occupation_nulls]['OCCUPATION_TYPE']

X_pred = application[occupation_nulls].drop('OCCUPATION_TYPE', axis=1)

X_train_encoded = pd.get_dummies(X_train, columns=["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", 
                                                  "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", 
                                                  "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE"])
X_pred_encoded = pd.get_dummies(X_pred, columns=["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", 
                                                "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", 
                                                "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE"])

X_pred_encoded, X_train_encoded = X_pred_encoded.align(X_train_encoded, join='left', axis=1, fill_value=0)

model = RandomForestClassifier(random_state=42)
model.fit(X_train_encoded, y_train)

predictions = model.predict(X_pred_encoded)

application.loc[occupation_nulls, 'OCCUPATION_TYPE'] = predictions

In [10]:
application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [11]:
prepared_application = pd.get_dummies(application, columns=["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "OCCUPATION_TYPE"]).astype(int)

In [12]:
prepared_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 55 columns):
 #   Column                                             Non-Null Count   Dtype
---  ------                                             --------------   -----
 0   ID                                                 438557 non-null  int32
 1   CNT_CHILDREN                                       438557 non-null  int32
 2   AMT_INCOME_TOTAL                                   438557 non-null  int32
 3   DAYS_BIRTH                                         438557 non-null  int32
 4   DAYS_EMPLOYED                                      438557 non-null  int32
 5   FLAG_MOBIL                                         438557 non-null  int32
 6   FLAG_WORK_PHONE                                    438557 non-null  int32
 7   FLAG_PHONE                                         438557 non-null  int32
 8   FLAG_EMAIL                                         438557 non-null  int32
 9   CNT_FAM_MEMBERS

In [13]:
credit_records.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [14]:
credit_records["STATUS"] = credit_records["STATUS"].replace(['X', 'C'], 0)

In [15]:
credit_records.STATUS.unique()

array([0, '0', '1', '2', '3', '4', '5'], dtype=object)

In [16]:
credit_records.drop(columns=['MONTHS_BALANCE'], inplace=True)

In [17]:
credit_records["STATUS"] = credit_records["STATUS"].astype(int)

In [18]:
result = credit_records.groupby('ID').agg({'STATUS': lambda x: (x**3).sum() / len(x)}).reset_index()

In [19]:
result["DECISION"] = result["STATUS"].apply(lambda x: 1 if x < 4 else 0) # ЗДЕСЬ ЗАДАЕТСЯ ПОРОГОВОЕ ЗНАЧЕНИЕ

In [20]:
result[result["DECISION"] == 0]

Unnamed: 0,ID,STATUS,DECISION
1631,5003804,70.000000,0
2219,5004559,10.326087,0
2450,5004891,12.500000,0
2695,5005205,6.213115,0
3250,5008827,19.473684,0
...,...,...,...
45105,5149190,104.166667,0
45107,5149192,105.769231,0
45621,5149828,36.583333,0
45626,5149834,41.208333,0


In [21]:
correlation_matrix = prepared_application.corr()

In [22]:
dataset = pd.merge(prepared_application, result, on='ID')

In [23]:
dataset.drop(columns=['ID', "FLAG_MOBIL"], inplace=True)

In [24]:
correlation_matrix = dataset.corr()

In [25]:
dataset.drop(columns=['STATUS'], inplace=True)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=['DECISION']), dataset['DECISION'], test_size=0.2, random_state=42, stratify=dataset['DECISION'])

In [27]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
oversample = SMOTE()
X_train_scaled, y_train = oversample.fit_resample(X_train_scaled, y_train)

In [29]:
y_train.sum() / len(y_train)

0.5

In [30]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=10
)

In [31]:
xgb_model.fit(X_train_scaled, y_train)
y_pred = xgb_model.predict(X_test_scaled)

Parameters: { "use_label_encoder" } are not used.



In [32]:
y_probs = xgb_model.predict_proba(X_test_scaled)[:, 1]

threshold = 0.9
y_pred = (y_probs >= threshold).astype(int)

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.32      0.30      0.31        40
           1       1.00      1.00      1.00      7252

    accuracy                           0.99      7292
   macro avg       0.66      0.65      0.65      7292
weighted avg       0.99      0.99      0.99      7292



In [34]:
print(accuracy_score(y_test, y_pred), recall_score(y_test, y_pred), precision_score(y_test, y_pred), f1_score(y_test, y_pred))

0.9927317608337904 0.9965526751241037 0.9961405926946933 0.9963465913007513


In [35]:
conf_matrix = confusion_matrix(y_test, y_pred)

In [36]:
conf_matrix

array([[  12,   28],
       [  25, 7227]], dtype=int64)

In [37]:
xgb_m = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=10
)

In [38]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

In [39]:
scorer = make_scorer(recall_score, pos_label=0)

grid_search = GridSearchCV(estimator=xgb_m, param_grid=param_grid, cv=5, scoring=scorer, verbose=1, n_jobs=-1)

# grid_search.fit(X_train_scaled, y_train)

In [40]:
best_model = grid_search.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [41]:
y_pred = best_model.predict(X_test_scaled)

NameError: name 'best_model' is not defined

In [42]:
conf_matrix = confusion_matrix(y_test, y_pred)

In [43]:
conf_matrix

array([[  12,   28],
       [  25, 7227]], dtype=int64)

In [44]:
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]

threshold = 0.999
y_pred = (y_probs >= threshold).astype(int)

NameError: name 'best_model' is not defined

In [45]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[  12,   28],
       [  25, 7227]], dtype=int64)

In [46]:
dataset.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,CODE_GENDER_F,CODE_GENDER_M,...,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,DECISION
0,0,427500,-12005,-4542,1,0,0,2,0,1,...,0,1,0,0,0,0,0,0,0,1
1,0,427500,-12005,-4542,1,0,0,2,0,1,...,0,1,0,0,0,0,0,0,0,1
2,0,112500,-21474,-1134,0,0,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
3,0,270000,-19110,-3051,0,1,1,1,1,0,...,0,0,0,0,0,1,0,0,0,1
4,0,270000,-19110,-3051,0,1,1,1,1,0,...,0,0,0,0,0,1,0,0,0,1


In [47]:
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(53, 128)
        self.dropout1 = nn.Dropout(0.3)  

        self.fc2 = nn.Linear(128, 128)
        self.dropout2 = nn.Dropout(0.4)  

        self.fc3 = nn.Linear(128, 64)
        self.dropout3 = nn.Dropout(0.3)

        self.fc4 = nn.Linear(64, 32)
        self.dropout4 = nn.Dropout(0.2)  

        self.fc5 = nn.Linear(32, 32)
        self.dropout5 = nn.Dropout(0.1)

        self.fc6 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout1(x)
        
        x = self.relu(self.fc2(x))
        x = self.dropout2(x)
        
        x = self.relu(self.fc3(x))
        x = self.dropout3(x)
        
        x = self.relu(self.fc4(x))
        x = self.dropout4(x)
    
        x = self.relu(self.fc5(x))
        x = self.dropout5(x)

        x = self.fc6(x)
        return x

In [48]:
model = BinaryClassifier()
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [49]:
model

BinaryClassifier(
  (fc1): Linear(in_features=53, out_features=128, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (dropout2): Dropout(p=0.4, inplace=False)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (dropout3): Dropout(p=0.3, inplace=False)
  (fc4): Linear(in_features=64, out_features=32, bias=True)
  (dropout4): Dropout(p=0.2, inplace=False)
  (fc5): Linear(in_features=32, out_features=32, bias=True)
  (dropout5): Dropout(p=0.1, inplace=False)
  (fc6): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)

In [50]:
model.train()
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values).reshape(-1, 1)

In [51]:
best_model = copy.deepcopy(model)
for epoch in range(0, 200):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

    best_metric = 0

    if epoch % 10 == 0:
        print("Epoch: ", epoch, loss.item())
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_tensor)
            train_probs = torch.sigmoid(train_outputs)
            train_preds = (train_probs > 0.5).int().cpu().numpy()
            true_vals = y_train_tensor.int().cpu().numpy()
            cm = confusion_matrix(true_vals, train_preds)
            print(cm)
            recall = cm[1, 1] / (cm[1, 0] + cm[1, 1]) if (cm[1, 0] + cm[1, 1]) > 0 else 0
            specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1]) if (cm[0, 0] + cm[0, 1]) > 0 else 0
            print("Recall:", recall, "specificity:", specificity)
            metric = recall
        
            if metric > best_metric and epoch > 30:
                best_metric = metric
                best_model = copy.deepcopy(model)
                torch.save(model.state_dict(), 'best_model.pth')
                print("Model saved.")
        model.train()

Epoch:  0 0.6961548924446106
[[29004     0]
 [29004     0]]
Recall: 0.0 specificity: 1.0
Epoch:  10 0.688594400882721
[[20789  8215]
 [ 9963 19041]]
Recall: 0.6564956557716177 specificity: 0.716763205075162
Epoch:  20 0.6609948873519897
[[ 9856 19148]
 [ 2011 26993]]
Recall: 0.9306647358984967 specificity: 0.33981519790373743
Epoch:  30 0.6049875617027283
[[21799  7205]
 [ 7936 21068]]
Recall: 0.726382567921666 specificity: 0.751585988139567
Epoch:  40 0.5302464962005615
[[26067  2937]
 [ 7768 21236]]
Recall: 0.7321748724313888 specificity: 0.8987381050889532
Model saved.
Epoch:  50 0.44293463230133057
[[28325   679]
 [ 6977 22027]]
Recall: 0.7594469728313336 specificity: 0.9765894359398704
Model saved.
Epoch:  60 0.35657399892807007
[[28814   190]
 [ 5240 23764]]
Recall: 0.8193352641015033 specificity: 0.9934491794235278
Model saved.
Epoch:  70 0.2789064943790436
[[28902   102]
 [ 3678 25326]]
Recall: 0.8731899048407116 specificity: 0.9964832436905254
Model saved.
Epoch:  80 0.2120735

In [67]:
best_model.eval()
y_pred = best_model(torch.FloatTensor(X_test_scaled))
y_pred = torch.sigmoid(y_pred)
y_pred = (y_pred > 0.5).int().cpu().numpy()
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[  15,   25],
       [ 181, 7071]], dtype=int64)

In [60]:
print(accuracy_score(y_test, y_pred), recall_score(y_test, y_pred), precision_score(y_test, y_pred), f1_score(y_test, y_pred))

0.9717498628634119 0.9750413678985107 0.9964768883878241 0.9856425982715361
