### About Dataset

<table>
  <thead>
    <tr>
      <th>application_record.csv</th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Feature name</td>
      <td>Explanation</td>
      <td>Remarks</td>
    </tr>
    <tr>
      <td>ID</td>
      <td>Client number</td>
      <td></td>
    </tr>
    <tr>
      <td>CODE_GENDER</td>
      <td>Gender</td>
      <td></td>
    </tr>
    <tr>
      <td>FLAG_OWN_CAR</td>
      <td>Is there a car</td>
      <td></td>
    </tr>
    <tr>
      <td>FLAG_OWN_REALTY</td>
      <td>Is there a property</td>
      <td></td>
    </tr>
    <tr>
      <td>CNT_CHILDREN</td>
      <td>Number of children</td>
      <td></td>
    </tr>
    <tr>
      <td>AMT_INCOME_TOTAL</td>
      <td>Annual income</td>
      <td></td>
    </tr>
    <tr>
      <td>NAME_INCOME_TYPE</td>
      <td>Income category</td>
      <td></td>
    </tr>
    <tr>
      <td>NAME_EDUCATION_TYPE</td>
      <td>Education level</td>
      <td></td>
    </tr>
    <tr>
      <td>NAME_FAMILY_STATUS</td>
      <td>Marital status</td>
      <td></td>
    </tr>
    <tr>
      <td>NAME_HOUSING_TYPE</td>
      <td>Way of living</td>
      <td></td>
    </tr>
    <tr>
      <td>DAYS_BIRTH</td>
      <td>Birthday</td>
      <td>Count backwards from current day (0), -1 means yesterday</td>
    </tr>
    <tr>
      <td>DAYS_EMPLOYED</td>
      <td>Start date of employment</td>
      <td>Count backwards from current day(0). If positive, it means the person currently unemployed.</td>
    </tr>
    <tr>
      <td>FLAG_MOBIL</td>
      <td>Is there a mobile phone</td>
      <td></td>
    </tr>
    <tr>
      <td>FLAG_WORK_PHONE</td>
      <td>Is there a work phone</td>
      <td></td>
    </tr>
    <tr>
      <td>FLAG_PHONE</td>
      <td>Is there a phone</td>
      <td></td>
    </tr>
    <tr>
      <td>FLAG_EMAIL</td>
      <td>Is there an email</td>
      <td></td>
    </tr>
    <tr>
      <td>OCCUPATION_TYPE</td>
      <td>Occupation</td>
      <td></td>
    </tr>
    <tr>
      <td>CNT_FAM_MEMBERS</td>
      <td>Family size</td>
      <td></td>
    </tr>
  </tbody>
</table> 

Link: https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction

### Importing libraries

In [1]:
import pandas as pd

from sklearn import model_selection
from sklearn import ensemble
from sklearn import pipeline
from sklearn import metrics

from feature_engine import imputation
from feature_engine import encoding

### Load data

In [2]:
df = pd.read_csv('../data/interim/class_record.csv')
df.CNT_FAM_MEMBERS = pd.to_numeric(df.CNT_FAM_MEMBERS, downcast="integer")
df.head(3)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,CLASS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,1
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,1
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2,1


In [3]:
features = df.columns[1:-1]
target = 'CLASS'

### Split the data into train and test sets

In [4]:
random_state = 42

X_train, X_test, y_train, y_test = model_selection.train_test_split(df[features], 
                                                                    df[target],
                                                                    train_size=0.2, 
                                                                    stratify=df[target], 
                                                                    random_state=random_state)

print(f'y_train mean: {y_train.mean().round(5)}')
print(f'y_test mean: {y_test.mean().round(5)}')

y_train mean: 0.38074
y_test mean: 0.38068


### Imputation for categorical variables

In [5]:
X_train.isna().sum()

CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
DAYS_BIRTH                0
DAYS_EMPLOYED             0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE        2240
CNT_FAM_MEMBERS           0
dtype: int64

In [6]:
X_train.OCCUPATION_TYPE.unique()

array(['Core staff', 'Sales staff', 'Laborers', 'Realty agents',
       'Accountants', 'Cooking staff', nan, 'Managers',
       'Private service staff', 'Medicine staff', 'Drivers',
       'Cleaning staff', 'High skill tech staff', 'Security staff',
       'Low-skill Laborers', 'Secretaries', 'Waiters/barmen staff',
       'IT staff', 'HR staff'], dtype=object)

In [7]:
imput_nan = imputation.CategoricalImputer(imputation_method='missing',
                                          variables=['OCCUPATION_TYPE'])

### Encoding for categorical variables

In [8]:
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

In [9]:
one_hot_encoder = encoding.OneHotEncoder(
    top_categories=None,
    variables=cat_columns,
    drop_last=True
)

### Model Selection and Training

In [10]:
rfc = ensemble.RandomForestClassifier(random_state=random_state)

params = {
    'n_estimators': [200, 300, 400, 500],
    'min_samples_leaf': [10, 20, 50, 100]
}

grid = model_selection.GridSearchCV(rfc,
                                    param_grid=params,
                                    scoring='roc_auc',
                                    n_jobs=-1)

model = pipeline.Pipeline([
    ('imput', imput_nan),
    ('one_hot', one_hot_encoder),
    ('model', grid)
])

model.fit(X_train, y_train)

### Predict and Metrics

In [11]:
y_train_predict = model.predict(X_train)
y_train_proba = model.predict_proba(X_train)[:,1]

y_test_predict = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:,1]

In [12]:
acc_train = metrics.accuracy_score(y_train, y_train_predict)
auc_train = metrics.roc_auc_score(y_train, y_train_proba)

acc_test = metrics.accuracy_score(y_test, y_test_predict)
auc_test = metrics.roc_auc_score(y_test, y_test_proba)

In [13]:
count_class_0 = df[df['CLASS'] == 0].shape[0]
count_all_classes = df.shape[0]
no_to_every_client = count_class_0 / count_all_classes

In [14]:
print(f'Saying NO to every client: {no_to_every_client}')
print(f'Accuracy train: {acc_train}')
print(f'Accuracy test: {acc_test}')

print('\n')

print(f'AUC Score train: {auc_train}')
print(f'AUC Score test: {auc_test}')

Saying NO to every client: 0.6193049345804646
Accuracy train: 0.6619119462350843
Accuracy test: 0.6251114311184256


AUC Score train: 0.8466384865051173
AUC Score test: 0.5949041380177634


In [18]:
feature_importances = model.named_steps['model'].best_estimator_.feature_importances_

importance_df = pd.DataFrame({
    'Feature': model.named_steps['one_hot'].get_feature_names_out(),
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

In [19]:
importance_df

Unnamed: 0,Feature,Importance
2,DAYS_BIRTH,0.204681
3,DAYS_EMPLOYED,0.184194
1,AMT_INCOME_TOTAL,0.149511
8,CNT_FAM_MEMBERS,0.033562
0,CNT_CHILDREN,0.027601
10,FLAG_OWN_CAR_Y,0.027063
6,FLAG_PHONE,0.025515
11,FLAG_OWN_REALTY_Y,0.025108
9,CODE_GENDER_M,0.02393
5,FLAG_WORK_PHONE,0.02018
