### Importing libraries

In [79]:
import pandas as pd

from sklearn import model_selection
from sklearn import ensemble
from sklearn import pipeline
from sklearn import metrics

from feature_engine import imputation
from feature_engine import encoding

### Load data

In [80]:
df = pd.read_csv('../data/interim/class_record.csv')
df.CNT_FAM_MEMBERS = pd.to_numeric(df.CNT_FAM_MEMBERS, downcast="integer")
df.head(3)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,CLASS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,1
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,1
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2,0


### Data dictionary

<b>1. ID:</b> Client number.<br>
<b>2. CODE_GENDER:</b> Gender.<br>
<b>3. FLAG_OWN_CAR:</b> Is there a car.<br>
<b>4. FLAG_OWN_REALTY:</b> Is there a property.<br>
<b>5. CNT_CHILDREN:</b> Number of children.<br>
<b>6. AMT_INCOME_TOTAL:</b> Annual income.<br>
<b>7. NAME_INCOME_TYPE:</b> Income category.<br>
<b>8. NAME_EDUCATION_TYPE:</b> Education level.<br>
<b>9. NAME_FAMILY_STATUS:</b> Marital status.<br>
<b>10. NAME_HOUSING_TYPE:</b> Way of living.<br>
<b>11. DAYS_BIRTH:</b> Birthday. Count backwards from current day (0), -1 means yesterday.<br>
<b>12. DAYS_EMPLOYED:</b> Start date of employment. Count backwards from current day(0). If positive, it means the person currently unemployed.<br>
<b>13. FLAG_MOBIL:</b> Is there a mobile phone.<br>
<b>14. FLAG_WORK_PHONE:</b> Is there a work phone.<br>
<b>15. FLAG_PHONE:</b> Is there a phone.<br>
<b>16. FLAG_EMAIL:</b> Is there an email.<br>
<b>17. OCCUPATION_TYPE:</b> Occupation.<br>
<b>18. CNT_FAM_MEMBERS:</b> Family size.<br>
<b>19. CLASS:</b><br>

Link: https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction

In [81]:
features = df.columns[1:-1]
target = 'CLASS'

display(features)

Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')

### Split the data into train and test sets

In [82]:
random_state = 42

X_train, X_test, y_train, y_test = model_selection.train_test_split(df[features], 
                                                                    df[target],
                                                                    train_size=0.8, 
                                                                    stratify=df[target], 
                                                                    random_state=random_state)

print('Train target proportion: ')
print(f'{y_train.value_counts(normalize=True)}')
print(f'Rows: {X_train.shape[0]}, Columns: {X_train.shape[1]}')
print('\n')
print('Test target proportion: ')
print(f'{y_test.value_counts(normalize=True)}')
print(f'Rows: {X_test.shape[0]}, Columns: {X_test.shape[1]}')

Train target proportion: 
CLASS
0    0.713046
1    0.286954
Name: proportion, dtype: float64
Rows: 29165, Columns: 17


Test target proportion: 
CLASS
0    0.712973
1    0.287027
Name: proportion, dtype: float64
Rows: 7292, Columns: 17


In [83]:
X_train.loc[X_train.duplicated(keep=False)].sort_values(by=list(X_train.columns), axis=0)

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
31382,F,N,N,0,33300.0,Pensioner,Secondary / secondary special,Married,Office apartment,-19605,365243,1,0,0,0,,2
31378,F,N,N,0,33300.0,Pensioner,Secondary / secondary special,Married,Office apartment,-19605,365243,1,0,0,0,,2
31379,F,N,N,0,33300.0,Pensioner,Secondary / secondary special,Married,Office apartment,-19605,365243,1,0,0,0,,2
31384,F,N,N,0,33300.0,Pensioner,Secondary / secondary special,Married,Office apartment,-19605,365243,1,0,0,0,,2
31383,F,N,N,0,33300.0,Pensioner,Secondary / secondary special,Married,Office apartment,-19605,365243,1,0,0,0,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31853,M,Y,Y,5,202500.0,Working,Secondary / secondary special,Married,Co-op apartment,-11384,-2727,1,0,0,0,Managers,7
31851,M,Y,Y,5,202500.0,Working,Secondary / secondary special,Married,Co-op apartment,-11384,-2727,1,0,0,0,Managers,7
31857,M,Y,Y,5,202500.0,Working,Secondary / secondary special,Married,Co-op apartment,-11384,-2727,1,0,0,0,Managers,7
14671,M,Y,Y,14,225000.0,Working,Secondary / secondary special,Separated,House / apartment,-17754,-1689,1,0,0,0,Drivers,15


In [84]:
X_train.isna().sum()

CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
DAYS_BIRTH                0
DAYS_EMPLOYED             0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE        9053
CNT_FAM_MEMBERS           0
dtype: int64

### Imputation for NaN values

In [85]:
X_train.OCCUPATION_TYPE.unique()

array([nan, 'Laborers', 'Drivers', 'Sales staff', 'Core staff',
       'Managers', 'High skill tech staff', 'Cooking staff',
       'Accountants', 'Private service staff', 'Medicine staff',
       'Cleaning staff', 'Low-skill Laborers', 'Waiters/barmen staff',
       'Security staff', 'HR staff', 'Realty agents', 'Secretaries',
       'IT staff'], dtype=object)

In [86]:
imput_nan = imputation.CategoricalImputer(imputation_method='missing',
                                          variables=['OCCUPATION_TYPE'])

### Encoding for categorical variables

In [87]:
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

In [88]:
one_hot_encoder = encoding.OneHotEncoder(
    top_categories=None,
    variables=cat_columns,
    drop_last=True
)

### Model Selection and Training

In [89]:
rfc = ensemble.RandomForestClassifier(random_state=random_state)

params = {
    'n_estimators': [200, 300, 400, 500],
    'min_samples_leaf': [10, 20, 50, 100]
}

grid = model_selection.GridSearchCV(rfc,
                                    param_grid=params,
                                    scoring='roc_auc',
                                    n_jobs=-1)

model = pipeline.Pipeline([
    ('imput', imput_nan),
    ('one_hot', one_hot_encoder),
    ('model', grid)
])

model.fit(X_train, y_train)

### Predict and Metrics

In [90]:
y_train_predict = model.predict(X_train)
y_train_proba = model.predict_proba(X_train)[:,1]

y_test_predict = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:,1]

In [91]:
acc_train = metrics.accuracy_score(y_train, y_train_predict)
auc_train = metrics.roc_auc_score(y_train, y_train_proba)

acc_test = metrics.accuracy_score(y_test, y_test_predict)
auc_test = metrics.roc_auc_score(y_test, y_test_proba)

In [92]:
count_class_0 = df[df['CLASS'] == 0].shape[0]
count_all_classes = df.shape[0]
no_to_every_client = count_class_0 / count_all_classes

In [93]:
print(f'Saying NO to every client: {no_to_every_client}')
print(f'Accuracy train: {acc_train}')
print(f'Accuracy test: {acc_test}')

print('\n')

print(f'AUC Score train: {auc_train}')
print(f'AUC Score test: {auc_test}')

Saying NO to every client: 0.7130317908769235
Accuracy train: 0.7285787759300532
Accuracy test: 0.7216127262753703


AUC Score train: 0.842532687736419
AUC Score test: 0.7039495080966267


In [94]:
feature_importances = model.named_steps['model'].best_estimator_.feature_importances_

In [96]:
feature_importances

array([0.02888994, 0.15111059, 0.21379867, 0.17857052, 0.        ,
       0.01878609, 0.02529811, 0.01249389, 0.03654493, 0.02155725,
       0.02403189, 0.0269192 , 0.01883036, 0.01712425, 0.00954105,
       0.00418575, 0.0125724 , 0.01228626, 0.00501957, 0.00213114,
       0.01619719, 0.00974399, 0.01140812, 0.00818545, 0.01093423,
       0.00591687, 0.00361247, 0.005805  , 0.00153353, 0.01515791,
       0.01498318, 0.00765069, 0.01198859, 0.0117282 , 0.01044067,
       0.00627584, 0.0035046 , 0.00714163, 0.00226745, 0.00491566,
       0.00231301, 0.00129371, 0.00185925, 0.00295453, 0.00052503,
       0.00131489, 0.00065643])