In [1]:
import pandas as pd

from sklearn import model_selection
from sklearn import ensemble
from sklearn import pipeline

from feature_engine import encoding
from feature_engine import imputation

In [2]:
# Read the interim class-record CSV (path is relative to the notebook's
# working directory) and preview the first three rows.
class_record_path = '../data/interim/class_record.csv'
df = pd.read_csv(class_record_path)
df.head(3)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,CLASS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,1
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,1
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,1


In [3]:
# Seed shared by the train/test split and the random forest below.
random_state = 42

# Name of the label column.
target = 'CLASS'

# Candidate predictors: every column except the first and the last one.
# NOTE(review): this relies on column order -- presumably the first column is
# the record identifier and the last one is `target`; confirm against the CSV.
features = df.columns[1:-1]

In [4]:
# Stratified split on the target so both partitions keep the same class ratio.
# NOTE(review): train_size=0.2 puts only 20% of the rows in the training set
# and the remaining 80% in the test set. If an 80/20 train/test split was
# intended, this should be test_size=0.2 -- confirm before changing, since the
# recorded outputs below were produced with the current setting.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df[features],
    df[target],
    train_size=0.2,
    stratify=df[target],
    random_state=random_state,
)

In [5]:
# Sanity check for the stratification: the mean of the target should be
# (nearly) identical in both partitions.
for split_name, split_target in (('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name} mean: {split_target.mean().round(5)}')

y_train mean: 0.38074
y_test mean: 0.38068


In [6]:
# Number of missing values per feature column in the training partition.
X_train.isna().sum(axis=0)

CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
DAYS_BIRTH                0
DAYS_EMPLOYED             0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE        2240
CNT_FAM_MEMBERS           0
dtype: int64

In [7]:
# Distinct values (NaN included) of the only column that has missing entries.
X_train['OCCUPATION_TYPE'].unique()

array(['Core staff', 'Sales staff', 'Laborers', 'Realty agents',
       'Accountants', 'Cooking staff', nan, 'Managers',
       'Private service staff', 'Medicine staff', 'Drivers',
       'Cleaning staff', 'High skill tech staff', 'Security staff',
       'Low-skill Laborers', 'Secretaries', 'Waiters/barmen staff',
       'IT staff', 'HR staff'], dtype=object)

In [8]:
# Preprocessing + hyper-parameter-tuned random forest, fitted on the training split.

# Step 1: replace missing OCCUPATION_TYPE values with a dedicated placeholder
# category instead of dropping rows or guessing a real occupation.
imput_nan = imputation.CategoricalImputer(imputation_method='missing',
                                          variables=['OCCUPATION_TYPE'])

# Step 2: one-hot encode the string-valued columns (CODE_GENDER, FLAG_OWN_CAR,
# NAME_INCOME_TYPE, OCCUPATION_TYPE, ...). RandomForestClassifier cannot fit
# raw strings; without this step `fit` fails with
# "could not convert string to float". With no `variables` argument the
# encoder selects every object/categorical column it sees during fit.
encode_cat = encoding.OneHotEncoder()

rfc = ensemble.RandomForestClassifier(random_state=random_state)

# Search space: forest size x minimum leaf size (16 combinations).
params = {
    'n_estimators': [200,300,400,500],
    'min_samples_leaf': [10,20,50,100]
}

# Cross-validated grid search scored by ROC AUC, using all available cores.
grid = model_selection.GridSearchCV(rfc,
                                    param_grid=params,
                                    scoring='roc_auc',
                                    n_jobs=-1)

# The grid search is the final pipeline step, so the imputer and the encoder
# are fitted once on the whole of X_train and the search then runs on the
# transformed frame. Step names 'imput' and 'model' are unchanged.
model = pipeline.Pipeline([
    ('imput', imput_nan),
    ('encode', encode_cat),
    ('model', grid)
])

model.fit(X_train, y_train)