In [76]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

Загрузка данных из файлов

In [77]:
# Read both source CSV files into DataFrames (paths are relative to the notebook).
applications, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/application_record.csv",
        "datasets/credit_record.csv",
    )
)

Проверка данных

In [78]:
# Preview the first five rows of the applications table.
applications.head(n=5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [79]:
# (number of rows, number of columns)
(len(applications.index), len(applications.columns))

(438557, 18)

Преобразуем названия столбцов

In [80]:
# Replace the original upper-case column codes with short snake_case names.
# The frame is modified in place; columns not listed here (e.g. ID) keep their names.
applications.rename(
    columns=dict(
        CODE_GENDER="gender",
        FLAG_OWN_CAR="f_car",
        FLAG_OWN_REALTY="f_realty",
        CNT_CHILDREN="cnt_children",
        AMT_INCOME_TOTAL="total_income",
        NAME_INCOME_TYPE="income_type",
        NAME_EDUCATION_TYPE="education",
        NAME_FAMILY_STATUS="family_status",
        NAME_HOUSING_TYPE="housing",
        DAYS_BIRTH="birthday",
        DAYS_EMPLOYED="employment_days",
        FLAG_MOBIL="f_mobile",
        FLAG_WORK_PHONE="f_work_phone",
        FLAG_PHONE="f_phone",
        FLAG_EMAIL="f_email",
        OCCUPATION_TYPE="occupation",
        CNT_FAM_MEMBERS="cnt_fam_members",
    ),
    inplace=True,
)

Проверим типы данных

In [81]:
# Print a per-column summary (non-null count and dtype) of the renamed frame.
applications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   ID               438557 non-null  int64  
 1   gender           438557 non-null  object 
 2   f_car            438557 non-null  object 
 3   f_realty         438557 non-null  object 
 4   cnt_children     438557 non-null  int64  
 5   total_income     438557 non-null  float64
 6   income_type      438557 non-null  object 
 7   education        438557 non-null  object 
 8   family_status    438557 non-null  object 
 9   housing          438557 non-null  object 
 10  birthday         438557 non-null  int64  
 11  employment_days  438557 non-null  int64  
 12  f_mobile         438557 non-null  int64  
 13  f_work_phone     438557 non-null  int64  
 14  f_phone          438557 non-null  int64  
 15  f_email          438557 non-null  int64  
 16  occupation       304354 non-null  object 

Проверяем пропуски

In [82]:
# Number of missing values in each column.
applications.isna().sum()

ID                      0
gender                  0
f_car                   0
f_realty                0
cnt_children            0
total_income            0
income_type             0
education               0
family_status           0
housing                 0
birthday                0
employment_days         0
f_mobile                0
f_work_phone            0
f_phone                 0
f_email                 0
occupation         134203
cnt_fam_members         0
dtype: int64

Заполняем пропуски при помощи RandomForestClassifier. Столбец occupation нужно заполнить обязательно: в нём много пропусков (134 203 из 438 557 строк), и удалять такие строки было бы кощунством.

In [83]:
# Split the applications into rows with a known occupation (training data)
# and rows where it is missing (rows to impute).
#
# Feature columns: everything except the target itself and `ID`.
# `ID` is a row identifier rather than an applicant attribute; leaving it in
# lets the forest split on an arbitrary key instead of real characteristics.
columns_o = applications.columns.drop(["occupation", "ID"])

has_occupation = applications["occupation"].notna()
known = applications[has_occupation].copy()
unknown = applications[~has_occupation].copy()

X_known = known[columns_o]
y_known = known["occupation"]
# `unknown` keeps the original row labels, so predictions made for
# `X_unknown` can be matched back to `applications` via `unknown.index`.
X_unknown = unknown[columns_o]

In [84]:
# Feature names grouped by dtype: string-like (object) columns get one-hot
# encoded downstream; numeric columns are listed for reference.
cat_cols = list(X_known.select_dtypes(include="object").columns)
num_cols = list(X_known.select_dtypes(include="number").columns)

In [85]:
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at predict time); all remaining columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="passthrough",
)

# Preprocessing followed by a 100-tree random forest with a fixed seed.
clf = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("rf", RandomForestClassifier(n_estimators=100, random_state=2005)),
    ]
)

In [86]:
# Train on the rows with a known occupation, then predict it for the rest.
# `Pipeline.fit` returns the fitted pipeline, so the two calls can be chained.
preds = clf.fit(X_known, y_known).predict(X_unknown)

In [87]:
# Write the predicted occupations back into the rows they were computed for.
# `unknown` holds exactly the rows passed to `clf.predict` (in the same order),
# so addressing them by `unknown.index` keeps each prediction tied to its row
# and makes this cell safe to re-run: a fresh `isna()` mask would select no
# rows once the column has been filled and would no longer match `preds`.
applications.loc[unknown.index, "occupation"] = preds

In [88]:
# Re-check missing values per column after the imputation step.
applications.isna().sum()

ID                 0
gender             0
f_car              0
f_realty           0
cnt_children       0
total_income       0
income_type        0
education          0
family_status      0
housing            0
birthday           0
employment_days    0
f_mobile           0
f_work_phone       0
f_phone            0
f_email            0
occupation         0
cnt_fam_members    0
dtype: int64

In [89]:
# Count rows whose ID repeats an earlier row's ID.
applications["ID"].duplicated().sum()

np.int64(47)

In [90]:
# Keep only the first row for every ID; later repeats are discarded.
applications = applications[~applications.duplicated(subset=["ID"])]