# Imports

In [14]:
import pickle as pkl

import pandas as pd
from sklearn.model_selection import train_test_split


# Load data

In [15]:
# Read the raw CSV; its first column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='data/raw/cs-training.csv',
    index_col=0,
)
print(df.shape)
df.head(3)

(150000, 11)


Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0


# Cut test and save

In [16]:
# Carve off a fixed-size (3000 rows) hold-out test set with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=3000,
    random_state=16,
)
(df_train.shape, df_test.shape)

((147000, 11), (3000, 11))

In [17]:
# Write the hold-out test rows to disk; the row index is not written.
df_test.to_csv(path_or_buf='data/raw/test.csv', index=False)

# Prepare data

## Train-val split

In [18]:
# Split the remaining rows again: 7000 go to validation, the rest stay in
# df_train (note that df_train is rebound here).
df_train, df_val = train_test_split(
    df_train,
    test_size=7000,
    random_state=16,
)
(df_train.shape, df_val.shape)

((140000, 11), (7000, 11))

## Handle missing data

Усі колонки числові. Заповнимо пропуски медіанами (порахованими на train) і додамо колонки-індикатори `isna_<назва колонки>`.

In [19]:
# Per-column count of missing values in the training split, keeping only
# the columns that actually contain missing values.
na_count = df_train.isna().sum().loc[lambda counts: counts != 0]
na_count

MonthlyIncome         27785
NumberOfDependents     3675
dtype: int64

In [20]:
# Names of the columns that have at least one missing value in df_train.
na_cols = na_count.index.tolist()
na_cols

['MonthlyIncome', 'NumberOfDependents']

In [21]:
# Median imputation with missing-value indicator columns.
#
# For every column listed in na_cols:
#   * add a boolean `isna_<col>` column that is True exactly where the
#     original value was missing (False elsewhere);
#   * fill the missing values with the median of the *training* split,
#     applying that same value to the validation split.
# mis_dict maps column name -> the training median used as fill value.
mis_dict = {}

for c in na_cols:
    # Capture the masks before filling, otherwise the information is lost.
    isna_train = df_train[c].isna()
    isna_val = df_val[c].isna()

    # Store the mask itself as the indicator. (Previously the column was
    # initialised to True and then set to True again on the missing rows,
    # which made the indicator constant and therefore useless.)
    df_train[f'isna_{c}'] = isna_train
    df_val[f'isna_{c}'] = isna_val

    # Median is computed on the training data only (NaNs are skipped).
    median_value = df_train[c].median()
    df_train.loc[isna_train, c] = median_value
    df_val.loc[isna_val, c] = median_value

    mis_dict[c] = median_value

In [22]:
# Total number of remaining missing values in (train, val).
tuple(frame.isna().sum().sum() for frame in (df_train, df_val))

(0, 0)

In [24]:
# Serialize mis_dict (column name -> training median used for filling)
# to disk. `pkl` is imported in the notebook's top import cell.
with open('models/missings.pkl', 'wb') as handle:
    pkl.dump(mis_dict, handle)

# Save data and models

In [12]:
# Write the prepared training split to disk; the row index is not written.
df_train.to_csv(path_or_buf='data/ready/train_rf.csv', index=False)

In [13]:
# Write the prepared validation split to disk; the row index is not written.
df_val.to_csv(path_or_buf='data/ready/val_rf.csv', index=False)