# Preparing data

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [64]:
original_df = pd.read_csv('CreditDataset_with_missing_income.csv')
categorial_feature_names=['Home', 'Marital','Job']
numeric_features_names = ['Seniority', 'Time', 'Age', 'Records', 'Expenses', 'Income', 'Assets', 'Debt', 'Amount', 'Price']
missing_values = {'Income': [0, 99999999], 'Assets': [99999999], 'Debt': [99999999], 'Home': [0]}
target = 'Income'
estimator_dict = {'Home': DecisionTreeClassifier(max_depth= 7, criterion = 'entropy')}

1 Status	credit status
2 Seniority	job seniority (years)
3 Home	    type of home ownership
4 Time	    time of requested loan
5 Age	    client's age
6 Marital	marital status
7 Records	existance of records
8 Job	    type of job
9 Expenses	amount of expenses
10 Income	amount of income
11 Assets	amount of assets
12 Debt	    amount of debt
13 Amount	amount requested of loan
14 Price	price of good

In [41]:
original_df.shape

(4040, 13)

In [42]:
original_df.head()

Unnamed: 0,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,9,1,60,30,2,1,3,73,0,0,0,800,846
1,17,1,60,58,3,1,1,48,0,0,0,1000,1658
2,10,2,36,46,2,2,3,90,0,3000,0,2000,2985
3,0,1,60,24,1,1,1,63,0,2500,0,900,1325
4,0,1,36,26,1,1,1,46,0,0,0,310,910


Предобработка данных (valid_df and df_with_missing_values_in_target)

In [43]:
valid_value_index = get_valid_values_index(original_df[target], missing_values[target])
invalid_value_index = np.invert(valid_value_index)

In [44]:
train_df = original_df[valid_value_index]
train_df.head()

Unnamed: 0,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
5,1,2,60,36,2,1,1,75,214,3500,0,650,1645
6,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
7,9,5,12,27,1,1,1,35,80,0,0,200,1093
8,0,2,60,32,2,1,3,90,107,15000,0,1200,1957
9,0,5,48,41,2,1,2,90,80,0,0,1200,1468


In [45]:
df_with_missing_target = original_df[invalid_value_index]
df_with_missing_target

Unnamed: 0,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,9,1,60,30,2,1,3,73,0,0,0,800,846
1,17,1,60,58,3,1,1,48,0,0,0,1000,1658
2,10,2,36,46,2,2,3,90,0,3000,0,2000,2985
3,0,1,60,24,1,1,1,63,0,2500,0,900,1325
4,0,1,36,26,1,1,1,46,0,0,0,310,910


Удаляем из тренировочного df строки с пропусками

In [46]:
clean_train_df = train_df[clean_train_df_index(train_df, missing_values, target)]

In [47]:
print("Размер датасета для обучения {}".format(clean_train_df.shape))
print("Размер датасета для заполнения {}".format(df_with_missing_target.shape))

Размер датасета для обучения (4035, 13)
Размер датасета для заполнения (5, 13)


Разбили на 2 df


# Нормирование вещественных парметров

Сперва склеиваем train and df_with_missing_values

In [48]:
clean_train_df['Source'] = 'Train'

In [50]:
df_with_missing_target['Source'] = 'Calc'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [51]:
all_df = pd.concat([clean_train_df, df_with_missing_target])

In [52]:
all_df.head()

Unnamed: 0,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price,Source
5,1,2,60,36,2,1,1,75,214,3500,0,650,1645,Train
6,29,2,60,44,2,1,1,75,125,10000,0,1600,1800,Train
7,9,5,12,27,1,1,1,35,80,0,0,200,1093,Train
8,0,2,60,32,2,1,3,90,107,15000,0,1200,1957,Train
9,0,5,48,41,2,1,2,90,80,0,0,1200,1468,Train


In [53]:
all_df.shape

(4040, 14)

In [54]:
def extract_numeric_features(df, numeric_features_names):
    return df[numeric_features_names]

In [55]:
numeric_df = extract_numeric_features(all_df, numeric_features_names)
numeric_df.head()

Unnamed: 0,Seniority,Time,Age,Records,Expenses,Income,Assets,Debt,Amount,Price
5,1,60,36,1,75,214,3500,0,650,1645
6,29,60,44,1,75,125,10000,0,1600,1800
7,9,12,27,1,35,80,0,0,200,1093
8,0,60,32,1,90,107,15000,0,1200,1957
9,0,48,41,1,90,80,0,0,1200,1468


In [56]:
with warnings.catch_warnings(record=True):
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaled_numeric_df = pd.DataFrame(scaler.fit_transform(numeric_df))
    scaled_numeric_df.index = numeric_df.index
    scaled_numeric_df.columns = numeric_df.columns
scaled_numeric_df.head()

Unnamed: 0,Seniority,Time,Age,Records,Expenses,Income,Assets,Debt,Amount,Price
5,-0.877476,0.91165,-0.085986,-0.443089,0.991101,0.897027,-0.16713,-0.313101,-0.828439,0.322698
6,2.558079,0.91165,0.639998,-0.443089,0.991101,-0.204621,0.528944,-0.313101,1.241409,0.578788
7,0.104111,-2.389327,-0.902718,-0.443089,-1.076679,-0.761634,-0.54194,-0.313101,-1.808894,-0.589312
8,-1.000174,0.91165,-0.448978,-0.443089,1.766519,-0.427426,1.064386,-0.313101,0.369894,0.838182
9,-1.000174,0.086406,0.367754,-0.443089,1.766519,-0.761634,-0.54194,-0.313101,0.369894,0.03026



# Добавление категориальных призаков

In [58]:
categorical_df = all_df[categorial_feature_names]
categorical_df.head()

Unnamed: 0,Home,Marital,Job
5,2,2,1
6,2,2,1
7,5,1,1
8,2,2,3
9,5,2,2


In [59]:
enc = OneHotEncoder(sparse=False)
encoded_array = enc.fit_transform(categorical_df)
encoded_df = pd.DataFrame(encoded_array)
encoded_df.columns = enc.get_feature_names()
encoded_df.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,x0_1.0,x0_2.0,x0_3.0,x0_4.0,x0_5.0,x0_6.0,x1_0.0,x1_1.0,x1_2.0,x1_3.0,x1_4.0,x1_5.0,x2_1.0,x2_2.0,x2_3.0,x2_4.0
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [60]:
all_prepared_df = pd.merge(scaled_numeric_df, encoded_df, how='left', left_index=True, right_index=True)
all_prepared_df.head()

Unnamed: 0,Seniority,Time,Age,Records,Expenses,Income,Assets,Debt,Amount,Price,...,x1_0.0,x1_1.0,x1_2.0,x1_3.0,x1_4.0,x1_5.0,x2_1.0,x2_2.0,x2_3.0,x2_4.0
5,-0.877476,0.91165,-0.085986,-0.443089,0.991101,0.897027,-0.16713,-0.313101,-0.828439,0.322698,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,2.558079,0.91165,0.639998,-0.443089,0.991101,-0.204621,0.528944,-0.313101,1.241409,0.578788,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.104111,-2.389327,-0.902718,-0.443089,-1.076679,-0.761634,-0.54194,-0.313101,-1.808894,-0.589312,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,-1.000174,0.91165,-0.448978,-0.443089,1.766519,-0.427426,1.064386,-0.313101,0.369894,0.838182,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,-1.000174,0.086406,0.367754,-0.443089,1.766519,-0.761634,-0.54194,-0.313101,0.369894,0.03026,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [61]:
final_df = pd.merge(all_df['Source'].to_frame(), all_prepared_df, how='left', left_index=True, right_index=True)
final_df.head()

Unnamed: 0,Source,Seniority,Time,Age,Records,Expenses,Income,Assets,Debt,Amount,...,x1_0.0,x1_1.0,x1_2.0,x1_3.0,x1_4.0,x1_5.0,x2_1.0,x2_2.0,x2_3.0,x2_4.0
5,Train,-0.877476,0.91165,-0.085986,-0.443089,0.991101,0.897027,-0.16713,-0.313101,-0.828439,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,Train,2.558079,0.91165,0.639998,-0.443089,0.991101,-0.204621,0.528944,-0.313101,1.241409,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,Train,0.104111,-2.389327,-0.902718,-0.443089,-1.076679,-0.761634,-0.54194,-0.313101,-1.808894,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,Train,-1.000174,0.91165,-0.448978,-0.443089,1.766519,-0.427426,1.064386,-0.313101,0.369894,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,Train,-1.000174,0.086406,0.367754,-0.443089,1.766519,-0.761634,-0.54194,-0.313101,0.369894,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Разделяем на 2 датасета

In [62]:
train_indexes = get_valid_values_index(final_df['Source'], ['Calc','Xzzzzz'])

In [63]:
final_train_df = final_df[train_indexes]
final_predict_df = final_df[np.invert(train_indexes)]
del final_train_df['Source']
del final_predict_df['Source']

In [511]:
print("Размер датасета для обучения {}".format(final_train_df.shape))
print("Размер датасета для заполнения {}".format(final_predict_df.shape))

Размер датасета для обучения (4033, 27)
Размер датасета для заполнения (5, 27)


In [510]:
final_train_df.head()

Unnamed: 0,Seniority,Time,Age,Records,Expenses,Income,Assets,Debt,Amount,Price,...,x1_0.0,x1_1.0,x1_2.0,x1_3.0,x1_4.0,x1_5.0,x2_1.0,x2_2.0,x2_3.0,x2_4.0
2,0.227082,-0.738356,0.82202,2.256213,1.766409,0.722478,-0.220904,-0.313186,2.112327,2.536195,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.999779,0.912063,-1.174943,-0.443221,0.370839,0.49933,-0.274439,-0.313186,-0.283822,-0.206142,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.999779,-0.738356,-0.993401,-0.443221,-0.507854,-0.430455,-0.542113,-0.313186,-1.569029,-0.891727,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,-0.877093,0.912063,-0.085691,-0.443221,0.991092,0.896038,-0.16737,-0.313186,-0.828401,0.322501,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,2.558117,0.912063,0.640478,-0.443221,0.991092,-0.207307,0.528583,-0.313186,1.241001,0.578562,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
