# Скачивание и подготовка данных

In [None]:
!pip install pyreadstat
!pip install catboost

In [None]:
import pandas as pd
import re
import pyreadstat
import numpy as np

In [None]:
# Read the SPSS file; apply_value_formats=True asks pyreadstat to replace coded
# values with their value labels.
df, meta = pyreadstat.read_sav('data.sav', apply_value_formats=True)

# Column name -> human-readable question label.
map_columns = meta.column_names_to_labels

# Columns suspected of leaking the target.
# NOTE(review): this list is not referenced by the visible cells; the columns
# actually dropped before training are listed separately in `cols`.
maybe_leak = [
    'xj10', 'xj10.2', 'xj15',
    'xj238', 'xj39', 'xj40',
    'xj57', 'xj89', 'xj13.2',
]

# Whitelist of survey columns allowed into the modelling frames.
# Fix: 'xj60.5a' and 'xj61' were missing the separating comma, so Python
# concatenated them into the non-existent name 'xj60.5axj61' and both columns
# were silently excluded.
# NOTE(review): these two columns now reach the feature set — confirm that
# neither of them leaks the target 'xj60'.
needed = [
    'idind', 'psu', 'site', 'status', 'popul',
    'x_int_y', 'x_born_m', 'x_educ', 'x_diplom',
    'x_age', 'region', 'xh5', 'xh6', 'xh7.1',
    'xh7.2', 'xh8a', 'xh8b', 'xi1', 'xi2',
    'xi3', 'xi7', 'xi8', 'xi4', 'xi3.1',
    'xj1', 'xj1.1.1', 'xj1.1.2', 'xj1.1.3',
    'xj1.1.4', 'xj2cod08', 'xj4.1', 'xj5a',
    'xj5b', 'xj6', 'xj6.0', 'xj6.1a', 'xj6.1b',
    'xj6.2', 'xj7', 'xj7.1', 'x_adult', 'xj8',
    'xj8.1', 'xj8.2', 'xj8.3', 'xj9', 'xj10', 'xj10.3',
    'xj10.2', 'xj11', 'xj11.1', 'xj11.2', 'xj13',
    'xj13.2', 'xj14', 'xj15', 'xj16', 'xj18.2',
    'xj19', 'xj21a', 'xj21b', 'xj21.3', 'xj24',
    'xj25', 'xj26', 'xj29', 'xj29c.1', 'xj29c.2',
    'xj22', 'xj31', 'xj29.1', 'xj29.2.1', 'xj29.2.2',
    'xj29.2.3', 'xj32', 'xj41.1', 'xj41.2', 'xj49',
    'xj50', 'xj51', 'xj52', 'xj56', 'xj56.11c08',
    'xj56.12c08', 'xj56.13c08', 'xj57', 'xj58',
    'xj58.1', 'xj59', 'xj59.1', 'xj60', 'xj60.4a1',
    'xj60.5a', 'xj61', 'xj62', 'xj63', 'xj64',
    'xj65', 'xj66', 'xj66.1', 'xj69.9c', 'xj70.1', 'xj70',
    'xj70.2', 'xk3.1', 'xj71', 'xj72.1a', 'xj72.1b',
    'xj721dac08', 'xj72.1c', 'xj72.2a', 'xj72.2b',
    'xj722dac08', 'xj72.2c', 'xj72.5a',
    'xj72.5b', 'xj725dac08', 'xj72.5c',
    'xj72.5e', 'xj72.5h', 'xj72.5j',
    'xj72.18a', 'xj60.1', 'xj77',
    'xj260', 'xj262', 'xj322', 'xj324', 'xj72.171',
    'xj72.172', 'xj72.173', 'xj721635', 'xj721636',
    'xj73', 'xj81', 'xj81.2', 'xj81.1', 'xj83.1', 'xl20',
]

In [None]:
# A column is considered numeric when its question label contains the word
# "сколько" ("how many" / "how much"). `label or ''` guards against columns
# that carry no label at all.
map_numeric = {
    name: label
    for name, label in map_columns.items()
    if re.search(r'\bсколько\b', (label or '').lower())
}

# Label-detected numeric columns plus manually listed ones. dict.fromkeys
# removes repeated names while keeping first-seen order, so a column that is
# both detected and hand-listed cannot be selected twice.
columns_numeric = list(dict.fromkeys(
    list(map_numeric)
    + [
        'xj13.2', 'x_int_y', 'x_age',
        'site', 'xh8b', 'popul',
        'xh8a', 'idind', 'xh7.1',
    ]
    + ['xh6', 'xi8', 'xj69.9c', 'xj72.5e', 'xi7', 'xj5a']
))

# Build the whitelist lookup once instead of once per candidate column.
needed_lookup = set(needed)
df_numeric = df[[name for name in columns_numeric if name in needed_lookup]]

In [None]:
# Every labelled column that was not detected as numeric by its label.
# Iterating map_columns (a dict) instead of a set difference keeps a stable,
# run-independent order.
map_categorical = {
    name: label
    for name, label in map_columns.items()
    if name not in map_numeric
}

# Categorical columns = non-numeric labelled columns minus the hand-listed
# numeric ones. Kept as a set, as before.
columns_categorical = set(map_categorical) - set(columns_numeric)

# Select the whitelisted categorical columns in map_columns order. Iterating the
# set directly would give a column order that depends on string hashing and can
# change between kernel sessions, making the feature order non-reproducible.
wanted_categorical = columns_categorical & set(needed)
df_categorical = df[[name for name in map_columns if name in wanted_categorical]]

In [None]:
# Keep only respondents for whom the target column 'xj60' is present; the same
# row mask is applied to both frames so they stay aligned.
has_target = df_numeric['xj60'].notna()
df_ok_num = df_numeric[has_target].reset_index(drop=True)
df_ok_cat = df_categorical[has_target].reset_index(drop=True)
print(df_ok_num.shape)

# Names of the categorical feature columns, used later when building Pools.
cat_features = df_ok_cat.columns

# Cast everything to object first, then fill the gaps with sentinels:
# -1 for numeric answers, 'Unknown' for categorical ones.
# NOTE(review): the object cast is presumably needed so fillna can insert
# values that are not existing categories — confirm against the loaded dtypes.
df_ok_num = df_ok_num.astype(object).fillna(-1)
df_ok_cat = df_ok_cat.astype(object).fillna('Unknown')

In [None]:
def replace_non_numerical_answers_with_zero(column):
  """Return a float copy of `column` with every string cell replaced by 0.

  Parameters:
    column: pandas Series holding numbers and, possibly, textual answers.

  Returns:
    A new float Series with the same index and name as `column`. The input
    is left unmodified.

  The previous implementation assigned `column[i] = 0` using the enumeration
  counter as a *label*, which only worked for a 0..n-1 index (otherwise it
  wrote to the wrong rows or appended new ones) and mutated the caller's data.
  """
  cleaned = [0 if isinstance(cell, str) else cell for cell in column]
  return pd.Series(cleaned, index=column.index, name=column.name, dtype=float)

# Column by column (the default axis), turn textual answers into 0 and cast to float.
df_ok_num = df_ok_num.apply(replace_non_numerical_answers_with_zero)

In [None]:
def make_str(col):
  """Return a copy of the Series `col` with every element converted via str().

  Parameters:
    col: pandas Series (a row or a column, depending on how it is applied).

  Returns:
    A new Series with the same index whose values are `str(value)`.

  The previous implementation indexed with `col[i]` for integer `i`; on a
  Series with string labels (which is what a row-wise apply passes in) that
  relies on a deprecated positional fallback, and it also modified the input
  in place.
  """
  return col.map(str)

# Stringify the answers of these four columns, one row at a time.
str_columns = ['xj62', 'xj64', 'xj63', 'xk3.1']
df_ok_cat[str_columns] = df_ok_cat[str_columns].apply(make_str, axis=1)

In [None]:
# Place the numeric and categorical frames side by side (rows align on the
# index that was reset for both frames).
data = pd.concat(objs=[df_ok_num, df_ok_cat], axis='columns')

# Обучение модели

In [None]:
# Regression target.
target = data['xj60']

# Columns removed from the feature matrix: the target itself plus five others.
# NOTE(review): the other five also appear in `maybe_leak`, so they are
# presumably dropped as suspected target leaks — confirm.
cols = [
    'xj13.2',
    'xj15',
    'xj10.2',
    'xj57',
    'xj10',
    'xj60',
]
features = data.drop(columns=cols)

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool

# 60 % train, then the remaining 40 % is halved into validation and test.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.4, random_state=12345
)
# Fix: the second split had no random_state, so the validation/test partition
# (and every metric computed on it) changed on each run.
features_valid, features_test, target_valid, target_test = train_test_split(
    features_test, target_test, test_size=0.5, random_state=12345
)

# Only pass categorical names that are still present in the feature matrix:
# `cat_features` was collected before some columns were dropped from `features`.
pool_cat_features = [name for name in cat_features if name in features_train.columns]

train_pool = Pool(features_train, target_train, cat_features=pool_cat_features)
val_pool = Pool(features_valid, target_valid, cat_features=pool_cat_features)
test_pool = Pool(features_test, target_test, cat_features=pool_cat_features)

In [None]:
# Shallow gradient-boosted trees; progress is logged every 500 iterations and
# the validation pool is monitored during training.
model = CatBoostRegressor(
    iterations=5000,
    learning_rate=4*0.068786,
    depth=2,
    verbose=500,
)
model.fit(train_pool, eval_set=val_pool)

0:	learn: 37969.2145248	test: 21303.2717287	best: 21303.2717287 (0)	total: 43.1ms	remaining: 3m 35s
500:	learn: 15626.0939375	test: 17159.5140994	best: 17138.0675859 (335)	total: 15.4s	remaining: 2m 18s
1000:	learn: 13185.8141283	test: 16976.1775166	best: 16851.0060628 (761)	total: 30.9s	remaining: 2m 3s
1500:	learn: 12256.9298750	test: 16907.7004933	best: 16851.0060628 (761)	total: 46.4s	remaining: 1m 48s
2000:	learn: 11570.8604807	test: 16897.5314774	best: 16848.3525532 (1557)	total: 1m 1s	remaining: 1m 32s
2500:	learn: 11022.1572626	test: 16925.8607155	best: 16848.3525532 (1557)	total: 1m 17s	remaining: 1m 17s
3000:	learn: 10581.9522820	test: 16933.0554995	best: 16848.3525532 (1557)	total: 1m 32s	remaining: 1m 1s
3500:	learn: 10164.7481724	test: 16912.3967579	best: 16848.3525532 (1557)	total: 1m 48s	remaining: 46.3s
4000:	learn: 9852.3606211	test: 16938.6406894	best: 16848.3525532 (1557)	total: 2m 3s	remaining: 30.8s
4500:	learn: 9513.0276103	test: 16972.1948762	best: 16848.3525532 

<catboost.core.CatBoostRegressor at 0x7f1488ac0fd0>

# Интерпретация результатов

In [None]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
# Training-set fit. These numbers are optimistic: the model has seen this data.
train_labels = train_pool.get_label()
train_pred = model.predict(train_pool)
print(mse(train_labels, train_pred)**0.5)

# Constant baseline: predict the overall mean of the target for every row.
# The mean is computed once (it used to be recomputed for each row).
baseline_pred = np.full(len(data), data['xj60'].mean())
print(mse(data['xj60'], baseline_pred)**0.5)
print(r2_score(train_labels, train_pred))
print(r2_score(data['xj60'], baseline_pred))

# Held-out test set, which takes no part in fitting: the figures to quote
# when judging how well the model generalises.
test_labels = test_pool.get_label()
test_pred = model.predict(test_pool)
print('test RMSE:', mse(test_labels, test_pred)**0.5)
print('test R2:', r2_score(test_labels, test_pred))

13384.582230086866
40617.964993263944
0.8794131183174823
0.0


In [None]:

# Feature importances as a table, shown for the ten highest-ranked features.
feat_imp = model.get_feature_importance(prettified=True)

feat_imp.iloc[:10]

Unnamed: 0,Feature Id,Importances
0,xj6.0,34.327782
1,xi4,10.117476
2,xj72.173,6.847765
3,xj31,6.494874
4,psu,5.513558
5,xj13,5.342631
6,xj6,3.927449
7,xj21b,3.745239
8,xj1.1.1,2.46584
9,region,1.355935


In [None]:
# Print the question text behind each of the ten most important features.
# The names come straight from the importance table instead of a hand-copied
# list, so the printout cannot go stale when the model is retrained.
for x in feat_imp['Feature Id'].head(10):
  print(x, map_columns[x])

xj6.0 Сколько у Вас подчиненных? Пожалуйста, посчитайте всех Ваших подчиненных, а не только тех, кто находится в Вашем непосредственном подчинении
xi4 Кем Вы себя считаете по национальности?
xj72.173 А сколько из них моложе 18 лет?
xj31 Насколько Вас беспокоит то, что Вы можете потерять работу?
psu Номер региона (первичная единица отбора)
xj13 Сколько человек работает на Вашем предприятии? Если не знаете точно, скажите примерно
xj6 У Вас есть подчиненные на этой работе?
xj21b Сколько всего календарных дней продолжался или продолжается этот отпуск?
xj1.1.1 Насколько Вы удовлетворены или не удовлетворены Вашей работой в целом?
region Регион
