# Модель для предсказания уровня зарплатных ожиданий сотрудников

## Импорты

In [None]:
!pip install catboost

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

## Исходные данные

In [3]:
raw_data = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/I_DIPLOMA/data/raw_data.xlsx'
)

In [4]:
raw_data.head(3)

Unnamed: 0.1,Unnamed: 0,entrydate,title,position,gender,city,age,salary,experience,last_job,updated,link
0,0,2020-04-08,"Резюме Менеджер по продажам мебели, дизайнер-к...",Менеджер по продажам,Женщина,Москва,31,70,8.0,ООО ИНТЕХ,2020-04-08,https://hh.ru/resume/840fd6300000a252640039ed1...
1,1,2020-04-08,Резюме Специалист по работе с клиентами,Клиентский менеджер,Мужчина,Липецк,40,90,15.0,МДК,2020-04-08,https://hh.ru/resume/66498670000716a4250039ed1...
2,2,2020-04-08,"Резюме Супервайзер торговых представителей, ст...",Супервайзер,Мужчина,Самара,34,50,7.0,Ютф-дистрибьютор,2020-04-08,https://hh.ru/resume/70e7817700035195b70039ed1...


In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3178 entries, 0 to 3177
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  3178 non-null   int64         
 1   entrydate   3178 non-null   datetime64[ns]
 2   title       3178 non-null   object        
 3   position    3178 non-null   object        
 4   gender      3178 non-null   object        
 5   city        3178 non-null   object        
 6   age         3178 non-null   int64         
 7   salary      3178 non-null   int64         
 8   experience  2522 non-null   float64       
 9   last_job    3178 non-null   object        
 10  updated     3178 non-null   datetime64[ns]
 11  link        3178 non-null   object        
dtypes: datetime64[ns](2), float64(1), int64(3), object(6)
memory usage: 298.1+ KB


In [6]:
raw_data.columns

Index(['Unnamed: 0', 'entrydate', 'title', 'position', 'gender', 'city', 'age',
       'salary', 'experience', 'last_job', 'updated', 'link'],
      dtype='object')

## Предобработка данных

In [19]:
def data_prep(df_input):
  """Обрабатывает исходные данные"""
  df_output = df_input.copy()
  df_output['experience'] = df_output['experience'].fillna(
                            df_output['experience'].mean())
  df_output['experience'] = df_output['experience'].astype(np.int)
  df_output.drop([
                  'Unnamed: 0',
                  'title',
                  'entrydate',
                  'last_job',
                  'updated',
                  'link'],
                 axis=1,
                 inplace=True)
  
  return df_output

In [20]:
# Посмотрим на подготовленные данные
data = data_prep(raw_data)
data.head(3)

Unnamed: 0,position,gender,city,age,salary,experience
0,Менеджер по продажам,Женщина,Москва,31,70,8
1,Клиентский менеджер,Мужчина,Липецк,40,90,15
2,Супервайзер,Мужчина,Самара,34,50,7


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3178 entries, 0 to 3177
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   position    3178 non-null   object
 1   gender      3178 non-null   object
 2   city        3178 non-null   object
 3   age         3178 non-null   int64 
 4   salary      3178 non-null   int64 
 5   experience  3178 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 149.1+ KB


## Построение модели

In [22]:
# Гиперпараметры
RANDOM_SEED = 42
VERSION = 11
VAL_SIZE = 0.33
N_FOLDS = 5
ITERATIONS = 2000
LR = 0.05

In [23]:
def train_model(data):
  """Обучает модель регрессии"""
  X = data.drop(['salary'], axis=1)
  y = data['salary'].values

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=VAL_SIZE, shuffle=True,
      random_state=RANDOM_SEED
      )
  
  # Указание на категориальные признаки
  cat_features_ids = np.where(
      X_train.apply(pd.Series.nunique
                    ) < 3000)[0].tolist()

  # Обучение модели
  model = CatBoostRegressor(
      iterations=ITERATIONS, 
      learning_rate=LR, 
      random_seed=RANDOM_SEED,
      eval_metric='MAPE', 
      custom_metric=['R2', 'MAE']
      )
  
  model.fit(
      X_train, y_train, 
      cat_features=cat_features_ids,
      eval_set=(X_test, y_test), 
      verbose_eval=100, 
      use_best_model=True, 
      plot=True
      )
  
  return model

In [24]:
# Сериализация модели для развертки на сервере сервиса
model = train_model(data)
with open('jsp_model.pkl', 'wb') as output:
  pickle.dump(model, output)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.3573194	test: 0.3609453	best: 0.3609453 (0)	total: 62.8ms	remaining: 2m 5s
100:	learn: 0.2493372	test: 0.2473624	best: 0.2472526 (98)	total: 444ms	remaining: 8.35s
200:	learn: 0.2370218	test: 0.2434496	best: 0.2433502 (194)	total: 804ms	remaining: 7.19s
300:	learn: 0.2290144	test: 0.2427684	best: 0.2421850 (261)	total: 1.14s	remaining: 6.46s
400:	learn: 0.2205756	test: 0.2433523	best: 0.2421850 (261)	total: 1.45s	remaining: 5.8s
500:	learn: 0.2138758	test: 0.2432434	best: 0.2421850 (261)	total: 1.77s	remaining: 5.31s
600:	learn: 0.2078040	test: 0.2434433	best: 0.2421850 (261)	total: 2.22s	remaining: 5.17s
700:	learn: 0.2019734	test: 0.2435249	best: 0.2421850 (261)	total: 2.54s	remaining: 4.7s
800:	learn: 0.1968290	test: 0.2431425	best: 0.2421850 (261)	total: 2.85s	remaining: 4.26s
900:	learn: 0.1911533	test: 0.2433707	best: 0.2421850 (261)	total: 3.27s	remaining: 3.99s
1000:	learn: 0.1856016	test: 0.2433974	best: 0.2421850 (261)	total: 3.59s	remaining: 3.58s
1100:	learn: 0.

In [27]:
!pip freeze > requirements.txt

In [28]:
!ls

catboost_info  drive  jsp_model.pkl  requirements.txt  sample_data
