# Модель для предсказания уровня зарплатных ожиданий сотрудников

## Импорты

In [None]:
!pip install catboost

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

## Исходные данные

In [3]:
raw_data = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/I_DIPLOMA/data/raw_data.xlsx'
)

In [4]:
raw_data.head(3)

Unnamed: 0.1,Unnamed: 0,entrydate,title,position,gender,city,age,salary,experience,last_job,updated,link
0,0,2020-04-08,"Резюме Менеджер по продажам мебели, дизайнер-к...",Менеджер по продажам,Женщина,Москва,31,70,8.0,ООО ИНТЕХ,2020-04-08,https://hh.ru/resume/840fd6300000a252640039ed1...
1,1,2020-04-08,Резюме Специалист по работе с клиентами,Клиентский менеджер,Мужчина,Липецк,40,90,15.0,МДК,2020-04-08,https://hh.ru/resume/66498670000716a4250039ed1...
2,2,2020-04-08,"Резюме Супервайзер торговых представителей, ст...",Супервайзер,Мужчина,Самара,34,50,7.0,Ютф-дистрибьютор,2020-04-08,https://hh.ru/resume/70e7817700035195b70039ed1...


In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3178 entries, 0 to 3177
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  3178 non-null   int64         
 1   entrydate   3178 non-null   datetime64[ns]
 2   title       3178 non-null   object        
 3   position    3178 non-null   object        
 4   gender      3178 non-null   object        
 5   city        3178 non-null   object        
 6   age         3178 non-null   int64         
 7   salary      3178 non-null   int64         
 8   experience  2522 non-null   float64       
 9   last_job    3178 non-null   object        
 10  updated     3178 non-null   datetime64[ns]
 11  link        3178 non-null   object        
dtypes: datetime64[ns](2), float64(1), int64(3), object(6)
memory usage: 298.1+ KB


In [6]:
raw_data.columns

Index(['Unnamed: 0', 'entrydate', 'title', 'position', 'gender', 'city', 'age',
       'salary', 'experience', 'last_job', 'updated', 'link'],
      dtype='object')

## Предобработка данных

In [7]:
def data_prep(df_input):
  """Обрабатывает исходные данные"""
  df_output = df_input.copy()
  df_output['experience'] = df_output['experience'].fillna(
                            df_output['experience'].mean())
  df_output['experience'] = df_output['experience'].astype(np.int)
  df_output.drop([
                  'Unnamed: 0',
                  'title',
                  'entrydate',
                  'last_job',
                  'updated',
                  'link'],
                 axis=1,
                 inplace=True)
  
  return df_output

In [8]:
# Посмотрим на подготовленные данные
data = data_prep(raw_data)
data.head(3)

Unnamed: 0,position,gender,city,age,salary,experience
0,Менеджер по продажам,Женщина,Москва,31,70,8
1,Клиентский менеджер,Мужчина,Липецк,40,90,15
2,Супервайзер,Мужчина,Самара,34,50,7


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3178 entries, 0 to 3177
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   position    3178 non-null   object
 1   gender      3178 non-null   object
 2   city        3178 non-null   object
 3   age         3178 non-null   int64 
 4   salary      3178 non-null   int64 
 5   experience  3178 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 149.1+ KB


## Построение модели

In [26]:
# Гиперпараметры
RANDOM_SEED = 42
VERSION = 11
VAL_SIZE = 0.2
N_FOLDS = 5
ITERATIONS = 2000
LR = 0.01

In [18]:
def train_model(data):
  """Обучает модель регрессии"""
  X = data.drop(['salary'], axis=1)
  y = data['salary'].values

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=VAL_SIZE, shuffle=True,
      random_state=RANDOM_SEED
      )
  
  # Указание на категориальные признаки
  cat_features_ids = np.where(
      X_train.apply(pd.Series.nunique
                    ) < 3000)[0].tolist()

  # Обучение модели
  model = CatBoostRegressor(
      iterations=ITERATIONS, 
      learning_rate=LR, 
      random_seed=RANDOM_SEED,
      eval_metric='MAPE', 
      custom_metric=['R2', 'MAE']
      )
  
  model.fit(
      X_train, y_train, 
      cat_features=cat_features_ids,
      eval_set=(X_test, y_test), 
      verbose_eval=100, 
      use_best_model=True, 
      plot=True
      )
  
  return model

In [27]:
# Сериализация модели для развертки на сервере сервиса
model = train_model(data)
with open('jsp_model.pkl', 'wb') as output:
  pickle.dump(model, output)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.3620724	test: 0.3679178	best: 0.3679178 (0)	total: 25.9ms	remaining: 51.7s
100:	learn: 0.2909723	test: 0.2815770	best: 0.2815770 (100)	total: 456ms	remaining: 8.57s
200:	learn: 0.2658114	test: 0.2479643	best: 0.2479643 (200)	total: 780ms	remaining: 6.98s
300:	learn: 0.2561725	test: 0.2367744	best: 0.2367744 (300)	total: 1.14s	remaining: 6.45s
400:	learn: 0.2514951	test: 0.2316983	best: 0.2316983 (400)	total: 1.46s	remaining: 5.83s
500:	learn: 0.2481073	test: 0.2293485	best: 0.2293485 (500)	total: 1.79s	remaining: 5.36s
600:	learn: 0.2450921	test: 0.2276287	best: 0.2276287 (600)	total: 2.19s	remaining: 5.1s
700:	learn: 0.2431111	test: 0.2265776	best: 0.2265766 (691)	total: 2.52s	remaining: 4.67s
800:	learn: 0.2410722	test: 0.2259926	best: 0.2259598 (797)	total: 2.86s	remaining: 4.28s
900:	learn: 0.2392566	test: 0.2255247	best: 0.2255129 (898)	total: 3.31s	remaining: 4.04s
1000:	learn: 0.2376841	test: 0.2250952	best: 0.2250952 (1000)	total: 3.64s	remaining: 3.63s
1100:	learn:

In [38]:
predict = model.predict(['Супервайзер',	
                         'Мужчина',	
                         'Воронеж',	
                         30,	10])
print(int(predict)*1000)

51000


In [None]:
!pip freeze > requirements.txt

In [None]:
!ls

catboost_info  drive  jsp_model.pkl  requirements.txt  sample_data
