In [185]:
import requests
import numpy as np
import pandas as pd

from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer

In [198]:
# Download the training vacancies from the hh.ru API.
#
# For every search phrase in `jobs` the first result page (page 0, up to 100
# items, only vacancies with a salary) is requested; each vacancy is then
# fetched individually to get its full description. Only vacancies paid in
# RUR are kept. The salary is the midpoint of the range, or the single bound
# when the other one is missing.
jobs = ['Data Scientist', 'Data Analyst']
vacancy_columns = ['employment', 'description', 'salary']
url = 'https://api.hh.ru/vacancies'

ids = []          # vacancy ids in first-seen order
seen_ids = set()  # constant-time duplicate check across the search phrases
records = []      # one dict per kept vacancy; the frame is built once at the end

for job in jobs:
    params = {'text' : job,
              'only_with_salary' : True,
              'page' : 0,
              'per_page' : 100}

    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()  # fail loudly instead of a confusing KeyError below
    st = response.json()

    for item in st['items']:
        vacancy_id = item['id']
        # The same vacancy can match several search phrases; fetch it only once.
        if vacancy_id in seen_ids:
            continue
        seen_ids.add(vacancy_id)
        ids.append(vacancy_id)

        response = requests.get(url + '/' + vacancy_id,
                                params={'host': 'hh.ru'},
                                timeout=30)
        response.raise_for_status()
        vacancy = response.json()

        # Skip vacancies without salary data or paid in another currency.
        salary_info = vacancy.get('salary') or {}
        if salary_info.get('currency') != 'RUR':
            continue
        salary_from = salary_info.get('from')
        salary_to = salary_info.get('to')
        if salary_from is None and salary_to is None:
            continue

        if salary_from is None:
            salary = int(salary_to)
        elif salary_to is None:
            salary = int(salary_from)
        else:
            salary = (int(salary_from) + int(salary_to)) // 2

        records.append({'employment': vacancy['employment']['id'],
                        'description': vacancy['description'],
                        'salary': salary})

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row copies it on every iteration.
vacancies = pd.DataFrame(records, columns=vacancy_columns)

In [205]:
# Download the vacancies whose salary we want to predict.
# The salary is unknown for these, so it is stored as a real missing value
# (np.nan) rather than the string 'NaN'.
target_ids = ['38696758', '37080920']
target_records = []

for target_id in target_ids:
    response = requests.get("https://api.hh.ru/vacancies/" + target_id,
                            params={'host': 'hh.ru'},
                            timeout=30)
    response.raise_for_status()  # fail loudly instead of a confusing KeyError below
    vacancy = response.json()
    target_records.append({'employment': vacancy['employment']['id'],
                           'description': vacancy['description'],
                           'salary': np.nan})

# Build the frame once; DataFrame.append was removed in pandas 2.0.
target_vacancies = pd.DataFrame(target_records,
                                columns=['employment', 'description', 'salary'])

In [207]:
def clean_description(descriptions):
    """Normalise a Series of vacancy descriptions for vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. replace simple HTML tags (``<p>``, ``</li>``, ...) with a space;
      3. replace every character that is not a Cyrillic or Latin letter or a
         digit with a space.

    Parameters
    ----------
    descriptions : pandas.Series
        Raw description strings.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned text; the input is not modified.
    """
    return (descriptions
            .str.lower()
            # Raw strings: '\w' in a plain literal is an invalid escape sequence.
            .str.replace(r'<[/\w]+>', ' ', regex=True)
            .str.replace(r'[^ЁёА-яa-zA-Z0-9]', ' ', regex=True))


# The same cleaning must be applied to the training and the target text so
# that both are tokenised identically.
vacancies['description'] = clean_description(vacancies['description'])
target_vacancies['description'] = clean_description(target_vacancies['description'])

In [208]:
# Show the target vacancies frame as the cell's output to inspect the
# cleaned descriptions.
target_vacancies

Unnamed: 0,employment,description,salary
0,full,мы занимается разработкой и внедрением програ...,
1,full,команда лаборатории интерактивной визуализаци...,


In [217]:
# TF-IDF text features. The vocabulary is learned from the training
# descriptions only (terms seen in fewer than 5 documents are dropped) and
# then reused unchanged for the target descriptions.
train_texts = vacancies['description']
target_texts = target_vacancies['description']

vectorizer = TfidfVectorizer(min_df=5)
X_train_vec = vectorizer.fit_transform(train_texts)
X_test_vec = vectorizer.transform(target_texts)

In [218]:
# Encode the categorical `employment` column. The encoder is fitted on the
# training rows and reused for the target rows so both share the same columns.
categorical_columns = ['employment']
train_categ_records = vacancies[categorical_columns].to_dict('records')
test_categ_records = target_vacancies[categorical_columns].to_dict('records')

enc = DictVectorizer()
X_train_categ = enc.fit_transform(train_categ_records)
X_test_categ = enc.transform(test_categ_records)

In [219]:
# Join the text features and the categorical features column-wise, keeping
# the same block order for the training and the target matrices.
train_blocks = [X_train_vec, X_train_categ]
test_blocks = [X_test_vec, X_test_categ]

X_for_train = hstack(train_blocks)
X_for_test = hstack(test_blocks)

In [220]:
# Ridge regression settings, named so they are easy to find and tune.
RIDGE_ALPHA = 1
RIDGE_RANDOM_STATE = 241

ridge = Ridge(alpha=RIDGE_ALPHA, random_state=RIDGE_RANDOM_STATE)

In [221]:
# Regression target as a plain float array. The explicit cast keeps the target
# numeric even if the salary column happens to be stored with object dtype.
y_for_train = vacancies['salary'].astype(float).to_numpy()

In [222]:
# Fit the ridge model on the combined feature matrix. The call is the cell's
# last expression, so the fitted estimator's repr is shown as the output.
ridge.fit(X_for_train, y_for_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=241, solver='auto', tol=0.001)

In [223]:
# Predicted salaries for the target vacancies, one value per row of X_for_test.
ridge.predict(X_for_test)

array([193585.63505216, 178232.1434522 ])