<a href="https://colab.research.google.com/github/AndreyKaBelka/MLHomeWork/blob/master/Collab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade spacy
!python -m spacy download ru_core_news_md

Collecting ru-core-news-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.2.0/ru_core_news_md-3.2.0-py3-none-any.whl (43.0 MB)
[K     |████████████████████████████████| 43.0 MB 2.1 MB/s 
[?25hCollecting pymorphy2>=0.9
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.9 MB/s 
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 10.1 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2, ru-core-news-md
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844 ru-core-news-md-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_md')


In [None]:
import html
import json
import logging
import re
from multiprocessing import Pool
from string import punctuation

import requests
from spacy import load
from spacy.lang.ru import RussianDefaults

# Endpoint used both for the vacancy search and (with "/<id>" appended) for single vacancies.
VACANCY_URL = 'https://api.hh.ru/vacancies'
# spaCy pipeline used for lemmatisation; loaded once at import time.
nlp = load('ru_core_news_md')
# Lemmas that are dropped as punctuation: a few dash variants plus string.punctuation.
ADDITIONAL_PUNC = ['—', ':-', '-', *punctuation]
# spaCy's default Russian stop-word collection; lemmas found here are dropped.
STOP_WORDS = RussianDefaults.stop_words


def split(a, n):
    """Lazily cut sequence ``a`` into ``n`` consecutive slices of near-equal size.

    The first ``len(a) % n`` slices receive one extra element, so slice
    lengths differ by at most one. The result is a generator of slices.
    """
    base, extra = divmod(len(a), n)

    def chunk(idx):
        # Every earlier chunk holds `base` items, and the first `extra`
        # chunks hold one more, hence the min(...) offsets.
        start = idx * base + min(idx, extra)
        stop = (idx + 1) * base + min(idx + 1, extra)
        return a[start:stop]

    return (chunk(idx) for idx in range(n))


def get_vacancies(page, timeout=10):
    """Fetch one page of the 'NAME:Java' vacancy search from the hh.ru API.

    Args:
        page: Value sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value stored under the ``"items"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts and connection failures.
        KeyError: If the response JSON has no ``"items"`` key.
    """
    params = {
        'text': 'NAME:Java',
        'page': page,
        'per_page': 100
    }
    # The context manager releases the connection even if parsing fails.
    with requests.get(VACANCY_URL, params=params, timeout=timeout) as response:
        # Fail loudly on an error status instead of hitting a confusing
        # KeyError on the error payload below.
        response.raise_for_status()
        return response.json()["items"]


def get_vac(vac_id, timeout=10):
    """Fetch the full JSON record of a single vacancy from the hh.ru API.

    Args:
        vac_id: Vacancy identifier appended to the vacancies endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts and connection failures.
    """
    # The context manager releases the connection even if parsing fails.
    with requests.get(f"{VACANCY_URL}/{vac_id}", timeout=timeout) as response:
        # Surface HTTP errors here rather than returning an error payload
        # that callers would mistake for a vacancy.
        response.raise_for_status()
        return response.json()


def get_vacs(max_page):
    """Collect the search results of pages ``0 .. max_page - 1`` into one flat list.

    Pages are requested one after another via ``get_vacancies`` and their
    items are concatenated in page order.
    """
    return [
        vacancy
        for page_number in range(max_page)
        for vacancy in get_vacancies(page_number)
    ]


def prepare_vacancy_description(description):
    """Turn an HTML vacancy description into a list of cleaned lemmas.

    Steps: strip markup, decode HTML entities, drop digits, collapse
    whitespace, lemmatise with the module-level ``nlp`` pipeline, discard
    stop words and punctuation, remove non-word characters from each lemma
    and keep only lemmas longer than one character.

    Args:
        description: Vacancy description as an HTML string. An empty or
            ``None`` description yields an empty list.

    Returns:
        A list of lemma strings.
    """
    if not description:
        return []

    # Replace tags with a space, not an empty string: otherwise words that
    # are separated only by markup ("<li>Java</li><li>Spring</li>") get
    # glued together into a single bogus token.
    text = re.sub(r"<[^>]*>", " ", description)
    # Decode entities (&quot;, &amp;, &#8212; ...) properly. This must happen
    # after tag stripping (so a decoded "<" is not taken for markup) and
    # before digit removal (so numeric entities stay intact).
    text = html.unescape(text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    lemmas = (token.lemma_ for token in nlp(text))
    cleaned = (
        re.sub(r"\W", "", lemma)
        for lemma in lemmas
        if lemma not in STOP_WORDS and lemma not in ADDITIONAL_PUNC
    )
    # Stripping non-word characters can leave empty or one-character
    # leftovers; those carry no information and are dropped.
    return [lemma for lemma in cleaned if len(lemma) > 1]


def get_prepared_all_vacancies_from_hh(vacancy_ids):
    """Download and tokenise the description of each vacancy in ``vacancy_ids``.

    Processing is best effort: a vacancy whose download or preparation
    raises is skipped, and the failure is logged as a warning rather than
    silently discarded, so missing data can be diagnosed.

    Args:
        vacancy_ids: Iterable of vacancy identifiers.

    Returns:
        A list with one token list per successfully processed vacancy, in
        input order.
    """
    logger = logging.getLogger(__name__)
    results = []
    for vac_id in vacancy_ids:
        try:
            description = get_vac(vac_id)['description']
            results.append(prepare_vacancy_description(description))
        except Exception as exc:
            # Deliberately broad: one bad vacancy must not abort the batch,
            # but the reason is recorded.
            logger.warning("Skipping vacancy %s: %r", vac_id, exc)
    return results


if __name__ == '__main__':
    # Number of search-result pages to download and number of worker
    # processes (also the number of id batches).
    PAGE_COUNT = 10
    WORKER_COUNT = 5

    vacs = get_vacs(PAGE_COUNT)
    vac_ids = [vac['id'] for vac in vacs]
    vac_ids_parts = list(split(vac_ids, WORKER_COUNT))

    # Do all the downloading/processing before touching the output file, so
    # a failed run does not leave behind a truncated, empty data.txt.
    with Pool(WORKER_COUNT) as pool:
        prepared_parts = pool.map(get_prepared_all_vacancies_from_hh, vac_ids_parts)

    # Explicit UTF-8: the tokens are Cyrillic and the platform default
    # encoding is not guaranteed to represent them.
    with open('data.txt', 'w', encoding='utf-8') as file:
        file.write(str(prepared_parts))
