# Пример использования библиотеки для извлечения навыков из вакансий HH.ru

Ссылка на репозиторий с моделями:
https://github.com/explosion/spacy-models/releases

In [47]:
# Download the spaCy models with the commands below
# (uncomment a line to run it from the notebook).
#!python -m spacy download en_core_web_lg
#!python -m spacy download ru_core_news_lg

In [48]:
# Импортируем необходимые библиотеки
import spacy
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

In [49]:
# Load the "ru_core_news_lg" spaCy pipeline and create the skill-extractor
# instance from it, the skill database and spaCy's PhraseMatcher class.
# `skill_extractor` is the object the vacancy-processing cell calls
# `.annotate()` on.
nlp = spacy.load("ru_core_news_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [50]:
import os
import json
from bs4 import BeautifulSoup

# Directory the vacancy JSON files are read from (one file per vacancy).
VACANCIES_DIR = "./crawler/hh/docs/vacancies/"

# Vacancies that produced at least one full skill match. Each entry is a dict
# with the keys 'id', 'name', 'description' (plain text) and 'annotations'.
vacancies = []
for fl in os.listdir(VACANCIES_DIR):
    # Skip a .gitignore entry; it is not a vacancy file.
    if fl == ".gitignore":
        continue
    # The context manager closes the file even if reading or parsing fails.
    with open(os.path.join(VACANCIES_DIR, fl), encoding="utf8") as f:
        json_obj = json.load(f)
    # Skip payloads that carry an 'errors' key (presumably an API error
    # response rather than vacancy data).
    if 'errors' in json_obj:
        continue
    # Strip the HTML markup once and reuse the plain text both as annotation
    # input and in the stored record.
    description = BeautifulSoup(json_obj['description'], "lxml").text
    job_description = json_obj['name'] + "\n" + description
    try:
        annotations = skill_extractor.annotate(job_description)
        # Keep only vacancies where the extractor found full matches.
        if not annotations['results']['full_matches']:
            continue
        print(json_obj['id'])
        vacancies.append({
            'id': json_obj['id'],
            'name': json_obj['name'],
            'description': description,
            'annotations': annotations,
        })
    except Exception:
        # Best effort: a vacancy that fails to annotate is reported and
        # skipped. Catching Exception instead of using a bare `except` lets
        # KeyboardInterrupt / SystemExit still stop the loop.
        print("An exception occurred")

90297395
90796785
90297974
90884742
90731182
90521796
90408241
90835386
89734190
An exception occurred
89084791
88605008
90299628
91000845
90458344
90753395
90056664
90951619
90321020
88708728
90266519
90057113
90986935
An exception occurred
88139805
An exception occurred
90139512
88847420
90797968
90145734
89261258
90080753
90940753
88086414
90393771
90679435
90860954
88117851
79470559
89775489
90563313
78667319
90580169
91000216
81857244
90287724
An exception occurred
82427066
82095780
89782925
90299656
90521949
90186350
90608955
90298533
90907970
90534181
90690698
90053027
An exception occurred
90384804
89803196
90237992
90457902
87825393
79123584
90845124
86312593
89716130
77462544
91000510
90846101
91001326
90922540
84375216
90200345
90390961
90432207
86842401
90489223
90951921
An exception occurred
69181729
90690555
91000856
90288937
90547943
90147846
An exception occurred
90846104
90521782
84630082
90520888
90944944
88572943
90058988
88582493
90215075
An exception occurred
89351

In [51]:
from mapping.replacements import *

# One sorted list of normalized technology names per vacancy, in the same
# order as `vacancies`.
technologies_data = []
for vacancy in vacancies:
    results = vacancy['annotations']['results']
    technologies = set()
    # Collect the lower-cased matched text from both result lists.
    for result_key in ('full_matches', 'ngram_scored'):
        for match in results[result_key]:
            technology = match['doc_node_value'].lower()
            # Test the individual name against REMOVE. Testing the whole
            # `technologies` set (as the code did before) never matches, so
            # nothing was ever filtered out.
            if technology not in REMOVE:
                technologies.add(technology)
    # Collapse known aliases: every value listed under a REPLACEMENTS key is
    # swapped for that key.
    for key, values in REPLACEMENTS.items():
        for value in values:
            if value in technologies:
                technologies.remove(value)
                technologies.add(key)
    # Adding the processed list of technologies to the list
    technologies_data.append(sorted(technologies))

In [52]:
from datasets import Dataset, DatasetDict
import pyarrow as pa

# Wrap the per-vacancy technology lists in a PyArrow array.
tech_array = pa.array(technologies_data)

# Assemble the table columns: the vacancy fields first, each pulled out of
# `vacancies` into its own array, then the technologies column.
columns = {
    field: pa.array([item[field] for item in vacancies])
    for field in ('id', 'name', 'description')
}
columns['technologies'] = tech_array
table = pa.Table.from_pydict(columns)

# Publish the table as the single 'train' split of the Hub dataset.
train_split = Dataset(table)
dataset_dict = DatasetDict({'train': train_split})
dataset_dict.push_to_hub('evilfreelancer/headhunter')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/evilfreelancer/headhunter/commit/4a226679e53e993e87c2bc87531eba3ba38b28df', commit_message='Upload dataset', commit_description='', oid='4a226679e53e993e87c2bc87531eba3ba38b28df', pr_url=None, pr_revision=None, pr_num=None)