In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import spacy
import json
import scipy

# Notebook display settings: allow up to 500 rows per rendered frame and never
# truncate cell contents, so long text columns can be read in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

In [2]:
# Open the pipe-delimited vacancy dump as a chunked reader (10,000 rows per
# chunk) and keep only the first chunk as the working sample. The reader stays
# bound to `df_vacancy_iter`, so further chunks can be pulled with next().
df_vacancy_iter = pd.read_csv('../data/raw/vacancy.csv', sep='|', chunksize=10000, iterator=True)
df = next(df_vacancy_iter)

  df = next(df_vacancy_iter)


In [3]:
def _join_skill_names(raw, key):
    """Flatten a JSON-encoded list of skill records into one string.

    Args:
        raw: Cell value; expected to be a JSON array of objects.
        key: Field of each record that holds the skill name.

    Returns:
        The skill names joined by single spaces. Missing cells (NaN/None),
        blank strings, and records without a usable name contribute nothing.
    """
    if not isinstance(raw, str) or not raw.strip():
        return ''
    records = json.loads(raw) or []
    names = (record.get(key) for record in records if isinstance(record, dict))
    return ' '.join(str(name) for name in names if name)


# Parse the column directly instead of round-tripping the whole Series through
# to_json(); that round trip turned missing cells into None, which json.loads
# rejects with a TypeError.
df['hard_skill_name'] = [_join_skill_names(raw, 'hard_skill_name') for raw in df['hardSkills']]

# Replace missing free-text fields with '' so they can be concatenated later.
df['vacancy_name'] = df['vacancy_name'].fillna('')
df['position_requirements'] = df['position_requirements'].fillna('')
df['position_responsibilities'] = df['position_responsibilities'].fillna('')

In [4]:
def _concat_vacancy_fields(row):
    """Join the non-missing, non-empty vacancy text fields of one row with spaces."""
    parts = [
        row['vacancy_name'],
        row['position_requirements'],
        row['position_responsibilities'],
        row['hard_skill_name'],
    ]
    kept = [part for part in parts if pd.notna(part) and part != '']
    return ' '.join(kept)


# One combined text per vacancy: title, requirements, responsibilities, skills.
df['all_text'] = df.apply(_concat_vacancy_fields, axis=1)

In [5]:
def remove_html_tags(text):
    """Return the textual content of ``text`` with markup removed.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend
    and the extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, 'html.parser').get_text(separator=' ')

In [6]:
# Strip HTML markup from the combined vacancy text (overwrites the column in place).
df['all_text'] = df['all_text'].apply(remove_html_tags)

  soup = BeautifulSoup(text, 'html.parser')


In [7]:
# Load the spaCy pipeline named "ru_core_news_sm"; it is used below to
# tokenize and lemmatize both vacancies and CVs.
nlp = spacy.load("ru_core_news_sm")

In [8]:
%%time

# Lemmatize all vacancy texts in one batched pass. `nlp.pipe` streams the
# documents through the pipeline in batches, which avoids the per-call overhead
# of invoking `nlp(text)` once per row. NER is skipped because only lemmas and
# the punctuation/whitespace flags are read here.
# NOTE(review): the parser can probably be disabled as well for a further
# speed-up — confirm the lemmas stay unchanged before doing so.
df['cleaned_text'] = [
    ' '.join(
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_space)
    )
    for doc in nlp.pipe(df['all_text'], batch_size=256, disable=['ner'])
]

CPU times: user 3min 20s, sys: 52.4 ms, total: 3min 20s
Wall time: 3min 20s


## CV

In [9]:
# Open the pipe-delimited CV dump as a chunked reader (10,000 rows per chunk)
# and keep the first chunk as the working sample.
# NOTE: on_bad_lines='skip' silently drops rows that cannot be parsed, so the
# sample may contain fewer records than the raw file segment.
df_cv_iter = pd.read_csv('../data/raw/cv.csv', sep='|', chunksize=10000, on_bad_lines = 'skip', iterator=True)
df_cv = next(df_cv_iter)

In [10]:

def _flatten_json_records(raw, fields):
    """Flatten a JSON-encoded list of records into one space-separated string.

    Args:
        raw: Cell value; expected to be a JSON array of objects (or None when
            the source cell was missing).
        fields: Names of the record fields to extract, in output order.

    Returns:
        For every record the selected field values joined by a space, and the
        per-record strings joined by a space. Missing cells and the literal
        '[]' yield ''. Field values that are absent or JSON null are rendered
        as '' (not as the word "None"). Text that is not a JSON array is
        returned unchanged so already-flattened values survive a re-run.
    """
    if not isinstance(raw, str) or raw == '[]':
        return ''
    if not raw.lstrip().startswith('['):
        return raw
    formatted = []
    for record in json.loads(raw) or []:
        values = (record.get(field) for field in fields)
        formatted.append(' '.join('' if value is None else str(value) for value in values))
    return ' '.join(formatted)


# Job title plus duties for every past position of a candidate.
work_experience_list = json.loads(df_cv['workExperienceList'].to_json())
work_experience = [
    _flatten_json_records(raw, ('jobTitle', 'demands'))
    for raw in work_experience_list.values()
]

# Faculty plus qualification for every education entry of a candidate.
edu_list = json.loads(df_cv['educationList'].to_json())
edu = [
    _flatten_json_records(raw, ('faculty', 'qualification'))
    for raw in edu_list.values()
]

In [11]:
def _flatten_skill_list(raw, key):
    """Turn a JSON-encoded list of skill records into a space-separated string.

    Args:
        raw: Cell value; expected to be a JSON array of objects.
        key: Field of each record that holds the skill name.

    Returns:
        The skill names joined by single spaces. Missing cells yield ''.
        Text that is not a JSON array is returned unchanged, so re-running this
        cell on the already-flattened columns is a no-op instead of an error.
    """
    if not isinstance(raw, str):
        return ''
    if not raw.lstrip().startswith('['):
        return raw
    records = json.loads(raw) or []
    names = (record.get(key) for record in records if isinstance(record, dict))
    return ' '.join(str(name) for name in names if name)


# These assignments overwrite the raw JSON columns with their flattened text.
df_cv['hardSkills'] = [_flatten_skill_list(raw, 'hardSkillName') for raw in df_cv['hardSkills']]
df_cv['softSkills'] = [_flatten_skill_list(raw, 'softSkillName') for raw in df_cv['softSkills']]
df_cv['workExperienceList'] = work_experience
df_cv['positionName'] = df_cv['positionName'].fillna('')
df_cv['educationList'] = edu

In [12]:
def _concat_cv_fields(row):
    """Join the non-missing, non-empty CV text fields of one row with spaces."""
    parts = [
        row['hardSkills'],
        row['softSkills'],
        row['workExperienceList'],
        row['positionName'],
        row['educationList'],
    ]
    kept = [part for part in parts if pd.notna(part) and part != '']
    return ' '.join(kept)


# One combined text per CV: skills, work history, desired position, education.
df_cv['all_text'] = df_cv.apply(_concat_cv_fields, axis=1)

In [13]:
# Strip HTML markup from the combined CV text (overwrites the column in place).
df_cv['all_text'] = df_cv['all_text'].apply(remove_html_tags)

  soup = BeautifulSoup(text, 'html.parser')


In [14]:
%%time

# Lemmatize all CV texts in one batched pass. `nlp.pipe` streams the documents
# through the pipeline in batches, which avoids the per-call overhead of
# invoking `nlp(text)` once per row. NER is skipped because only lemmas and the
# punctuation/whitespace flags are read here.
# NOTE(review): the parser can probably be disabled as well for a further
# speed-up — confirm the lemmas stay unchanged before doing so.
df_cv['cleaned_text'] = [
    ' '.join(
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_space)
    )
    for doc in nlp.pipe(df_cv['all_text'], batch_size=256, disable=['ner'])
]

CPU times: user 1min 58s, sys: 7.86 ms, total: 1min 58s
Wall time: 1min 58s


## TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Lemmatized documents for both sides of the matching task.
cv_corpus = df_cv['cleaned_text'].tolist()
vacancy_corpus = df['cleaned_text'].tolist()

# Fit one vectorizer on CVs and vacancies together so that both sets of
# vectors share the same vocabulary and IDF weights.
all_text = [*cv_corpus, *vacancy_corpus]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Rows of the matrix follow the input order: CVs first, then vacancies.
n_cv = len(cv_corpus)
cv_tfidf, vacancy_tfidf = tfidf_matrix[:n_cv], tfidf_matrix[n_cv:]

In [33]:
# Optional export of the sparse TF-IDF matrices (currently disabled).
# NOTE(review): the notebook only runs `import scipy`; depending on the SciPy
# version `scipy.sparse` may need an explicit `import scipy.sparse` — confirm
# before re-enabling.
# scipy.sparse.save_npz("../data/data_vectors/cv_tfidf.npz", cv_tfidf)
# scipy.sparse.save_npz("../data/data_vectors/vacancy_tfidf.npz", vacancy_tfidf)

In [400]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every CV vector (rows) and every vacancy
# vector (columns); entry [r, c] scores CV r against vacancy c.
similarity_scores = cosine_similarity(cv_tfidf, vacancy_tfidf)

# Show a truncated view of the score matrix as a sanity check.
print(similarity_scores)

[[0.         0.         0.00290805 ... 0.02118609 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.03870266 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.05128472 0.01381252 0.01201967 ... 0.1222886  0.         0.0293709 ]
 [0.04902231 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [2]:
import pandas as pd

In [3]:
from pathlib import Path

# Reload previously exported TF-IDF similarity scores when the CSV exists.
# Otherwise fall back to the in-memory matrix, so the notebook still runs top
# to bottom when the export has not been produced yet.
# NOTE: after a CSV round trip the column labels are strings ('0', '1', ...).
_similarity_csv = Path('td_idf_similarity_scores.csv')
if _similarity_csv.exists():
    similarity_df = pd.read_csv(_similarity_csv)
else:
    similarity_df = pd.DataFrame(similarity_scores)

In [452]:
# Wrap the TF-IDF score matrix in a DataFrame (rows: CVs, columns: vacancies)
# and export it as CSV without the row index.
similarity_df = pd.DataFrame(similarity_scores)
similarity_df.to_csv('td_idf_similarity_scores.csv', index=False)

In [419]:
# Column positions of the k highest-scoring vacancies for the first CV.
# argsort is ascending, so the best match is the last entry.
# `k` is set here because no earlier cell defines it; later cells assign their
# own value before using it.
k = 5
np.argsort(similarity_df.iloc[0,:])[-k:]

9995    6552
9996      60
9997    3026
9998     186
9999    3439
Name: 0, dtype: int64

In [440]:
# List the columns available in the CV frame.
df_cv.columns

Index(['id', 'candidateId', 'stateRegionCode', 'locality', 'localityName',
       'birthday', 'gender', 'age', 'positionName', 'dateCreate', 'dateModify',
       'publishedDate', 'academicDegree', 'worldskills',
       'worldskillsInspectionStatus', 'abilympicsInspectionStatus',
       'abilympicsParticipation', 'volunteersInspectionStatus',
       'volunteersParticipation', 'driveLicenses', 'experience',
       'professionsList', 'otherCertificates', 'narkCertificate',
       'narkInspectionStatus', 'codeExternalSystem', 'country',
       'educationList', 'additionalEducationList', 'hardSkills', 'softSkills',
       'workExperienceList', 'scheduleType', 'salary', 'busyType',
       'retrainingCapability', 'businessTrip', 'languageKnowledge',
       'relocation', 'innerInfo', 'all_text', 'cleaned_text'],
      dtype='object')

In [447]:
# Pick one CV (by position in the corpus) to spot-check the recommendations;
# `i` is reused by later top-k cells.
i = 1000
cv_corpus[i]

'стрессоустойчивость руководитель направление группа по административный лицензионный деятельность и информационный поддержка технический поддержка microsoft windows server sql server windows pro microsoft office 1c предприятие kaspersky security center криптопро csp фсс арм лпу гпб дилинг банк клиенты гарант проксима kerio control pfsense vpn active directory exchange vmware esxi hyper v zabbix asterisk urbackup raid онлайн спринтер такском офд такском касса эдо контур диадок контур маркировка мдлп емиас егисз фрмо hikvision ivms-4200 medins amb doc этп обслуживание парк арм оргтехники vpn канал на 3 офис закупка комплектующих расходный материал хранение тмц инвентаризация замена фискальный накопитель обновление прайс лист кассовый аппарат ярус м2100ф mspos d ф sunmi p2pro установка лимит контроль расход выдача sim карта корпоративный мобильный связь мтс обновление контент корпоративный сайт на cms 1с битрикс участие в тендер загрузка документ на этп подача заявление через госуслуга п

In [448]:
k = 3
# argsort is ascending, so the last k positions are the k highest TF-IDF
# similarity scores for CV `i` (best match last).
topk = np.argsort(similarity_scores[i])[-k:]
# `topk` holds row positions, not index labels, so select positionally.
df['cleaned_text'].iloc[topk]

8345                                                                                                                                                                                                                                                                     администратор вычислительный сеть опыт администрирование линейка windows ad ts gpo rdp dhcp dns ms sql 1c опыт настройка локальный сеть сетевой оборудование знание основа прокладка лвс удалить доступ администрирование windows администрирование инфраструктура windows ad ts gpo rdp dhcp dns ms sql 1c администрирование система виртуализации ms hyper v обеспечение работа активный сетевой оборудование корпоративный антивирусный система создание архивный копия мониторинг инфраструктура компания организация доступ удалённый пользователь vpn
7784    системный администратор администрирование сервер на база microsoft windows linux администрирование и настройка dns dhcp active directory dfs.настройка ip телефония на база mango office на

## rubert-tiny2

In [16]:
from sentence_transformers import SentenceTransformer
# Load the 'cointegrated/rubert-tiny2' checkpoint. The name `model` is rebound
# to other checkpoints in later sections.
model = SentenceTransformer('cointegrated/rubert-tiny2')

# Embed the lemmatized CV and vacancy texts (the same corpora used for TF-IDF).
rubert_cv = model.encode(cv_corpus)
rubert_vacancy = model.encode(vacancy_corpus)

In [24]:
# Persist both embedding arrays, mirroring the export cells of the other
# models (the CV export was previously commented out, leaving only half of the
# pair on disk after a fresh run).
np.save("../data/data_vectors/cv_rubert", rubert_cv)
np.save("../data/data_vectors/vacancy_rubert", rubert_vacancy)

In [455]:
# Pairwise cosine similarity between rubert CV embeddings (rows) and vacancy
# embeddings (columns).
similarity_scores_bert = cosine_similarity(rubert_cv, rubert_vacancy)

In [456]:
# Export the rubert score matrix as CSV without the row index.
similarity_df_bert = pd.DataFrame(similarity_scores_bert)
similarity_df_bert.to_csv('bert_similarity_scores.csv', index=False)

In [457]:
k = 3
# argsort is ascending, so the last k positions are the k highest rubert
# similarity scores for CV `i` (best match last).
topk = np.argsort(similarity_scores_bert[i])[-k:]
# `topk` holds row positions, not index labels, so select positionally.
df['cleaned_text'].iloc[topk]

5939                                                                                                                                                                                                                                                                                                                                                                                                                                       тестировщик наш стек backend php 7.4 8.1 laravel framework mysql postgresql redis rabbitmq elasticsearch rest api graphql frontend typescript react react native next.js express redux storybook tools git docker jenkins jira confluence swagger discord allure miro figma monitoring kibana zabbix grafana чего мы ждем умение работать с postman charles или с иными аналогичный инструмент навык работа с инструмент разработчик в браузер командный строка знание и понимание основный принцип работа мобильный приложение и сайт уверенный значение техник тест дизайн и умение применя

## distiluse-base-multilingual-cased-v1

In [25]:
from sentence_transformers import SentenceTransformer
# Rebind `model` to the 'distiluse-base-multilingual-cased-v1' checkpoint.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Embed the lemmatized CV and vacancy texts with this checkpoint.
distiluse_cv = model.encode(cv_corpus)
distiluse_vacancy = model.encode(vacancy_corpus)

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.45k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [26]:
# Persist the distiluse embeddings for CVs and vacancies.
np.save("../data/data_vectors/cv_distiluse", distiluse_cv)
np.save("../data/data_vectors/vacancy_distiluse", distiluse_vacancy)

In [112]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between distiluse CV embeddings (rows) and
# vacancy embeddings (columns).
similarity_scores_distiluse = cosine_similarity(distiluse_cv, distiluse_vacancy)

In [115]:
k = 3
i = 1000
# argsort is ascending, so the last k positions are the k highest distiluse
# similarity scores for CV `i` (best match last).
topk = np.argsort(similarity_scores_distiluse[i])[-k:]
# `topk` holds row positions, not index labels, so select positionally.
df['cleaned_text'].iloc[topk]

2580                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [116]:
# Export the distiluse score matrix as CSV without the row index.
similarity_df_distiluse = pd.DataFrame(similarity_scores_distiluse)
similarity_df_distiluse.to_csv('distiluse_similarity_scores.csv', index=False)

## paraphrase-multilingual-MiniLM-L12-v2

In [27]:
from sentence_transformers import SentenceTransformer
# Rebind `model` to the 'paraphrase-multilingual-MiniLM-L12-v2' checkpoint.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Embed the lemmatized CV and vacancy texts with this checkpoint.
miniLM_cv = model.encode(cv_corpus)
miniLM_vacancy = model.encode(vacancy_corpus)

.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [28]:
# Persist the MiniLM embeddings for CVs and vacancies.
np.save("../data/data_vectors/cv_miniLM", miniLM_cv)
np.save("../data/data_vectors/vacancy_miniLM", miniLM_vacancy)

In [118]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between MiniLM CV embeddings (rows) and vacancy
# embeddings (columns).
similarity_scores_miniLM = cosine_similarity(miniLM_cv, miniLM_vacancy)

In [119]:
k = 3
i = 1000
# argsort is ascending, so the last k positions are the k highest MiniLM
# similarity scores for CV `i` (best match last).
topk = np.argsort(similarity_scores_miniLM[i])[-k:]
# `topk` holds row positions, not index labels, so select positionally.
df['cleaned_text'].iloc[topk]

7408                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [120]:
# Export the MiniLM score matrix as CSV without the row index.
similarity_df_miniLM = pd.DataFrame(similarity_scores_miniLM)
similarity_df_miniLM.to_csv('miniLM_similarity_scores.csv', index=False)