In [35]:
import pandas as pd
import numpy as np
import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [36]:
df = pd.read_csv('resume_data.csv')
df.drop_duplicates(inplace=True)

In [38]:
stop_words = set(stopwords.words('russian'))  # Используйте соответствующий язык
stemmer = SnowballStemmer('russian')  # Используйте соответствующий языкэ
def preprocess_text(text):
    # Приведем к нижнему регистру
    text = text.lower()
    # Удалим пунктуацию
    text = "".join([char for char in text if char not in string.punctuation])
    # Токенизация
    words = word_tokenize(text)
    # Удалим стоп-слова и применим стемминг
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    
    return words



In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

job_description = """
Системный администратор
Знание и опыт работы с серверными операционными системами Windows Server и Linux.
Умение настраивать сетевое оборудование.
Опыт работы с системами виртуализации и облачными сервисами.
Способность быстро решать проблемы с IT-инфраструктурой.
Навыки работы с базами данных и системами резервного копирования.
"""

text_columns = ['Position', 'Specializations', 'Previous_Positions', 'Languages', 'Education', 'About_Me', 'Skills']

resume_texts = df[text_columns].fillna('').agg(' '.join, axis=1).tolist()
all_texts = [job_description] + resume_texts
# all_texts = [preprocess_text(doc) for doc in all_texts]
# all_texts= [" ".join(doc) for doc in all_texts]

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix_sysadmin_modified = tfidf_vectorizer.fit_transform(all_texts)

cosine_similarities_tfidf = cosine_similarity(tfidf_matrix_sysadmin_modified[0:1],
                                              tfidf_matrix_sysadmin_modified[1:]).flatten()

resume_similarities_tfidf = dict(enumerate(cosine_similarities_tfidf))

sorted_resume_similarities_tfidf = sorted(resume_similarities_tfidf.items(), key=lambda item: item[1], reverse=True)

sorted_resume_similarities_tfidf

[(4370, 0.1822537109446224),
 (4427, 0.1822537109446224),
 (359, 0.1814582571844196),
 (1185, 0.167091878148945),
 (1613, 0.167091878148945),
 (1339, 0.1517321718494116),
 (1066, 0.14168032429384336),
 (2203, 0.14168032429384336),
 (0, 0.13863365099652675),
 (878, 0.13713678945565017),
 (1278, 0.13402344034895164),
 (1020, 0.12925062976427612),
 (1350, 0.12925062976427612),
 (1470, 0.12925062976427612),
 (391, 0.12370081513315381),
 (4472, 0.12324036697863898),
 (3013, 0.11929722697167877),
 (4046, 0.11929722697167877),
 (4651, 0.11867890721217506),
 (1299, 0.11780961371568188),
 (268, 0.11523513511783762),
 (303, 0.11523513511783762),
 (496, 0.11492231068025222),
 (651, 0.11492231068025222),
 (4678, 0.1113336879520386),
 (971, 0.10999517572794403),
 (490, 0.10848760469709087),
 (646, 0.10848760469709087),
 (2439, 0.10711932894755084),
 (3241, 0.10711932894755084),
 (386, 0.10442479515870423),
 (3592, 0.1028139094938835),
 (2276, 0.10124658173830012),
 (3527, 0.09958830772651157),
 (41

In [68]:
# Выберем только ключевые поля для первичной фильтрации
key_columns = ['Position', 'Specializations', 'Previous_Positions']
key_texts = df[key_columns].fillna('').agg(' '.join, axis=1).tolist()
all_texts = [preprocess_text(doc) for doc in all_texts]
# all_texts= [" ".join(doc) for doc in all_texts]
# Векторизация ключевых полей
tfidf_vectorizer_keys = TfidfVectorizer(ngram_range=(1,2))
tfidf_matrix_keys = tfidf_vectorizer_keys.fit_transform(['системный администратор'] + key_texts)

# Вычисление косинусного сходства для ключевых полей
cosine_similarities_keys = cosine_similarity(tfidf_matrix_keys[0:1], tfidf_matrix_keys[1:]).flatten()

# Отбор резюме с сходством выше порога (например, пороговое значение может быть установлено на уровне 0.1)
threshold = 0.1
preselected_resume_indices = [(index, similarity) for index, similarity in enumerate(cosine_similarities_keys) if similarity > threshold]
sorted(preselected_resume_indices, key=lambda item: item[1], reverse=True)


[(0, 0.8560573714706771),
 (3747, 0.8560573714706771),
 (1020, 0.8367948828738325),
 (1350, 0.8367948828738325),
 (1470, 0.8367948828738325),
 (2276, 0.8000686934463266),
 (155, 0.7951884087151567),
 (4673, 0.7586763767030187),
 (1339, 0.7541173006654682),
 (69, 0.7499979758885262),
 (112, 0.7499979758885262),
 (1118, 0.7161947133635714),
 (1705, 0.7161947133635714),
 (1762, 0.7161947133635714),
 (3486, 0.714687832717636),
 (4625, 0.714687832717636),
 (2354, 0.6987013054523641),
 (4349, 0.6987013054523641),
 (4404, 0.6987013054523641),
 (1066, 0.6673887856943769),
 (2203, 0.6673887856943769),
 (2414, 0.6652230095381932),
 (3423, 0.6652230095381932),
 (2338, 0.6334822626768195),
 (4311, 0.6334822626768195),
 (4370, 0.596226100199497),
 (4427, 0.596226100199497),
 (810, 0.5906572539744138),
 (1185, 0.5700009018970741),
 (1613, 0.5700009018970741),
 (2035, 0.545588668830196),
 (3829, 0.5424102104856495),
 (971, 0.5422135036073212),
 (4651, 0.5399808000083213),
 (2118, 0.5119870216588283),

In [None]:
from gensim.models import FastText
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Предполагается, что у вас есть предобученная модель FastText в файле 'fasttext.model.bin'
# Загружаем предобученную модель FastText
model = KeyedVectors.load_word2vec_format('cc.ru.300.bin', binary=True)

# Функция для получения вектора документа путем усреднения векторов слов
def document_vector(doc):
    # Удаляем слова, которых нет в модели
    words = [word for word in doc.split() if word in model.vocab]
    if len(words) == 0:
       return np.zeros(model.vector_size)
    else:
       # Усредняем векторы слов, чтобы получить вектор документа
       return np.mean(model[words], axis=0)

# Преобразуем описание вакансии и резюме в векторы
job_vec = document_vector(job_description)
resume_vecs = np.array([document_vector(doc) for doc in resume_texts])

# Вычисляем косинусное сходство между вакансией и каждым резюме
cosine_similarities = cosine_similarity([job_vec], resume_vecs).flatten()

# Создаем словарь сопоставлений индекс резюме -> сходство
resume_similarities = dict(enumerate(cosine_similarities))

# Сортируем резюме по убыванию сходства
sorted_resume_similarities = sorted(resume_similarities.items(), key=lambda item: item[1], reverse=True)

sorted_resume_similarities

In [9]:
import random

seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)


In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from tqdm import tqdm

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_texts)]
model = Doc2Vec(vector_size=100, alpha=0.025, min_alpha=0.00025, min_count=1, dm=0, seed=42)
model.build_vocab(documents)

for epoch in tqdm(range(10), desc="Training epochs"):
    print(f'Iteration {epoch}')
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    model.alpha -= 0.002
    model.min_alpha = model.alpha

Training epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration 0


Training epochs:  10%|█         | 1/10 [00:08<01:13,  8.18s/it]

Iteration 1


Training epochs:  20%|██        | 2/10 [00:16<01:04,  8.10s/it]

Iteration 2


Training epochs:  30%|███       | 3/10 [00:24<00:55,  7.99s/it]

Iteration 3


Training epochs:  40%|████      | 4/10 [00:31<00:47,  7.89s/it]

Iteration 4


Training epochs:  50%|█████     | 5/10 [00:39<00:39,  7.85s/it]

Iteration 5


Training epochs:  60%|██████    | 6/10 [00:48<00:32,  8.07s/it]

Iteration 6


Training epochs:  70%|███████   | 7/10 [00:55<00:23,  7.92s/it]

Iteration 7


Training epochs:  80%|████████  | 8/10 [01:03<00:15,  7.86s/it]

Iteration 8


Training epochs:  90%|█████████ | 9/10 [01:11<00:07,  7.79s/it]

Iteration 9


Training epochs: 100%|██████████| 10/10 [01:18<00:00,  7.89s/it]


In [12]:

resume_texts_series = df[text_columns].fillna('').agg(' '.join, axis=1)
resume_vecs_series = resume_texts_series.apply(lambda x: model.infer_vector(word_tokenize(x.lower()), epochs=20))

In [13]:

job_vector = model.infer_vector(word_tokenize(job_description.lower()), epochs=20)
# resume_vecs = [model.infer_vector(word_tokenize(text.lower())) for text in resume_texts]

#cosine_similarities_doc2vec = [cosine_similarity([job_vector], [resume_vec])[0][0] for resume_vec in resume_vecs]
cosine_similarities_doc2vec = resume_vecs_series.apply(lambda x: cosine_similarity([job_vector], [x])[0][0])
sorted_similarities = sorted(enumerate(cosine_similarities_doc2vec, 1), key=lambda x: x[1], reverse=True)

# Вывод результатов
#print(sorted_similarities)


model.docvecs.most_similar([job_vector]) 

  model.docvecs.most_similar([job_vector])


[(3744, 0.5628329515457153),
 (2173, 0.5476540327072144),
 (3688, 0.5454566478729248),
 (3482, 0.5423063039779663),
 (3960, 0.5381333231925964),
 (700, 0.532794713973999),
 (2087, 0.531611442565918),
 (3276, 0.5315738320350647),
 (678, 0.5302934646606445),
 (2105, 0.5277499556541443)]

In [14]:
sorted_similarities

[(1117, 0.9829046726226807),
 (1505, 0.9808845520019531),
 (4771, 0.9804688096046448),
 (980, 0.9802522659301758),
 (48, 0.9794981479644775),
 (648, 0.9794855117797852),
 (3188, 0.9792640209197998),
 (3904, 0.9792094230651855),
 (35, 0.9791171550750732),
 (966, 0.9785678386688232),
 (2203, 0.9785106182098389),
 (2563, 0.9784299731254578),
 (3811, 0.9783999919891357),
 (1293, 0.978130578994751),
 (2105, 0.9781256914138794),
 (557, 0.9781102538108826),
 (119, 0.9780694246292114),
 (34, 0.9780210852622986),
 (1423, 0.9777166843414307),
 (1572, 0.9776190519332886),
 (434, 0.9776048064231873),
 (3128, 0.9775843620300293),
 (3442, 0.9774499535560608),
 (569, 0.9774157404899597),
 (1043, 0.9773767590522766),
 (848, 0.9773274660110474),
 (854, 0.9773245453834534),
 (2140, 0.9773141741752625),
 (1306, 0.9773106575012207),
 (2700, 0.9771040081977844),
 (1936, 0.9770858883857727),
 (1674, 0.9770616888999939),
 (4484, 0.9769256711006165),
 (1366, 0.9768769145011902),
 (3235, 0.9768640398979187),
 

In [18]:
vec1 = model.infer_vector(word_tokenize(resume_texts_series.loc[802].lower()), epochs=20)
vec2 = model.infer_vector(word_tokenize(job_description.lower()), epochs=20)
cosine_similarity([vec1], [vec2])[0][0]


0.89810723

In [81]:
from transformers import AutoTokenizer, AutoModel
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

sentences = ['Привет! Как твои дела?',
             'А правда, что 42 твое любимое число?']
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')
#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

KeyboardInterrupt: 

In [77]:
sentence_embeddings.shape

torch.Size([2, 1024])

In [80]:
encoded_input = tokenizer(["Системный администратор ббебебебе с бабабабаба"], padding=True, truncation=True, max_length=24, return_tensors='pt')
#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)