In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import pickle
import tqdm
import pymorphy2
import logging
import os
from string import punctuation
from nltk import TreebankWordTokenizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
from sqlalchemy import create_engine
from src.config import conn_string

In [2]:
morph = pymorphy2.MorphAnalyzer()
logging.basicConfig(level="INFO")

In [3]:
def get_lines(conn_string):
    """
    Подключается к БД и выкачивает вакансии
    """
    logging.info("Подгружаю данные из базы")
    engine = create_engine(conn_string)

    df = pd.read_sql_table('vacancy', engine)
    logging.info(df.head)
    lines = df.vacdescription.tolist()
    vacids = df.vacid.tolist()
    return lines, vacids

In [4]:
def txt_pipe(lines):
    logging.info("Готовлю корпус")
    ru_stop_words = stopwords.words('russian')
    lines_tok = [TreebankWordTokenizer().tokenize(x) for x in lines]
    lines_tok = [[x for x in el if x not in punctuation] for el in lines_tok]
    u_norm = [[morph.parse(x)[0][2] for x in el] for el in tqdm(lines_tok)]
    u_norm = [[x for x in el if x not in ru_stop_words] for el in tqdm(u_norm)]
    corpus = [' '.join(x) for x in u_norm]
    return corpus

In [5]:

def l2_norm(x):
    return np.sqrt(np.sum(x ** 2))


def div_norm(x):
    norm_value = l2_norm(x)
    if norm_value > 0:
        return x * (1.0 / norm_value)
    else:
        return x

In [6]:
def get_vacancy_vectors(vacids, corpus):
    """
    Получает вектора профилей пользователей из фасттекста
    """

    from gensim.models import FastText
    
    vacancy_vectors = {}
    logging.info("Подгружаем обученную модель FastText")
    fasttext_pth = os.path.join('..','wvmodel','cc.ru.300.bin')
    fast_text = FastText.load_fasttext_format(fasttext_pth).wv
    
    logging.info("Собираем векторы предложений")
    for x in tqdm((vacids, corpus)):

        text = x[1].split()
        text.append('\n')
        matrix = np.zeros((300,), dtype = 'float32')

        for word in text:
            matrix += div_norm(fast_text.word_vec(word))

        vacancy_vectors[x[0]] = matrix

In [7]:

def get_similarities(target_user, candidates):
    """Получает косинусные сходства сотрудника и кандидатов"""

    tu_sims = {}
    for vacid in vacids:
        tu_sims[candidates] = cosine_similarity(
            vacancy_vectors[vacid],
            user_vectors
        )[0][0]

    return tu_sims

In [10]:
lines, vacids = get_lines(conn_string)

INFO:root:Подгружаю данные из базы
INFO:root:<bound method NDFrame.head of           id                 created_at                 updated_at   vacid  \
0     416760 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881   85441   
1     416761 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881   99242   
2     416762 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  101976   
3     416763 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  101988   
4     416764 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  392309   
...      ...                        ...                        ...     ...   
3408  420168 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  696912   
3409  420169 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  701151   
3410  420170 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  706219   
3411  420171 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881  706232   
3412  420172 2020-12-17 20:17:18.625748 2020-12-17 20:17:18.625881 

In [11]:
corpus = txt_pipe(lines)

INFO:root:Готовлю корпус


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3413.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3413.0), HTML(value='')))




In [13]:
# with open(os.path.join('..','data','corpus.pkl'), 'wb') as f:
#     pickle.dump(corpus, f)

# with open(os.path.join('..','data','vacids.pkl'), 'wb') as f:
#     pickle.dump(vacids, f)

In [14]:
with open(os.path.join('..','data','vacids.pkl'), 'rb') as f:
    vacids = pickle.load(f)

with open(os.path.join('..','data','corpus.pkl'), 'rb') as f:
    corpus = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../data/vacids.pkl'

In [12]:
vvect = get_vacancy_vectors(vacids, corpus)

INFO:root:Подгружаем обученную модель FastText
  fast_text = FastText.load_fasttext_format(fasttext_pth).wv
INFO:gensim.models._fasttext_bin:loading 1888423 words for fastText model from ../wvmodel/cc.ru.300.bin
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:Updating model with new vocabulary
INFO:gensim.models.word2vec:New added 1888423 unique words (50% of original 3776846) and increased the count of 1888423 pre-existing words (50% of original 3776846)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 1888423 items
INFO:gensim.models.word2vec:sample=0.0001 downsamples 510 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 968832418 word corpus (120.5% of prior 803768482)
INFO:gensim.models.fasttext:loaded (3888423, 300) weight matrix for fastText model from ../wvmodel/cc.ru.300.bin
INFO:root:Собираем векторы предложений


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [21]:
def get_text_vector(user_text):
    """ Формирует вектор из ключевых слов пользователя """
    from gensim.models import FastText
    

    logging.info("Подгружаем обученную модель FastText")
    fasttext_pth = os.path.join('..','wvmodel','cc.ru.300.bin')
    fast_text = FastText.load_fasttext_format(fasttext_pth).wv
    
    logging.info("Собираем векторы предложений")

    user_text = user_text.split()
    user_text.append('\n')
    matrix = np.zeros((300,), dtype = 'float32')

    for word in user_text:
        matrix += div_norm(fast_text.word_vec(word))

    return matrix

In [22]:
def get_user_text_vectors(user_txt_dict):
    """ Формирует векторы для всех пользователей и записывает в словарь """

    user_text_vectors = {}
    for user_id, keywords in user_txt_dict.items():
        user_text_vectors[user_id] = get_text_vector(keywords)

    return user_text_vectors

In [23]:
text = "python git data science machine learning"
text = {1:text}
get_user_text_vectors(text,)

INFO:root:Подгружаем обученную модель FastText
  fast_text = FastText.load_fasttext_format(fasttext_pth).wv
INFO:gensim.models._fasttext_bin:loading 1888423 words for fastText model from ../wvmodel/cc.ru.300.bin
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:Updating model with new vocabulary
INFO:gensim.models.word2vec:New added 1888423 unique words (50% of original 3776846) and increased the count of 1888423 pre-existing words (50% of original 3776846)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 1888423 items
INFO:gensim.models.word2vec:sample=0.0001 downsamples 510 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 968832418 word corpus (120.5% of prior 803768482)
INFO:gensim.models.fasttext:loaded (3888423, 300) weight matrix for fastText model from ../wvmodel/cc.ru.300.bin
INFO:root:Собираем векторы предложений


{1: array([-4.40742821e-04,  9.24402326e-02,  1.38517320e-01, -7.05969989e-01,
         1.31782681e-01, -1.12261623e-02,  2.37889171e-01, -2.52624154e-01,
         3.19628924e-01,  3.96308228e-02, -1.35322995e-02, -7.23003745e-02,
         4.29560453e-01,  2.28546590e-01,  6.69134259e-02, -2.19147444e-01,
        -8.80413875e-03, -3.29314351e-01,  6.20986879e-01,  2.24975705e-01,
         3.55819836e-02,  2.15868458e-01, -3.26183915e-01, -4.68632907e-01,
        -7.29218796e-02, -1.01908818e-01, -2.23477464e-02,  7.76955411e-02,
        -4.04997647e-01,  2.16299593e-02,  1.23945780e-01, -8.45732167e-02,
         2.79955894e-01,  1.68110784e-02,  4.45793085e-02,  1.52397349e-01,
         1.25535414e-01, -2.32889101e-01,  1.98648259e-01,  1.75265267e-01,
        -1.39682949e-01,  1.19266458e-01, -2.35276148e-02,  1.89557001e-01,
         8.03967044e-02,  6.04265742e-02,  1.12643361e-01,  6.19361550e-02,
        -7.17460215e-02,  2.55582482e-03,  1.69044033e-01,  7.41387308e-02,
         

In [None]:
# with open(os.path.join('..','data','fast_text.pkl'), 'wb') as f:
#     pickle.dump(fast_text, f)

In [None]:
vacancy_vectors = {}
for x in tqdm(list(zip(vacids, corpus))):
    text = x[1].split()
    text.append('\n')
    matrix = np.zeros((300,), dtype = 'float32')
    for word in text:
        matrix += div_norm(fast_text.word_vec(word))
    vacancy_vectors[x[0]] = matrix

In [None]:
text = "python git data science machine learning"
text = "сми репутация сторителлинг фактчекинг пресс-релиз коммуникация pr журналист москва"
text = text.split()
text.append('\n')
matrix = np.zeros((300,), dtype = 'float32')
for word in text:
    matrix += div_norm(fast_text.word_vec(word))

In [None]:
tu_sims = {}
for vacid in tqdm(vacids):
    tu_sims[vacid] = cosine_similarity(vacancy_vectors[vacid].reshape(1,-1),
                                       matrix.reshape(1,-1))[0][0]

In [None]:
tu_sorted = sorted(tu_sims.items(), key=lambda x:x[1], reverse=True)
tu_sorted = [x[0] for x in tu_sorted]
df = pd.DataFrame({'description':lines, 'vacid':vacids})
df = df.set_index('vacid')

In [None]:
df.loc[tu_sorted[]].description