In [10]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import pickle
import tqdm
import pymorphy2
import logging
import os
from string import punctuation
from nltk import TreebankWordTokenizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
from sqlalchemy import create_engine
from src.config import conn_string

In [11]:
# Shared pymorphy2 analyzer; txt_pipe() calls morph.parse() on every token.
morph = pymorphy2.MorphAnalyzer()
# Configure the root logger at INFO level so the logging.info() progress
# messages emitted by the functions in this notebook are displayed.
logging.basicConfig(level="INFO")

INFO:pymorphy2.opencorpora_dict.wrapper:Loading dictionaries from /Users/lev4/PycharmProjects/app-sberjobs/sberjobs-trainer/venv/lib/python3.8/site-packages/pymorphy2_dicts_ru/data
INFO:pymorphy2.opencorpora_dict.wrapper:format: 2.4, revision: 417127, updated: 2020-10-11T15:05:51.070345


In [12]:
def get_lines(conn_string):
    """
    Connect to the database and download the vacancies.

    Args:
        conn_string: connection URL handed to ``create_engine``.

    Returns:
        tuple: ``(lines, vacids)`` - two lists built from the
        ``vacdescription`` and ``vacid`` columns of the ``vacancy`` table,
        in the same row order.
    """
    logging.info("Подгружаю данные из базы")
    engine = create_engine(conn_string)
    try:
        df = pd.read_sql_table('vacancy', engine)
    finally:
        # Release the engine's pooled connections even if the read fails.
        engine.dispose()

    # Call head(): passing the bare attribute logs the bound-method object
    # instead of the first rows of the frame.
    logging.info(df.head())
    lines = df.vacdescription.tolist()
    vacids = df.vacid.tolist()
    return lines, vacids

In [13]:
def txt_pipe(lines):
    """
    Turn raw texts into a lemmatized corpus with stop words removed.

    Every text is tokenized with NLTK's ``TreebankWordTokenizer``, tokens
    found in ``string.punctuation`` are dropped, each remaining token is
    replaced by the normal form of its first pymorphy2 parse, Russian stop
    words are filtered out and the lemmas are joined with single spaces.

    Args:
        lines: iterable of strings to process.

    Returns:
        list[str]: one space-joined string of lemmas per input text, in the
        same order as ``lines``.
    """
    logging.info("Готовлю корпус")
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every single token.
    ru_stop_words = set(stopwords.words('russian'))
    # The tokenizer does not depend on the text, so build it once instead of
    # once per input line.
    tokenizer = TreebankWordTokenizer()
    lines_tok = [tokenizer.tokenize(x) for x in lines]
    # `punctuation` is a str, so `in` is a substring test: this drops single
    # punctuation characters and any token that is a contiguous run of it.
    lines_tok = [[x for x in el if x not in punctuation] for el in lines_tok]
    # normal_form is the lemma of the first parse returned by pymorphy2
    # (the same field the positional index 2 used to select).
    u_norm = [[morph.parse(x)[0].normal_form for x in el] for el in tqdm(lines_tok)]
    u_norm = [[x for x in el if x not in ru_stop_words] for el in tqdm(u_norm)]
    corpus = [' '.join(x) for x in u_norm]
    return corpus

In [14]:

def l2_norm(x):
    """Return the Euclidean (L2) norm of ``x``: sqrt of the sum of squares."""
    squared = x ** 2
    total = np.sum(squared)
    return np.sqrt(total)


def div_norm(x):
    """
    Scale ``x`` to unit L2 norm.

    Returns ``x`` unchanged when its norm is not strictly positive, so a
    zero vector is passed through instead of being divided by zero.
    """
    magnitude = l2_norm(x)
    if not magnitude > 0:
        return x
    return x * (1.0 / magnitude)

In [15]:
def get_vacancy_vectors(vacids, corpus, fasttext_pth=None):
    """
    Build one fastText-based vector per vacancy.

    Each vacancy vector is the sum of the L2-normalised fastText vectors of
    the whitespace-separated tokens of its text, plus the vector of a
    trailing '\\n' token.

    Args:
        vacids: vacancy identifiers, used as keys of the result.
        corpus: preprocessed vacancy texts, aligned with ``vacids``.
        fasttext_pth: optional path to a fastText ``.bin`` model in Facebook's
            native format. Defaults to ``../wvmodel/cc.ru.300.bin``.

    Returns:
        dict: ``{vacid: numpy float32 vector}`` with one entry per
        ``(vacid, text)`` pair.
    """
    # Imported locally so gensim is only required when vectors are built.
    from gensim.models.fasttext import load_facebook_vectors

    if fasttext_pth is None:
        fasttext_pth = os.path.join('..','wvmodel','cc.ru.300.bin')

    logging.info("Подгружаем обученную модель FastText")
    # load_facebook_vectors replaces the deprecated
    # FastText.load_fasttext_format(...).wv and matches the loader used
    # elsewhere in this notebook.
    fast_text = load_facebook_vectors(fasttext_pth)

    logging.info("Собираем векторы предложений")
    vacancy_vectors = {}
    # Pair every id with its text; iterating over the tuple (vacids, corpus)
    # would yield the two lists themselves, not (id, text) pairs.
    for vacid, doc in tqdm(list(zip(vacids, corpus))):
        words = doc.split()
        # NOTE(review): presumably mimics fastText's end-of-sentence token
        # when averaging a sentence - TODO confirm.
        words.append('\n')
        # Take the dimension from the model instead of hard-coding 300.
        vector = np.zeros((fast_text.vector_size,), dtype='float32')

        for word in words:
            vector += div_norm(fast_text.word_vec(word))

        vacancy_vectors[vacid] = vector

    # The caller assigns the result, so the mapping must be returned.
    return vacancy_vectors

In [7]:

def get_similarities(target_user, candidates):
    """
    Compute the cosine similarity between a target vector and every candidate.

    NOTE(review): the previous body ignored both parameters, read globals
    (one of them, ``user_vectors``, is never defined in this notebook) and
    wrote every result to the same key. The contract below is inferred from
    the notebook cell that fills ``tu_sims`` inline - confirm with the author.

    Args:
        target_user: 1-D vector (array-like) of the target profile / query.
        candidates: mapping ``{candidate_id: 1-D vector}`` with vectors of
            the same dimension as ``target_user``.

    Returns:
        dict: ``{candidate_id: cosine similarity to target_user}``; empty
        when ``candidates`` is empty.
    """
    candidate_ids = list(candidates)
    if not candidate_ids:
        return {}

    # cosine_similarity expects 2-D inputs of shape (n_samples, n_features).
    target = np.asarray(target_user).reshape(1, -1)
    stacked = np.vstack(
        [np.asarray(candidates[cid]).reshape(1, -1) for cid in candidate_ids]
    )

    # One call for all candidates; column 0 holds the similarity of each
    # candidate row to the single target row.
    scores = cosine_similarity(stacked, target)[:, 0]
    return dict(zip(candidate_ids, scores))

In [8]:
# Download vacancy descriptions and ids from the database; conn_string is
# imported from src.config in the imports cell, which must be run first.
lines, vacids = get_lines(conn_string)

NameError: name 'conn_string' is not defined

In [9]:
# Tokenize, lemmatize and stop-word-filter the descriptions: one string per vacancy.
corpus = txt_pipe(lines)

NameError: name 'lines' is not defined

In [13]:
# with open(os.path.join('..','data','corpus.pkl'), 'wb') as f:
#     pickle.dump(corpus, f)

# with open(os.path.join('..','data','vacids.pkl'), 'wb') as f:
#     pickle.dump(vacids, f)

In [14]:
# Restore the cached vacancy ids and the preprocessed corpus from ../data.
# NOTE(review): pickle.load can execute arbitrary code - only load cache
# files that were produced by this project.
data_dir = os.path.join('..', 'data')

with open(os.path.join(data_dir, 'vacids.pkl'), 'rb') as vacids_file:
    vacids = pickle.load(vacids_file)

with open(os.path.join(data_dir, 'corpus.pkl'), 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

FileNotFoundError: [Errno 2] No such file or directory: '../data/vacids.pkl'

In [None]:
# Build the vacancy vectors through the helper function. The cells shown
# below do not read `vvect`; they rebuild `vacancy_vectors` inline instead.
vvect = get_vacancy_vectors(vacids, corpus)

In [None]:
# logging.info("Подгружаем обученную модель FastText")

In [17]:
from gensim.models import fasttext
# Pretrained fastText binary, located relative to the notebook's working directory.
fasttext_pth = os.path.join('..','wvmodel','ft_native_300_ru_wiki_lenta_lemmatize.bin')
# Load the model's vectors; later cells query them with fast_text.word_vec(word).
fast_text = fasttext.load_facebook_vectors(fasttext_pth)

# with open(os.path.join('..','data','fast_text.pkl'), 'rb') as f:
#     fast_text = pickle.load(f)

INFO:gensim.models._fasttext_bin:loading 977837 words for fastText model from ../wvmodel/ft_native_300_ru_wiki_lenta_lemmatize.bin
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:Updating model with new vocabulary
INFO:gensim.models.word2vec:New added 977837 unique words (50% of original 1955674) and increased the count of 977837 pre-existing words (50% of original 1955674)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 977837 items
INFO:gensim.models.word2vec:sample=0.0001 downsamples 788 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 830512760 word corpus (120.7% of prior 688220651)
INFO:gensim.models.fasttext:loaded (2977837, 300) weight matrix for fastText model from ../wvmodel/ft_native_300_ru_wiki_lenta_lemmatize.bin


In [None]:
# with open(os.path.join('..','data','fast_text.pkl'), 'wb') as f:
#     pickle.dump(fast_text, f)

In [None]:
# One 300-dimensional float32 vector per vacancy id: the sum of the
# L2-normalised fastText vectors of its tokens plus a trailing '\n' token.
vacancy_vectors = {}
for vacid, doc in tqdm(list(zip(vacids, corpus))):
    tokens = doc.split() + ['\n']
    doc_vector = np.zeros((300,), dtype='float32')
    for token in tokens:
        doc_vector += div_norm(fast_text.word_vec(token))
    vacancy_vectors[vacid] = doc_vector

In [None]:
# Query text to rank vacancies against. The first assignment is an
# alternative query; it is immediately overwritten by the second one.
text = "python git data science machine learning"
text = "сми репутация сторителлинг фактчекинг пресс-релиз коммуникация pr журналист москва"
# Same embedding recipe as for the vacancies: tokens plus a trailing '\n',
# each vector L2-normalised and summed into a 300-dimensional float32 vector.
text = text.split() + ['\n']
matrix = np.zeros((300,), dtype='float32')
for token in text:
    matrix += div_norm(fast_text.word_vec(token))

In [None]:
# Cosine similarity between the query vector and every vacancy vector.
# cosine_similarity needs 2-D inputs, hence the (1, n_features) reshapes.
query = matrix.reshape(1, -1)
tu_sims = {
    vacid: cosine_similarity(vacancy_vectors[vacid].reshape(1, -1), query)[0][0]
    for vacid in tqdm(vacids)
}

In [None]:
# Vacancy ids ordered from the most to the least similar to the query.
tu_sorted = sorted(tu_sims, key=tu_sims.get, reverse=True)
# Descriptions indexed by vacancy id so the ranked ids can be looked up with .loc.
df = pd.DataFrame({'description': lines, 'vacid': vacids}).set_index('vacid')

In [None]:
# Show the descriptions of the 10 vacancies most similar to the query.
# (An empty subscript `tu_sorted[]` is a SyntaxError; slice the top of the ranking.)
df.loc[tu_sorted[:10]].description