In [1]:
import pickle
import pandas as pd
from pprint import pprint
import numpy as np
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
import random
import sklearn
import random
import time
import string as _string

import pymorphy2 as pm
from sentence_transformers import SentenceTransformer
from parse_hh_data import download, parse

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
#model = SentenceTransformer('distiluse-base-multilingual-cased-v2', device='cpu')
#model.save('/hack/distiluse-base-multilingual-cased-v2_cpu')
model = SentenceTransformer('gdrive/MyDrive/coding/classification_telegram/distiluse-base-multilingual-cased-v2_cpu', device='cpu')

In [3]:
morph = pm.MorphAnalyzer()

TO_REPLACE = ['<p>', '<em>', '<br />', '<ul>', '</p>', '</li>', '<li>', '</strong>', '<strong>', '</ul>', '</em>', '</ol>', '<ol>']
LETTERS_ENG = 'abcdefghijklmnopqrstuvwzyx +#&/'
LETTERS_ENG += LETTERS_ENG.upper()
LETTERS_RUS = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
LETTERS_RUS += LETTERS_RUS.upper()

POS_EXCLUDED = ['PREP', 'NPRO', 'CONJ', 'PRCL']
LENGTH = 3 * 23

SKILLS = set([i.lower() for i in pd.read_csv('key_skills.csv')['skill_name']])
COURSES = list(pd.read_csv('courses.csv')['course_name'])

N_CLUSTERS_FOR_SIMILARITY = 3
N_IDS_PER_CLUSTER_FOR_SIMILARITY = 2

In [30]:
def parse_key_skills(lst):
    return ' '.join([i['name'] for i in lst])


def parse_specializations(lst):
    return ' '.join([f"{i['name']} {i['profarea_name']}" for i in lst])


def clean_string(string):
    for i in TO_REPLACE:
        string = string.replace(i, ' ')
        
    string = remove_extra_spaces(string)
        
    return string
    
    
def parse_description(string):
    #string = string.replace('junior', ' ').replace('middle', ' ')
    lst = string.split()
    lst = [i for i in lst if i != 'и']
    string = ' '.join(lst)
    string = clean_string(string)
    return string


def remove_extra_spaces(string):
    lst = string.split()
    string = ' '.join(lst)
    return string


def get_eng(string):
    string = ''.join([i for i in string if i in LETTERS_ENG])
    lst = string.split()
    lst = [i for i in lst if len(i) >= 1 or i.lower() == 'c']
    string = ' '.join(lst)
    return string


def get_rus(string):
    string = ''.join([i for i in string if i in LETTERS_RUS])
    return string


def get_part_of_speech(string):
    return morph.parse(string)[0].tag.POS


def get_main(string):
    return ' '.join(string.split()[:LENGTH])


def remove_POS(string):
    lst = string.split()
    lst = [i for i in lst if get_part_of_speech(i) not in POS_EXCLUDED]
    string = ' '.join(lst)
    return string


def get_vacancy_eng_share(string):
    if len(string) == 0:
        return 1
        
    eng = get_eng(string)
    return len(eng) / len(string)


def get_prepared(string):
    return ''.join([i for i in string if i in LETTERS_ENG])



def get_all_skills(key_skills, eng):
    words_key_skills = key_skills.lower().split()
    words_eng = eng.lower().split()
    all_skills = set(words_key_skills) | set(words_eng)
    
    return ' '.join(all_skills)


def get_key_skills_from_eng(string):
    string = string.lower()
    lst = string.split()
    lst = [i for i in lst if i in SKILLS]
    return ' '.join(lst)



def get_vacancy_target_info(dct):
    key_skills = parse_key_skills(dct['key_skills'])
    key_skills = key_skills.replace('С++', 'C++')
    specializations = parse_specializations(dct['specializations'])
    description = parse_description(dct['description'])
    description = description.replace('С++', 'C++')
    
    name = dct['name']
    
    vacancy_data = dict()
    vacancy_data['id'] = dct['id']
    vacancy_data['string'] = f'{name} {key_skills} {description}'
    vacancy_data['experience'] = dct['experience']['name']
    #vacancy_data['salary_our'] = '>160'
    vacancy_data['salary'] = dct['salary']
    vacancy_data['created_at'] = dct['created_at']
    vacancy_data['key_skills'] = key_skills
    vacancy_data['specializations'] = specializations
    vacancy_data['description'] = description
    vacancy_data['name'] = name
    
    vacancy_data['string'] = clean_string(vacancy_data['string'])
    vacancy_data['eng'] = get_eng(vacancy_data['string'])
    vacancy_data['eng'] = get_key_skills_from_eng(vacancy_data['eng'])  # ! replacing eng by skills
    vacancy_data['all_skills'] = get_all_skills(vacancy_data['key_skills'], vacancy_data['eng'])
    
    vacancy_data['rus'] = get_rus(vacancy_data['string'])
    vacancy_data['rus'] = remove_POS(vacancy_data['rus'])
    vacancy_data['rus_short'] = get_main(vacancy_data['rus'])

    return vacancy_data


def get_vector(sentence):
    return model.encode([sentence])[0]


def get_clusters(df_column, n_clusters):
    clustering_model = KMeans(n_clusters=n_clusters,
                              n_init=500,
                              max_iter=10000)

    clustering_model.fit(list(df_column))
    clusters = clustering_model.labels_
    return clusters


def get_chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
        
def get_centroid(arrs):
    arrs = np.array(arrs)
    centroid = np.mean(arrs, axis=0)
    return centroid


def get_salary(dct):
    if not dct:
        return 0
    
    from_ = dct['from']
    to = dct['to']
        
    lst = [from_, to]
    lst = [i for i in lst if i]
    
    if len(lst) == 0:
        return ''
    
    salary = sum(lst) / len(lst)
    currency = dct['currency']
    
    if currency == 'USD':
        salary = salary * 72
        
    if currency == 'EUR':
        salary = salary * 87
        
    return salary


def get_train_test(length):
    lst = ['train'] * int(round(length * 0.8, 0))
    lst += ['test'] * (length - len(lst))
    random.shuffle(lst)
    return lst


def xgb_predict_salaries(df):
    x_train = df[df['train_test'] == 'train']['all_skills_vector']
    y_train = df[df['train_test'] == 'train']['salary_parsed']
    x_test = df[df['train_test'] == 'test']['all_skills_vector']
    y_test = df[df['train_test'] == 'test']['salary_parsed']
    
    mapping_y_train = {tuple(i): j for i, j in zip(df[df['train_test'] == 'train']['all_skills_vector'],
                                                   df[df['train_test'] == 'train']['salary_parsed'])}

    x_train = np.array([np.array(i) for i in x_train])
    x_test = np.array([np.array(i) for i in x_test])
    
    xgbr = XGBRegressor() # dont wanna increase n_estimators, max_depth and learning_rate for mvp
    xgbr.fit(x_train, y_train)

    y_pred = xgbr.predict(x_test)
    mapping_y_pred = {tuple(i): j for i, j in zip(x_test, y_pred)}
    
    
    def get_salary_predicted(vector):
        if vector in mapping_y_train:
            return mapping_y_train[vector]
        
        if vector in mapping_y_pred:
            return mapping_y_pred[vector]
    
    df['salary_predicted'] = [get_salary_predicted(tuple(i)) for i in df['all_skills_vector']]
    return xgbr, df


def xgb_predict_salary(xgbr, vector):
    return int(round(xgbr.predict(np.array([vector]))[0], 0))


def get_similarity(vec1, vec2):
    similarity = sklearn.metrics.pairwise.cosine_similarity([vec1], [vec2])[0][0]
    return similarity


def find_most_similar_id_by_vector(target_vector, series, ids):
    similarity_data = {get_similarity(target_vector, vector): vector for vector in series}
    most_similar_coef = max(similarity_data)
    most_similar_vector = similarity_data[most_similar_coef]
    for id_, vector in zip(ids, series):
        if list(vector) == list(most_similar_vector):
            return id_
        
        
def sort_dict(d, **kwargs):
    by = 'value'
    reverse = True
    
    if 'by' in kwargs:
        by=kwargs['by']
    
    if 'reverse' in kwargs:
        reverse=kwargs['reverse']
    
    if by == 'value':
        return dict(sorted(d.items(), key=lambda x: x[1], reverse=reverse))
    if by == 'key':
        return dict(sorted(d.items(), key=lambda x: x[0], reverse=reverse))
        
        
def get_similarity_ids_by_vector(target_vector, series, ids):  # from high similarity to low
    similarity_data = {get_similarity(target_vector, vector): vector for vector in series}
    similarity_data = sort_dict(similarity_data, by='key', reverse=True)
    
    similarity_ids = list()
    for similarity_vector in similarity_data.values():
        for id_, vector in zip(ids, series):
            if list(vector) == list(similarity_vector):
                similarity_ids.append(id_)
                break
                
    return similarity_ids


def get_similarity_clusters_by_vector(target_vector, df):  # from high similarity to low
    df_ = df.copy()
    df_ = df_.drop_duplicates('cluster_all_skills')
    series = df_['centroid_all_skills']
    clusters = df_['cluster_all_skills']
    
    similarity_data = {get_similarity(target_vector, vector): vector for vector in series}
    similarity_data = sort_dict(similarity_data, by='key', reverse=True)
    
    similarity_ids = list()
    for similarity_vector in similarity_data.values():
        for cluster, vector in zip(clusters, series):
            if list(vector) == list(similarity_vector):
                similarity_ids.append(cluster)
                break
                
    return similarity_ids


def find_all_similar(target_vector, df):
    similarity_clusters = get_similarity_clusters_by_vector(target_vector, df)
    similarity_clusters = similarity_clusters[:N_CLUSTERS_FOR_SIMILARITY]
    similar_ids = dict()
    
    
    for cluster in similarity_clusters:
        df_ = df[df['cluster_all_skills'] == cluster]
                 
        similarity_ids = get_similarity_ids_by_vector(target_vector, df_['all_skills_vector'], df_['id'])
        similarity_ids = similarity_ids[:N_IDS_PER_CLUSTER_FOR_SIMILARITY]
                 
        similar_ids[cluster] = similarity_ids
                 
    return similar_ids


def get_description_by_id(target_id, df):
    for id_, description in zip(df['id'], df['description']):
        if id_ == target_id:
            return clean_string(description)
        
        
def get_eng_by_id(target_id, df):
    for id_, eng in zip(df['id'], df['eng']):
        if id_ == target_id:
            return clean_string(eng)
        
        
def get_key_skills_by_id(target_id, df):
    for id_, key_skills in zip(df['id'], df['key_skills']):
        if id_ == target_id:
            return clean_string(key_skills)
        
        
def get_all_skills_by_id(target_id, df):
    for id_, all_skills in zip(df['id'], df['all_skills']):
        if id_ == target_id:
            return all_skills


def get_id_data(id_, df, cv):
    id_data = dict()
    description = get_description_by_id(id_, df)
    eng = get_eng_by_id(id_, df)
    key_skills = get_key_skills_by_id(id_, df)
    all_skills = get_all_skills_by_id(id_, df)
    
    id_data['description'] = description
    #id_data['eng'] = eng
    id_data['key_skills'] = key_skills
    id_data['all_skills'] = all_skills
    new_skills = get_new_skills(cv, id_, df)
    id_data['course_recommended'] = find_closest_cource(new_skills)
    
    return id_data


def fill_vacancies(data, df, cv):
    for cluster in data['vacancies']:
        for n, id_ in enumerate(data['vacancies'][cluster]):
            id_data = get_id_data(id_, df, cv)
            data['vacancies'][cluster][n] = id_data
            
    return data


def replace_punctuation(string):
    for i in ['.', ',']:
        string = string.replace(i, '')
        
    return string


def get_new_skills(cv, vacancy_id, df):
    words_cv = set(cv.lower().split())
    words_cv = set([replace_punctuation(i) for i in words_cv])
    
    all_skills = set(get_all_skills_by_id(vacancy_id, df))
    new_skills = (all_skills - words_cv) & SKILLS
    
    return ' '.join(new_skills)


def find_closest_cource(new_skills):
    vector_new_skills = get_vector(new_skills)
    similarity_data = {get_similarity(vector_new_skills, vector): course for course, vector in COURSES_VECTORS.items()}
    most_similar_coef = max(similarity_data)
    most_similar_course = similarity_data[most_similar_coef]
    return most_similar_course


def get_bucker(integer):
    data = {
        'name': ['student', 'junior', 'almost_middle', 'middle', 'almost_senior', 'senior'],
        'min_sal': [-1, 50000, 75000, 150000, 220000, 350000],
        'max_sal': [50000, 75000, 150000, 220000, 350000, 99999999],
        'description': ['Вы в самом начале своего карьерного пути. Начните с общих курсов, чтобы определить своё направление развития',
                        'Вы уже что-то умеете и не способны самостоятельно развиваться в рабочей команде.',
                        'Вы уже не новичёк, но ещё не можете автономно лидировать задачи по разработке',
                        'Вы полезный специалист в любой комаде. Продолжайте развиваться, что бы стать незаменимым специалистом в люой команде',
                        'Ваши знания и опыт работы велики, но есть ещё непокорённые вершины в вашем направлении развития',
                        'Поздравляем - вы сеньор-разработчик']
    }
    for name, min_sal, max_sal, desc in zip(data['name'], data['min_sal'], data['max_sal'], data['description']):
        if min_sal < integer <= max_sal:
            return {name: desc}
        
    return {'noname': 'молодец'}

In [8]:
# EARLIER PARSED VACANSIES FROM HH. 50 for example

In [9]:
with open('vacancies.pickle', 'rb') as f:
    vacancies = pickle.load(f)

In [16]:
N_CLUSTERS = 10
MAX_ENG_SHARE = 0.5
vacancies_data = list()
mapping_centroids_eng = dict()

COURSES_VECTORS = {i: get_vector(i) for i in COURSES}

for vacancy in vacancies:
    vacancy_data = get_vacancy_target_info(vacancy)
    vacancies_data.append(vacancy_data)
    
df = pd.DataFrame(vacancies_data)

df['eng_share'] = df['description'].apply(get_vacancy_eng_share)
df = df[df['eng_share'] < MAX_ENG_SHARE]

df['all_skills_vector'] = df['all_skills'].apply(get_vector)
df['cluster_all_skills'] = get_clusters(df['all_skills_vector'], N_CLUSTERS)

for cluster, group in df.groupby('cluster_all_skills'):
    mapping_centroids_eng[cluster] = get_centroid(group['all_skills_vector'])
    
df['centroid_all_skills'] = [mapping_centroids_eng[i] for i in df['cluster_all_skills']]

df['salary_parsed'] = df['salary'].apply(get_salary)

df['train_test'] = get_train_test(len(df))
xgbr, df = xgb_predict_salaries(df)
df.head(10)

Unnamed: 0,id,string,experience,salary,created_at,key_skills,specializations,description,name,eng,all_skills,rus,rus_short,eng_share,all_skills_vector,cluster_all_skills,centroid_all_skills,salary_parsed,train_test,salary_predicted
0,43377259,Senior Python Developer Python Django Framewor...,От 3 до 6 лет,"{'from': 250000, 'to': 300000, 'currency': 'RU...",2021-06-10T11:36:44+0300,Python Django Framework Architecture,"Программирование, Разработка Информационные те...",Мы продуктовая финтех компания занимающаяся ра...,Senior Python Developer,python developer python django framework archi...,python django developer architecture framework,продуктовая финтех компания занимающаяся разра...,продуктовая финтех компания занимающаяся разра...,0.076132,"[0.0033326473, 0.029267026, 0.055926673, -0.00...",9,"[0.035459224, 0.012453435, 0.017729176, -0.016...",275000.0,train,275000.0
1,44777308,Senior Elixir Developer elixir otp Kubernetes ...,От 3 до 6 лет,"{'from': 200000, 'to': 300000, 'currency': 'RU...",2021-06-11T12:18:42+0300,elixir otp Kubernetes elixir developer Ruby Mo...,"Программирование, Разработка Информационные те...",Convead (convead.ru) - успешный MarTech проект...,Senior Elixir Developer,elixir developer elixir otp kubernetes elixir ...,mongodb otp kubernetes developer ruby elixir r...,успешный проект Российском только рынке уже не...,успешный проект Российском только рынке уже не...,0.121488,"[0.055097055, 0.0039363625, 0.021233737, -0.01...",1,"[0.010587063, 0.015211941, 0.018781781, -0.011...",250000.0,train,250000.0
2,43889154,Senior Kotlin / Java Developer Java Spring Fra...,От 3 до 6 лет,"{'from': 270000, 'to': 350000, 'currency': 'RU...",2021-06-11T12:56:03+0300,Java Spring Framework MongoDB Kotlin,"Другое Информационные технологии, интернет, те...","FunCorp — международная компания, которая зани...",Senior Kotlin / Java Developer,kotlin java developer java spring framework mo...,web jenkins kotlin framework redis spring mong...,международная компания которая занимается разр...,международная компания которая занимается разр...,0.132689,"[0.030061487, 0.01872741, -0.0038083724, -0.07...",2,"[0.008026898, -0.0013476545, 0.032130916, -0.0...",310000.0,train,310000.0
3,44089959,Senior iOS Developer Objective-C C++ Software ...,От 3 до 6 лет,"{'from': 250000, 'to': 350000, 'currency': 'RU...",2021-06-09T20:00:41+0300,Objective-C C++ Software Development iOS Git,"Игровое ПО Информационные технологии, интернет...",Разыскиваем талантливых разработчиков мобильны...,Senior iOS Developer,ios developer c++ software development ios git...,software c++ sdk objective-c patterns git swif...,Разыскиваем талантливых разработчиков мобильны...,Разыскиваем талантливых разработчиков мобильны...,0.161972,"[0.0010126481, 0.034491982, 0.044540405, -0.03...",7,"[0.0021770564, 0.029823953, 0.037420888, -0.00...",300000.0,train,300000.0
4,41690777,Senior BackEnd Developer MySQL PHP Laravel Doc...,От 3 до 6 лет,"{'from': 2000, 'to': 4000, 'currency': 'USD', ...",2021-06-09T13:59:35+0300,MySQL PHP Laravel Docker Kubernetes,"Программирование, Разработка Информационные те...",Если вы любите компьютерные технологии решаете...,Senior BackEnd Developer,developer mysql php laravel docker kubernetes ...,mysql web unix sql python symfony aws js kuber...,любите компьютерные технологии решаете задачи ...,любите компьютерные технологии решаете задачи ...,0.070376,"[0.009571149, 0.005527074, 0.032651808, -0.028...",3,"[0.007660319, 0.015559648, 0.030937389, -0.023...",216000.0,train,216000.0
5,40918556,Middle/Senior iOS Developer iOS Swift Objectiv...,От 3 до 6 лет,"{'from': 300000, 'to': 400000, 'currency': 'RU...",2021-06-11T12:59:28+0300,iOS Swift Objective-C,"Программирование, Разработка Информационные те...",Обязанности: Разработка всей кодовой базы с ну...,Middle/Senior iOS Developer,ios developer ios swift swift os ios swift ios...,swift developer ios os sdk objective-c,Обязанности Разработка всей кодовой базы нуля ...,Обязанности Разработка всей кодовой базы нуля ...,0.130031,"[0.012093089, 0.03546675, 0.043384444, 0.00549...",7,"[0.0021770564, 0.029823953, 0.037420888, -0.00...",350000.0,train,350000.0
6,45347763,Senior/Lead IOS разработчик Developer (удаленн...,От 1 года до 3 лет,"{'from': 200000, 'to': 350000, 'currency': 'RU...",2021-06-09T15:58:03+0300,iOS ООП Swift Alamofire ObjectMapper VIPER,"Программирование, Разработка Информационные те...",Broniboy позволяет получить что угодно за пару...,Senior/Lead IOS разработчик Developer (удаленно),ios developer ios swift alamofire objectmapper...,viper alamofire swift objectmapper developer i...,разработчик удаленно ООП позволяет получить уг...,разработчик удаленно ООП позволяет получить уг...,0.088267,"[0.016616255, 0.041644353, 0.042456433, -0.022...",7,"[0.0021770564, 0.029823953, 0.037420888, -0.00...",275000.0,train,275000.0
7,45253239,Senior java developer Java Java SE Multithread...,Более 6 лет,"{'from': 250000, 'to': None, 'currency': 'RUR'...",2021-06-10T11:05:32+0300,Java Java SE Multithread Programming Streaming...,"Банковское ПО Информационные технологии, интер...",Команда разработки банковских антифрод-решений...,Senior java developer,java developer java java se multithread progra...,oracle web c++ multithread se algorithms pytho...,Команда разработки банковских антифродрешений ...,Команда разработки банковских антифродрешений ...,0.111465,"[0.05044336, 0.009022888, -0.023772439, -0.048...",9,"[0.035459224, 0.012453435, 0.017729176, -0.016...",250000.0,test,272482.40625
8,44915450,IOS Senior Developer (Swift) Разработка ПО iOS...,От 3 до 6 лет,"{'from': 200000, 'to': 350000, 'currency': 'RU...",2021-06-09T10:22:42+0300,Разработка ПО iOS Scrum Kanban Teamleading рук...,"Программирование, Разработка Информационные те...",Мы в Municorn cоздаем собственные мобильные b2...,IOS Senior Developer (Swift),ios developer swift ios scrum kanban teamleadi...,руководство atlassian разработчиков kanban jir...,Разработка руководство командой разработчиков ...,Разработка руководство командой разработчиков ...,0.113839,"[-0.05257514, 0.030931829, 0.024867019, -0.051...",5,"[-0.018049467, 0.04143631, 0.022029694, -0.029...",275000.0,train,275000.0
9,45304742,Программист С++ Senior ООО &quot;Айти Прайм&qu...,От 3 до 6 лет,"{'from': 300000, 'to': None, 'currency': 'RUR'...",2021-06-11T14:09:00+0300,,"Программирование, Разработка Информационные те...",ООО &quot;Айти Прайм&quot; является современно...,Программист С++ Senior,c++ c++ stl linux linux,c++ linux stl,Программист ООО Айти Прайм является современно...,Программист ООО Айти Прайм является современно...,0.084843,"[0.04793832, 0.025557855, 0.00011687074, 0.029...",9,"[0.035459224, 0.012453435, 0.017729176, -0.016...",300000.0,train,300000.0


In [29]:
# lets test using some short cv

In [20]:
cv = 'Java kubernetes я знаю лучше всех'
cv_prep = get_prepared(cv)
cv_vector = get_vector(cv_prep)

most_similar_id = find_most_similar_id_by_vector(cv_vector, df['all_skills_vector'], df['id'])
predicted_salary = xgbr.predict(np.array([cv_vector]))

print(f'most_similar_vacancy_id = {most_similar_id}')
print(f'predicted_salary = {predicted_salary[0]}')  # sample 50 vacancies include only seniors, so predicted salary would be very high

most_similar_vacancy_id = 22908012
predicted_salary = 305037.09375


In [28]:
# data below: key 'vacancies' consists of dict of three nearest clusters.
# clusters constists of nearest vacancies

# 'course_recommended' - nearest course for technologies not presented in cv, but present is this vacancy

In [27]:
similarity_data = find_all_similar(cv_vector, df)
data = {'vacancies': similarity_data}
data['salary'] = xgb_predict_salary(xgbr, cv_vector)
data['bucket'] = get_bucker(data['salary'])
data = fill_vacancies(data, df, cv)
pprint(data)

{'bucket': {'almost_senior': 'Ваши знания и опыт работы велики, но есть ещё '
                             'непокорённые вершины в вашем направлении '
                             'развития'},
 'salary': 305037,
 'vacancies': {2: [{'all_skills': 'spring java developer postgresql scala boot '
                                  'core framework hibernate',
                    'course_recommended': ' Java Intro Java SE,',
                    'description': 'Продукт: Аналитическая платформа для '
                                   'управления пользовательскими данными До '
                                   '500М событий в сутки, 25 Tb средний объем '
                                   'пользовательских данных; Граф '
                                   'идентификаторов пользователей насчитывает '
                                   '10-ки миллиардов записей; Стек: Java, '
                                   'Spring Boot, Scala, Spark, Kafka, '
                                   'Hibernate, k8s