In [169]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from statistics import mean
from tqdm import tqdm
from transformers import BertConfig, BertModel
import nltk

In [170]:
# Load the tokenizer and the encoder from a local directory holding the
# all-MiniLM-L6-v2 files (both loaders read from disk here, not from the
# HuggingFace Hub). The path is kept in a single constant so the tokenizer and
# the model can never be pointed at different checkpoints by accident.
# NOTE(review): this is an absolute, machine-specific path; adjust MODEL_DIR
# when running the notebook elsewhere.
MODEL_DIR = 'D:/PythonProjects/Projects/Resumes_AutoSearch/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = BertModel.from_pretrained(MODEL_DIR)

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: indexable whose element 0 holds the token embeddings.
        attention_mask: mask tensor; it is unsqueezed on the last axis and
            broadcast to the shape of the token embeddings.

    Returns:
        Tensor obtained by summing the masked embeddings over axis 1 and
        dividing by the mask sum over axis 1 (clamped to at least 1e-9 so a
        fully masked sequence does not divide by zero).
    """
    embeddings = model_output[0]
    # Broadcast the mask over the embedding axis so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand_as(embeddings).float()
    weighted_sum = (embeddings * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [172]:
# Read the translated resume table from the prepared-data folder.
resumes_raw = pd.read_excel('../data/prepared_data/All_candidates_DF_translated.xlsx', engine='openpyxl')

# Keep only rows whose 'sentences' cell carries text: drop missing values and
# the '' / ' ' placeholders. A single boolean mask on the frame selects exactly
# the rows the earlier filter + replace + dropna sequence kept, without the
# chained `resumes['sentences'].replace(..., inplace=True)` writes (those act
# on a temporary column object and are not guaranteed to reach the frame).
has_text = resumes_raw['sentences'].notna() & ~resumes_raw['sentences'].isin(['', ' '])

# .copy() gives an independent frame, so later per-row writes into `resumes`
# do not go through a view of `resumes_raw`.
resumes = resumes_raw.loc[has_text].copy()

resumes

Unnamed: 0,id,url,position_name,sentences,sentence_vectors
0,0000b0630008f7dae600000b394b6f6f6c5757,https://hh.ru/resume/0000b0630008f7dae600000b3...,manager,"Working with clients, financial and economic m...",
1,0000b0630008f7dae600000b394b6f6f6c5757,https://hh.ru/resume/0000b0630008f7dae600000b3...,director,"Working with clients, financial and economic m...",
2,0000b0630008f7dae600000b394b6f6f6c5757,https://hh.ru/resume/0000b0630008f7dae600000b3...,manager,working with clients,
3,000257bd00033c09c900000b394b747732534b,https://hh.ru/resume/000257bd00033c09c900000b3...,лаборант,"introduction of documentation, entry of inform...",
4,00035ec20007aaa76f00000b397863726a786f,https://hh.ru/resume/00035ec20007aaa76f00000b3...,Junior python developer,"Backend, Django, REST",
...,...,...,...,...,...
164908,fffe13c50001498d3a00000b39774245797279,https://hh.ru/resume/fffe13c50001498d3a00000b3...,Ведущий Разработчик,* development of an electronic system for fili...,
164909,fffe13c50001498d3a00000b39774245797279,https://hh.ru/resume/fffe13c50001498d3a00000b3...,Ведущий разработчик,* development of native iOS application for Sm...,
164910,fffe13c50001498d3a00000b39774245797279,https://hh.ru/resume/fffe13c50001498d3a00000b3...,Ведущий разработчик PHP/Neo4J,* development of mail service (PHP/Dovecot)\n*...,
164911,fffe13c50001498d3a00000b39774245797279,https://hh.ru/resume/fffe13c50001498d3a00000b3...,Ведущий разработчик,"* development of payment system Express, Kazak...",


In [174]:
def get_embeddings(sentences):
    """Encode `sentences` into L2-normalised, mean-pooled sentence embeddings.

    The input is tokenised with padding and truncation, passed through the
    module-level `model` with gradient tracking disabled, pooled with
    `mean_pooling` using the attention mask, and normalised along dim 1.

    Args:
        sentences: text input forwarded unchanged to the module-level `tokenizer`.

    Returns:
        Tensor of normalised embeddings as produced by `F.normalize(..., p=2, dim=1)`.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        encoder_output = model(**batch)

    pooled = mean_pooling(encoder_output, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

# Embed every resume: split its text into sentences, encode them, and store the
# resulting tensor in the row's 'sentence_vectors' cell. All sentence vectors
# are also collected in the flat `sentence_embeddings` list.
sentence_embeddings = []

# The column must hold arbitrary Python objects (tensors). The cast does not
# depend on the row, so it is done once here instead of on every iteration.
resumes['sentence_vectors'] = resumes['sentence_vectors'].astype('object')

# Wrapping the iterator lets tqdm count every row, including the skipped ones;
# a manual `pbar.update(1)` after `continue` left the bar short of its total.
for index, row in tqdm(resumes.iterrows(), total=len(resumes)):
    try:
        sentence_vectors = get_embeddings(sentences=nltk.sent_tokenize(row['sentences']))
    except TypeError:
        # Presumably a non-text cell that the sentence splitter cannot handle;
        # the row keeps its empty 'sentence_vectors' and is dropped below.
        continue
    # `+=` with a tensor extends the list with one entry per sentence vector.
    sentence_embeddings += sentence_vectors
    resumes.at[index, 'sentence_vectors'] = sentence_vectors

# Remove the rows that could not be embedded.
resumes = resumes.dropna(subset=['sentence_vectors'])

100%|█████████▉| 164651/164671 [1:52:18<00:00, 24.43it/s]  


In [293]:
# Example profile set "Cato": a list holding one dict that describes a
# graphics-engine role. Each dict carries 'id_resume', 'url', 'company',
# 'position_name', a list of free-text 'sentences', and an empty
# 'sentence_vectors' placeholder.
# NOTE(review): the strings are model input; spelling such as 'postrocessing'
# is kept as written because changing it would change the embeddings.
example_resumes_Cato = [
{   'id_resume': 1,
    'url': '1',
    'company': '',
    'position_name': 'Graphics Engine Specialist',
    'sentences': [
        'Computer graphics API: Vulkan, DX12',
        'Worked on postrocessing stack for rendering engine',
        'Implemented GPU skinning animation',
        'Developed Vulkan rendering engine for style applications',
        'Making offline rendering real-time on Radeon GPU',
        'I am a graphics software engineer in a highly-focused Graphics Performance Analyzers team dealing with the design and development of Intel GPA(R)',
        'computer graphics algorithms in one or more technical fields in physical simulation, rendering,3D-modeling, animation',
        'deep understanding of GPU rendering technology principles and performance optimization methods',
        'development of related technical algorithms such as graphics engine modeling, animation, rendering, physical simulation, material, and spatial calculation',
        'Development rendering engine',
        'Development of 3D tools for working with the model',
        'Modification of the used 3D engine to new requirements',
    ],
        'sentence_vectors': [],
},
]


# Example profile set "Sun Stars resumes": a list holding one dict that
# describes a modeling-engineer profile focused on optics and microscopy.
# Same dict layout as the other example sets: 'id_resume', 'url', 'company',
# 'position_name', a list of free-text 'sentences', and an empty
# 'sentence_vectors' placeholder.
# NOTE(review): several strings end with a trailing space; they are model
# input and are kept exactly as written.
example_resumes_Sun_Stars_Resumes = [
{   'id_resume':1,
    'url': '1',
    'company': '',
    'position_name': 'Multidimensional high precision mathematical modeling engineer',
    'sentences': [
        'provide electron optics devices for multi e-beam lithography machine ',
        'Successfully transferred the production of three types MEMS electron optics devices',
        'Development of micro and nanostructures characterization capabilities ',
        'Development of characterization methods for optical components',
        'Extensive experience in surface analysis techniques: AFM, SNOM, STM, SEM, LEED, XPS/UPS, optical microscopy, single molecule FRET SNOM',
        'The building and tuning of different optical setups including fiber optics and lasers systems. ',
        'Mechanical design (using CAD: SolidWorks) of devices: heads for the scanning probe microscopes, the rotation system for samples in the chamber of the metal sputtering machine, setup for the fibers etching. ',
        'Simulation of dispersion of photonic crystal structures (Matlab) ',
        'Engineering of scanning systems for microscopes. ',
        'Software development for the operation of the atomic force microscope',
        'Experience in clean-room SEM/FIB/CVD (operating in an ultra-high vacuum (UHV)), knowledge of lithography process',
        'Development of polymer fiber tips for microscope',
        'Optical metrology and image processing algorithms development',
        'Characterization of flat diffractive optical element',
        'Developing and commercializing optical technologies for wide field-of-view imaging',
        'Established experimental methods for the electro-optical characterization increased the efficiency of design-fabrication-characterization chain',
        'Research of organic micro-optical devices, building of advanced optical characterization systems',
    ],
        'sentence_vectors': [],
},
]


# Example profile set "Sun Stars positions": a list of three dicts, one per
# target position (optical modelling, optical algorithms, radiation
# hydrodynamics). Each dict carries 'id_resume', 'url', 'company',
# 'position_name', a list of free-text 'sentences', and an empty
# 'sentence_vectors' placeholder.
# The 'position_name' values are later used as column names of the result
# table, so they should stay unique within the list.
# NOTE(review): the strings are model input; spelling such as 'simualtion',
# 'raidation' or 'presicion' is kept as written because changing it would
# change the embeddings.
example_resumes_Sun_Stars_positions = [
{   'id_resume':1,
    'url': '1',
    'company': '',
    'position_name': 'Optical Modelling & Simulation Specialist',
    'sentences': [
        'optical modelling and simulation capability to support the development of ultra high precision optical system',
        'optical simualtion software and implementing relevant numerical simulation techniques.',
        'Matlab, Python or C++.',
        'industrial R&D scientist, physical modeling design engineer',
        'semiconductor optical metrology',
        'researching the fundamental diffraction physics of the light-matter interaction at the nanoscale',
        'image and signal formation physics of the optical sensors',
        'degree in theoretical physics, complemented of academic research experience in the field of fundamental precision measurements',
        'laser interferometric gravitational-wave observatories, and on quantum optomechanics with micromechanical oscillators',
        'classical optics and photonics',
        'optical coherence, aberration theory and imaging',
        'diffraction and scattering theory',
        'microscopy, scatterometry and polarimetry',
        'semiconductor and nanostructure metrology',
        'quantum mechanics and quantum optics',
        'laser interferometry and optomechanics',
        'electromagnetic and optics simulations',
        'general relativity and gravitational-wave theory',
        'MATLAB coding and software development',
        'numerical simulations studied several YieldStar sensor concepts for the future in-device metrology applications',
        'Developed an approximate but fully analytical model of diffraction of light by diffraction gratings, and of the overlay signal formation in metrology targets and semiconductor device structures',
        'The developed theory suggested novel overlay inference algorithms in both Fourier and image domains for various applications',
        ' numerical simulations studied the conceptual design options for the optical sensor aimed at boosting the accuracy, robustness and precision of in-device after-etch overlay measurements',
        'optical sensor for the future after-litho overlay metrology system',
        ' semiconductor metrology needs, created a high-level overview of the key challenges in optical overlay inspection, and compiled a short-list of potential hardware and algorithmic solutions',
        'studied the dynamical radiation pressure effects in optomechanical systems',
        'optomechanical dynamics which can be observed in small-scale systems (can be applied for e.g. testing quantum mechanics of macroscopic objects, and improving the sensitivity of micro-sensors)',
        'Developed a theoretical model for the table-top experiment on optomechanical cooling of a micromechanical oscillator',
        ' developed quantum noise reduction techniques (displacement-noise-free interferometry, speed-meter topology, etc.) for the 3rd-generation pan-European Einstein Telescope laser gravitational-wave observatory',
        'Developed a MATLAB program with GUI for statistical analysis of the data flow from the cosmic-rays detector hardware of a space satellite',
        'Mathematical and numerical modeling',
        'Development of a program in MATLAB that simulates heating of a three-dimensional body by laser radiation',
        'Modeling of optical processes.',
        'Fiber laser modeling.',
        'Simulation of pulse mode of laser operation at mode synchronization by saturating absorbers, method of nonlinear evolution of pulse polarization, acousto-optic modulation',
        'Mathematical modeling in optical systems',
        'Mathematical modeling, program Mir physical models',
    ],
        'sentence_vectors': [],
},


{   'id_resume':2,
    'url': '1',
    'company': '',
    'position_name': 'Optical Algorithm & Simulation Specialist',
    'sentences': [
        'build the optical algorithm & simulation capability to support the development of ultra high precision optical system',
        'computational mathematics, applied mathematics, physical optics, and computational optics',
        'knowledge of geometric, physical, and micronano optical systems, and have strong sequential, non-sequence, physical, and micronano optical modeling capabilities',
        'optical precision measurement algorithms, be familiar with FIR, FFT, and other signal processing knowledge, and have extensive experience in signal analysis and processing.',
        'numerical linear algebra, finite element, partial differential equation, large-scale matrix calculation, and solver',
        'optical simualtion software and implementing relevant numerical simulation techniques',
        'experience in light tracing, scalar or vector diffraction analysis, FDTD, FEM, and optical-related multi-physical-field coupling algorithms or software development',
        'parallel computing frameworks or environments such as OpenMP, MPI, CUDA, and OpenCL',
        'low temperature plasma devices (PVD systems, Hall-effect thrusters/ion sources, and dusty plasmas)',
        'Lead optical engineer for residual radiation management in laser-produced plasma-based EUV sources',
        'optical radiation load, glint, unwanted radiation towards scanne',
        'calculate EUV collection and shaping, laser radiation load, tracing of ballistic particles for contamination prevention',
        'creation of a complex multitool thermo-opto-mechanical model involving ANSYS, ZEMAX, Matlab, SigFit for CO2 laser beam quality in high power mirror system',
        'Matlab for complex simulation result post-processing and for analysis automation',
        'Matlab code for physical optics propagation-based calculation and merged it with Zemax ray tracing engine',
        'Performed various simulation and analysis tasks in the area of high-power IR and EUV light propagation in mirror systems',
        'LED-based illumination optics development (system design, simulation using TracePro and MATLAB)',
        'Development of algorithms for laser printer optics simulation tool',
        'research also in Atomic, Molecular and Optical Physics, Experimental Physics and Optics',
        'Research in the field of Atomic Physics, High Energy Physics, and Fundamental Symmetries',
        'optical engineers and laser scientists developing new technologies for high power lasers, beam delivery, focusing and steering in laser-produced plasma EUV source',
        'led technology development and demonstration of core laser system module enabling high volume EUV manufacturing and future power scaling',
        'major drive laser architectural change for LPP EUV source based on solid state laser systems',
        'implemented EUV source trigger emulator for stand-alone performance qualification of laser system',
        'Developed Matlab toolboxes for analysis of terabytes of EUV source performance data',
        'Led design, build, integration into an EUV sources and performance characterization of novel high-power seed systems based on CO2 MOPA laser architecture',
        'Built complex high power opto-mechanical breadboard systems',
        'laser system for immersion lithography',
        'excimer laser system for immersion lithography',
        'Created system performance breakdown, defined feasible changes for optical, controls, thermal, metrology subsystems ',
        'Optical information processing using nonlinear wave mixing with ultrashort pulse lasers. Heterodyne interferometry using femtosecond pulse lasers',
        'Professional experience in theoretical (physics, mathematics) and computational (mathematical modeling, computational methods)',
        'Development of numerical methods and coding of physical processes for engineering and scientific problems of numerical modeling',
        'Numerical modeling of processes: solid-state heating, process of absorption of laser radiation by a substance of study propagation in different media, diffraction phenomena taking into account the presence of aberrations of the radiation beam in the software package MATLAB.',
        'Formation and conduction of research in the field of laser physics, fiber lasers, interaction of laser radiation with substance',
        'Development and improvement of the mathematical core of the hydrodynamic simulator.',
        'Research of optical properties of materials.',
        'Design, assembly and testing of fiber-optic components, fiber lasers.',
        'Development of instruments and systems based on integrated optics.',
        'Analysis of Mir trends in photonics and integrated optics to prepare proposals for further work.',
    ],
        'sentence_vectors': [],
},



{   'id_resume':3,
    'url': '1',
    'company': '',
    'position_name': 'Radiation Hydrodynamics modeling and simulation  architect',
    'sentences': [
        'radiation hydrodynamics modeling and simulation relating to laser-plasma interactions or discharge-plasma interaction, with profound studies on laser ablation',
        'raidation transport and hydrodynamics, plasma spatiotemporal evolution and resulted recoil presure, and abilitiy of experimental data analysis and modeling validation',
        'Deliver high presicion and high robustness modeling and simulation codes by applying accurate equation of state and opacity data table, resonable assumptions of the complex physical mechanisms, accurate boundary condition difinitions and numerical methods into the simulation architecture',
        'plasma physics, radiation hydrodynamics, liquid hydrodynamics',
        'code writing experience on laser-induced plasma, laser ablation, radiation transport and hydrodynamics',
        'running modeling and simulation of radiation hydrodynamics programs',
        'use of data analysis tools such as Fortran, C++, Matlab, Python',
        'Mathematical & computer modeling',
        'Plasma dynamics modeling',
        'Computation of atomic processes in plasma',
        'EUV and soft X-ray sources modelling',
        'Research of fluid dynamics and conductive fluid dynamics',
        'Fluid dynamo modelling',
        'Fluid dynamics computation',
        'Research and computation of influence of atomic processes in plasma',
        'Computation of atomic processes in non-equilibrium plasma',
        'Plasma surface interaction and optics lifetime',
        'Specialties: X-ray EUV optics, X-ray tomography, multilayer mirrors, X-ray microscopy',
        'Computational methods in the field of high-temperature plasma physics.',
        'numerical modeling of processes in plasma',
        'work in the field of laser-plasma interaction, development of numerical models of physical processes',
    ],
        'sentence_vectors': [],
},
]

In [294]:
# Choose which set of example profiles the following cells work with.
# This binds a second name to the same list object (no copy is made), so
# anything written into its dicts through `example_resumes` is also visible
# through `example_resumes_Sun_Stars_positions`.
example_resumes = example_resumes_Sun_Stars_positions

In [295]:
# Embed the sentences of every example profile and store the resulting tensor
# on the profile dict under 'sentence_vectors'.
# The example vectors are collected in their own list instead of being appended
# to the global `sentence_embeddings`: appending made this cell non-idempotent,
# so every re-run (for instance after switching the example set) left stale or
# duplicated example vectors in the data handed on via `vectors`.
example_embeddings = []
for exa_resume in example_resumes:
    sentence_vectors = get_embeddings(sentences=exa_resume['sentences'])
    # `+=` with a tensor extends the list with one entry per sentence vector.
    example_embeddings += sentence_vectors
    exa_resume['sentence_vectors'] = sentence_vectors

# Flat list of NumPy vectors: all resume sentence vectors followed by the
# example sentence vectors (same content and order as a first run of the
# previous version of this cell).
vectors = [np.array(sent_vec) for sent_vec in sentence_embeddings]
vectors += [np.array(sent_vec) for sent_vec in example_embeddings]

example_resumes

[{'id_resume': 1,
  'url': '1',
  'company': '',
  'position_name': 'Optical Modelling & Simulation Specialist',
  'sentences': ['optical modelling and simulation capability to support the development of ultra high precision optical system',
   'optical simualtion software and implementing relevant numerical simulation techniques.',
   'Matlab, Python or C++.',
   'industrial R&D scientist, physical modeling design engineer',
   'semiconductor optical metrology',
   'researching the fundamental diffraction physics of the light-matter interaction at the nanoscale',
   'image and signal formation physics of the optical sensors',
   'degree in theoretical physics, complemented of academic research experience in the field of fundamental precision measurements',
   'laser interferometric gravitational-wave observatories, and on quantum optomechanics with micromechanical oscillators',
   'classical optics and photonics',
   'optical coherence, aberration theory and imaging',
   'diffraction 

In [296]:
# Score every resume against every example profile, direction "resume -> example":
# for each resume sentence take its best cosine match among the example's
# sentences, then average these best matches over the resume's sentences.
# 'itog' is the highest of the per-example scores of a resume.
#
# The pairwise similarities of one (resume, example) pair are computed with a
# single `cosine_similarity` call on the two sentence matrices instead of one
# call per sentence pair inside nested Python loops. The scores are the same
# up to floating-point rounding.
example_resumes_names = [example_resume['position_name'] for example_resume in example_resumes]

# Convert the example tensors once; they do not change between resumes.
example_matrices = [np.asarray(example_resume['sentence_vectors']) for example_resume in example_resumes]

itog = []
for index, row in tqdm(resumes.iterrows(), total=len(resumes)):
    resume_matrix = np.asarray(row['sentence_vectors'])
    itog_for_resume = []
    for example_matrix in example_matrices:
        # Rows: resume sentences, columns: example sentences.
        similarities = cosine_similarity(resume_matrix, example_matrix)
        # Best example match per resume sentence, averaged over the resume.
        itog_for_resume.append(float(similarities.max(axis=1).mean()))
    itog_for_resume.append(max(itog_for_resume))
    itog_for_resume.append(row['url'])
    itog_for_resume.append(row['position_name'])
    itog.append(itog_for_resume)

# One column per example profile, followed by the overall score and the
# identifying fields of the resume.
example_resumes_names.append('itog')
example_resumes_names.append('url')
example_resumes_names.append('position name')
result_sim = pd.DataFrame(itog, columns=example_resumes_names)

result_sim = result_sim.sort_values(by='itog', ascending=False)
result_sim

100%|██████████| 164651/164651 [2:34:00<00:00, 17.82it/s]  


Unnamed: 0,Optical Modelling & Simulation Specialist,Optical Algorithm & Simulation Specialist,Radiation Hydrodynamics modeling and simulation architect,itog,url,position name
83106,1.000000,0.640448,0.566705,1.000000,https://hh.ru/resume/811efe5d000570ab9200000b3...,Старший лаборант
162393,0.326282,0.503870,0.945571,0.945571,https://hh.ru/resume/fc3bd9a400011ac6ec00000b3...,"технический инженер, контрактер"
277,0.617468,0.889848,0.570856,0.889848,https://hh.ru/resume/007c8b5f0008d8203c00000b3...,Научный сотрудник
141465,0.848499,0.462937,0.887668,0.887668,https://hh.ru/resume/dbaaad9f0005b201b800000b3...,Лаборант
35263,0.848499,0.462937,0.887668,0.887668,https://hh.ru/resume/37a29f0c0005b5fd4500000b3...,Математик
...,...,...,...,...,...,...
108595,0.018364,0.038480,0.013000,0.038480,https://hh.ru/resume/a910849600039b279b00000b3...,Важатый
160356,0.037643,0.027486,0.029925,0.037643,https://hh.ru/resume/f8eac4c400010bdb9900000b3...,--
60777,0.018600,0.020554,0.027320,0.027320,https://hh.ru/resume/5ed40eee0003e6923e00000b3...,Продавец-кассир
95864,0.022103,0.012583,-0.012081,0.022103,https://hh.ru/resume/94d139af00010ec9fb00000b3...,Старший специалист отдела Информатизации


In [300]:
# Save the first 100 rows of the sorted similarity table (the highest 'itog'
# scores) to an Excel workbook in the results folder.
for_excel = result_sim.head(100)
for_excel.to_excel('../results/SunStars.xlsx')

In [178]:
# Project the sentence vectors onto their first three principal components and
# save the result for external visualisation.
# NOTE: this cell needs `vectors`, which is built in the cell that embeds the
# example profiles; run that cell first on a fresh kernel.
# `random_state` is fixed so that the projection is reproducible when PCA
# selects a randomized SVD solver.
pca = PCA(n_components=3, random_state=0)

embeddings_pca = pca.fit_transform(vectors)
vectors_for_vis = pd.DataFrame(embeddings_pca)
vectors_for_vis.to_csv('./vectors.csv')

In [302]:
# Score every resume against every example profile, direction "example -> resume":
# for each example sentence take its best cosine match among the resume's
# sentences, then average these best matches over the example's sentences.
# 'itog' is the highest of the per-example scores of a resume.
#
# The pairwise similarities of one (resume, example) pair are computed with a
# single `cosine_similarity` call on the two sentence matrices instead of one
# call per sentence pair inside nested Python loops. The scores are the same
# up to floating-point rounding.
example_resumes_names = [example_resume['position_name'] for example_resume in example_resumes]

# Convert the example tensors once; they do not change between resumes.
example_matrices = [np.asarray(example_resume['sentence_vectors']) for example_resume in example_resumes]

itog = []
for index, row in tqdm(resumes.iterrows(), total=len(resumes)):
    resume_matrix = np.asarray(row['sentence_vectors'])
    itog_for_resume = []
    for example_matrix in example_matrices:
        # Rows: resume sentences, columns: example sentences.
        similarities = cosine_similarity(resume_matrix, example_matrix)
        # Best resume match per example sentence, averaged over the example.
        itog_for_resume.append(float(similarities.max(axis=0).mean()))
    itog_for_resume.append(max(itog_for_resume))
    itog_for_resume.append(row['url'])
    itog_for_resume.append(row['position_name'])
    itog.append(itog_for_resume)

# One column per example profile, followed by the overall score and the
# identifying fields of the resume.
example_resumes_names.append('itog')
example_resumes_names.append('url')
example_resumes_names.append('position name')
result_sim = pd.DataFrame(itog, columns=example_resumes_names)

result_sim = result_sim.sort_values(by='itog', ascending=False)
result_sim

100%|██████████| 164651/164651 [2:42:02<00:00, 16.93it/s]  


Unnamed: 0,Optical Modelling & Simulation Specialist,Optical Algorithm & Simulation Specialist,Radiation Hydrodynamics modeling and simulation architect,itog,url,position name
76491,0.157867,0.177073,0.501020,0.501020,https://hh.ru/resume/76e1be8a0002e02af300000b3...,Младший научный сотрудник
146947,0.187240,0.234031,0.472483,0.472483,https://hh.ru/resume/e3ee662c00045c884400000b3...,"technical Engineer, contractor"
162393,0.170400,0.199586,0.465338,0.465338,https://hh.ru/resume/fc3bd9a400011ac6ec00000b3...,"технический инженер, контрактер"
162394,0.360598,0.345168,0.460314,0.460314,https://hh.ru/resume/fc3bd9a400011ac6ec00000b3...,старший научный сотрудник
18424,0.167696,0.210512,0.459608,0.459608,https://hh.ru/resume/1c83ca77000713241100000b3...,лаборант исследователь
...,...,...,...,...,...,...
108595,-0.066907,-0.058559,-0.056672,-0.056672,https://hh.ru/resume/a910849600039b279b00000b3...,Важатый
40355,-0.093816,-0.077256,-0.058526,-0.058526,https://hh.ru/resume/3f7850870003e00a5e00000b3...,Член бригады ресторана
95864,-0.058750,-0.072122,-0.096078,-0.058750,https://hh.ru/resume/94d139af00010ec9fb00000b3...,Старший специалист отдела Информатизации
164628,-0.093667,-0.086539,-0.059957,-0.059957,https://hh.ru/resume/fffac4d10008b9842c00000b3...,Преподаватель Python
