# Word2Vec - oferty pracy

In [1]:
import pandas as pd

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

from ast import literal_eval

In [2]:
ls -lh data

total 247M
-rwxr--r-- 1 cherit users 128M Sep  4 22:53 [0m[01;32mjob_ofer.csv[0m*
-rwxr--r-- 1 cherit users  38M Sep  4 22:54 [01;32mjob_ofer.csv.tar.gz[0m*
-rwxr--r-- 1 cherit users  83M Sep  3 10:30 [01;32mreviews_data.txt.gz[0m*


In [3]:
df = pd.read_csv('data/job_ofer.csv')

In [4]:
df.shape

(36109, 8)

In [5]:
df.head()

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


# Word2Vec

In [6]:
corpus = df['title'].map(simple_preprocess)

In [7]:
corpus

0                            [machine, learning, engineer]
1           [deep, learning, applied, researcher, chicago]
2                            [machine, learning, engineer]
3                     [machine, learning, data, scientist]
4                                       [cloud, architect]
5                                        [data, scientist]
6                                     [store, room, clerk]
7                              [director, of, product, us]
8               [recruiting, manager, ad, census, ext, gb]
9        [bilingual, engineer, german, germany, or, swi...
10                                             [sommelier]
11       [entry, level, project, manager, shelton, ct, ...
12       [finance, manager, firestone, industrial, prod...
13        [us, lcra, cardiovascular, remote, anywhere, in]
14          [gallagher, bassett, corporate, intern, legal]
15                               [us, head, of, marketing]
16                                  [visual, merchandise

In [8]:
model = Word2Vec(corpus, size=100, window=2, min_count=1)

In [9]:
model.wv.most_similar('machine')

[('deep', 0.9483935832977295),
 ('edge', 0.8585814237594604),
 ('scientists', 0.8576443195343018),
 ('inference', 0.846896767616272),
 ('acceleration', 0.8403028249740601),
 ('computer', 0.8299179077148438),
 ('big', 0.8258348703384399),
 ('three', 0.8208028078079224),
 ('captivate', 0.8207422494888306),
 ('genomic', 0.81876140832901)]

# Przykład

In [10]:
corpus = [
    ['a', 'b', 'c'],
    ['a', 'b', 'x'],
    ['y','a', 'b', 'w'],
    ['q','a', 'b', 'u','k'],
]
bigram = Phraser(Phrases(corpus, min_count=1, threshold=1))
bigram[['k','a','b','c']]

['k', 'a_b', 'c']

# Title + phrases

In [11]:
title_corpus = df['title'].map(simple_preprocess)
title_bigram = Phraser(Phrases(title_corpus, min_count=1, threshold=1))


In [12]:
title_corpus_phrase = [title_bigram[sent] for sent in title_corpus]
model = Word2Vec(title_corpus_phrase, size=100, window=2, min_count=1)

In [13]:
model.wv.most_similar('machine')

[('bc', 0.9664468765258789),
 ('technology_services', 0.9651788473129272),
 ('nyc_new', 0.9632802605628967),
 ('advisor_ontario', 0.9632721543312073),
 ('capital', 0.9631308913230896),
 ('paralegal', 0.9625399112701416),
 ('acquisition', 0.9624183177947998),
 ('human_resource', 0.9618913531303406),
 ('soho', 0.961529016494751),
 ('officer_singapore', 0.9614496231079102)]

In [14]:
def prepare_corpus(corpus, bigram):
    for sent in corpus:
        yield bigram[sent] + sent

In [15]:
ext_corp = list(prepare_corpus(title_corpus, title_bigram))
title_model = Word2Vec(ext_corp, size=100, window=2, min_count=1)

In [16]:
title_model.wv.most_similar('machine_learning')

[('rockville', 0.9911437034606934),
 ('computer_vision', 0.9902132749557495),
 ('learning', 0.9825021028518677),
 ('artificial_intelligence', 0.976567268371582),
 ('deep_learning', 0.9759215712547302),
 ('nlp', 0.9733071327209473),
 ('data_engineer', 0.969058632850647),
 ('deep', 0.9676706790924072),
 ('mining', 0.9637101888656616),
 ('ml', 0.9621261358261108)]

# Description

In [17]:
simple_preprocess(df.sample()['description'].values[0])

['leading',
 'global',
 'data',
 'and',
 'analytics',
 'company',
 'is',
 'initiating',
 'transformation',
 'of',
 'their',
 'cs',
 'department',
 'to',
 'work',
 'strategically',
 'with',
 'their',
 'clients',
 'across',
 'some',
 'of',
 'the',
 'largest',
 'banks',
 'financial',
 'institutions',
 'healthcare',
 'companies',
 'and',
 'other',
 'global',
 'corporations',
 'as',
 'such',
 'they',
 'are',
 'looking',
 'to',
 'hire',
 'director',
 'of',
 'customer',
 'success',
 'this',
 'company',
 'product',
 'is',
 'one',
 'of',
 'the',
 'only',
 'open',
 'end',
 'solutions',
 'that',
 'allows',
 'customers',
 'to',
 'discover',
 'and',
 'extract',
 'value',
 'out',
 'their',
 'data',
 'that',
 'currently',
 'available',
 'on',
 'the',
 'market',
 'this',
 'saas',
 'technology',
 'is',
 'changing',
 'the',
 'way',
 'multiple',
 'departments',
 'across',
 'the',
 'fortune',
 'industry',
 'utilize',
 'their',
 'data',
 'to',
 'make',
 'transformative',
 'and',
 'impactful',
 'advancement

In [18]:
descr_corpus = df['description'].map(simple_preprocess)
descr_bigram = Phraser(Phrases(title_corpus, min_count=1, threshold=1))

In [19]:
# ext_corp = list(prepare_corpus(descr_corpus, descr_bigram))
descr_model = Word2Vec(ext_corp, size=100, window=2, min_count=1)

In [20]:
descr_model.wv.most_similar('python')

[('python_developer', 0.9856449961662292),
 ('senior_react', 0.9807460904121399),
 ('react', 0.9772253632545471),
 ('ios', 0.9742727279663086),
 ('exadata', 0.9733853340148926),
 ('core', 0.9715838432312012),
 ('android', 0.9713596701622009),
 ('java', 0.9712833166122437),
 ('servicenow', 0.9708343148231506),
 ('database', 0.9693511724472046)]

In [21]:
title_model.wv.most_similar('python')

[('java', 0.9791979789733887),
 ('android', 0.9709876775741577),
 ('backend', 0.9688427448272705),
 ('ios', 0.9669768810272217),
 ('sfdc', 0.9667401909828186),
 ('computer_vision', 0.9659461975097656),
 ('sr_java', 0.962591826915741),
 ('react', 0.9625830054283142),
 ('core', 0.9623101949691772),
 ('database', 0.9612717628479004)]

In [22]:
df.sample()['description'].map(literal_eval).values

array([list(['Position ID: M-13-B03', 'Position/Project Name: Mission Capability Systems Engineer', 'The Mission (Technology Insertion) Capability Engineer (MCE) will provide engineering support for the NGA FAST Trak Missionization initiative supporting NGA/TD GEOINT Services Office. The MCE will lead the integration of traditional, non-traditional, and emerging innovative solutions through Platform as a Service (PaaS) implementation in cloud environments. The MCE will assess Enterprise CONOPS and work flows using agile SE&I processes to refine mission needs into implementable GEOINT capabilities, services, and technology insertions, and, enable new ways to effectively satisfy mission needs. The MCE ensures the implementation for specific mission capability projects to achieve established objectives. The MCE provides requisite capabilities and services to meet mission and GEOINT community user needs and complies with the enterprise technical architecture and security standards, while k