# Job descriptions clustering

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
import mglearn

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
data = pd.read_csv('../data/data job posts.csv')

In [23]:
data.head(2)

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False


In [4]:
data['jobpost'].isnull().any()

False

In [5]:
data = data.drop_duplicates('jobpost')

In [6]:
data.duplicated('jobpost').any()

False

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(data["jobpost"], data["Title"], test_size=0.1)

## Vectorization

In [42]:
def tokenizer(text):
    return [lemmatizer.lemmatize(w.lower()) for sent in sent_tokenize(text) for w in word_tokenize(sent) if w.isalpha() and len(w) > 2]

In [43]:
cv = CountVectorizer(stop_words='english', min_df=2, max_df=0.5, max_features=20000, tokenizer=lambda text:tokenizer(text))
X = cv.fit_transform(data['jobpost'])

In [35]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.5, max_features=20000, tokenizer=lambda text:tokenizer(text))
X_tf = tfidf.fit_transform(data['jobpost'])

## LDA

In [44]:
lda = LatentDirichletAllocation(learning_method="online", max_iter=30, n_components=30)
topics = lda.fit_transform(X)

In [17]:
for i, topic in enumerate(topics[:50]):
    print('topic: {} \njob title: {}'.format(np.argmax(topic), data['Title'][i]))
    print()

topic probabilities: 0 
job title: Chief Financial Officer

topic probabilities: 2 
job title: Full-time Community Connections Intern (paid internship)

topic probabilities: 6 
job title: Country Coordinator

topic probabilities: 6 
job title: BCC Specialist

topic probabilities: 3 
job title: Software Developer

topic probabilities: 8 
job title: Saleswoman

topic probabilities: 0 
job title: Chief Accountant/ Finance Assistant

topic probabilities: 2 
job title: Non-paid part or full time Programmatic Intern

topic probabilities: 8 
job title: Assistant to Managing Director

topic probabilities: 1 
job title: Program Assistant (INL), FSN-8; FP-6*

topic probabilities: 2 
job title: Short-Term Travel Grants (STG) Program

topic probabilities: 2 
job title: Non-paid part or full time Administrative Intern

topic probabilities: 5 
job title: Chief of Party (COP)

topic probabilities: 6 
job title: Community Development, Capacity Building and Conflict

topic probabilities: 8 
job title: 

KeyError: 42

In [28]:
features = cv.get_feature_names()
sortings = np.argsort(lda.components_, axis=1)[:, ::-1]
for i, sort in enumerate(sortings):
    print('Topic {}'.format(i))
    print('--------------')
    for indx in sort[:30]:
        print(features[indx])

Topic 0
--------------
attitude
confirming
attracted
expend
laghajanyan
ccm
beijing
artsakhbank
evaluatior
continually
sapling
census
shelf
qas
specialty
managing
oldest
portfolio
kahavorum
scotland
frenchise
ucs
recruitarmenia
aptitude
evangelism
analysing
vxsoft
rtos
detached
discarded
Topic 1
--------------
sdas
continually
lover
shelf
qsquantum
kurt
lotus
loaning
bing
rtos
census
telegate
eunp
cyprus
beirut
loan
haymamul
ncfas
ccm
klaus
loaded
advertises
stornquist
predominant
streamlined
oldest
struggling
polishing
shahamiryans
preselling
Topic 2
--------------
sold
daban
curatorial
czech
webcall
testcompleat
teamviewer
tevaresume
technolinguistics
ea
rabbitmq
quite
tectonics
inconcept
kurt
sra
chance
coorditation
preselling
qsquantum
shahinyan
cyprus
som
biomass
postion
reconfirm
toolkit
synthetic
flooding
shelf
Topic 3
--------------
edication
breakpoint
ophthalmologic
araratfood
chemoincs
neonatal
archivation
enroll
rome
glow
optimising
episode
democratization
wva
digging
downt

In [47]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(cv.get_feature_names())

mglearn.tools.print_topics(topics=range(30), feature_names=feature_names,
sorting=sorting, topics_per_chunk=5, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
medical       office        recruitment   management    marketing     
representativeadministrativefashion       ensure        market        
pharmaceuticalassistant     fao           business      business      
visit         document      official      process       development   
doctor        maintain      curriculum    manage        develop       
medicine      duty          contact       manager       product       
office        provide       vitae         plan          strategy      
healthcare    assist        distributor   develop       plan          
university    perform       zeppelin      activity      research      
interpersonal meeting       information   performance   new           
information   staff         adviser       staff         activity      
regular       correspondencearka          policy        advertising   
le    

## KMeans

In [40]:
km_10 = KMeans(n_clusters=10)
km_10.fit(X_tf)

KeyboardInterrupt: 

In [39]:
order_centroids = km_10.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
for i in range(10):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :30]:
        print(' %s' % terms[ind], end='')
    print()

AttributeError: 'KMeans' object has no attribute 'cluster_centers_'

In [36]:
km_5 = KMeans(n_clusters=5)
km_5.fit(X_tf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [38]:
order_centroids = km_5.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: accounting financial accountant tax finance report chief prepare account legislation
Cluster 1: software development developer design web test team testing engineer java
Cluster 2: project program development management training international implementation activity community office
Cluster 3: sale customer marketing llc service responsible manager product office business
Cluster 4: bank credit banking loan cjsc form financial branch customer attachment
