# Job descriptions clustering

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
import mglearn

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
data = pd.read_csv('../data/data job posts.csv')

In [23]:
data.head(2)

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False


In [4]:
data['jobpost'].isnull().any()

False

In [7]:
data = data.drop_duplicates('jobpost')

In [8]:
data.duplicated('jobpost').any()

False

## Vectorization

In [9]:
def tokenizer(text):
    return [lemmatizer.lemmatize(w.lower()) for sent in sent_tokenize(text) for w in word_tokenize(sent) if w.isalpha() and len(w) > 2]

In [16]:
cv = CountVectorizer(stop_words='english', min_df=2, max_df=0.5, max_features=20000, tokenizer=lambda text:tokenizer(text))
X = cv.fit_transform(data['jobpost'])

In [10]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.5, max_features=20000, tokenizer=lambda text:tokenizer(text))
X_tf = tfidf.fit_transform(data['jobpost'])

## LDA

In [17]:
lda = LatentDirichletAllocation(learning_method="online", max_iter=30, n_components=30)
topics = lda.fit_transform(X)

In [18]:
for i, topic in enumerate(topics[:50]):
    print('main topic: {} \njob title: {}'.format(np.argmax(topic), data['Title'][i]))
    print()

topic: 11 
job title: Chief Financial Officer

topic: 5 
job title: Full-time Community Connections Intern (paid internship)

topic: 0 
job title: Country Coordinator

topic: 20 
job title: BCC Specialist

topic: 18 
job title: Software Developer

topic: 13 
job title: Saleswoman

topic: 12 
job title: Chief Accountant/ Finance Assistant

topic: 5 
job title: Non-paid part or full time Programmatic Intern

topic: 10 
job title: Assistant to Managing Director

topic: 6 
job title: Program Assistant (INL), FSN-8; FP-6*

topic: 5 
job title: Short-Term Travel Grants (STG) Program

topic: 5 
job title: Non-paid part or full time Administrative Intern

topic: 11 
job title: Chief of Party (COP)

topic: 20 
job title: Community Development, Capacity Building and Conflict

topic: 11 
job title: General Manager

topic: 17 
job title: Network Administrator

topic: 13 
job title: Utopian World Championship 2004

topic: 20 
job title: Country Economist (NOB)

topic: 20 
job title: Driver/ Logisti

KeyError: 42

In [47]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
features = np.array(cv.get_feature_names())

mglearn.tools.print_topics(topics=range(30), feature_names=features
sorting=sorting, topics_per_chunk=5, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
medical       office        recruitment   management    marketing     
representativeadministrativefashion       ensure        market        
pharmaceuticalassistant     fao           business      business      
visit         document      official      process       development   
doctor        maintain      curriculum    manage        develop       
medicine      duty          contact       manager       product       
office        provide       vitae         plan          strategy      
healthcare    assist        distributor   develop       plan          
university    perform       zeppelin      activity      research      
interpersonal meeting       information   performance   new           
information   staff         adviser       staff         activity      
regular       correspondencearka          policy        advertising   
le    

## KMeans

In [11]:
km_10 = KMeans(n_clusters=15)
km_10.fit(X_tf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [24]:
order_centroids = km_10.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
for i in range(15):
    print('Cluster {}:'.format(i))
    for ind in order_centroids[i, :30]:
        print(' {}'.format(terms[ind]), end='')
    print()

Cluster 0:
 accounting accountant tax financial chief report prepare account finance legislation payment cash bank transaction perform software duty statement standard monthly control related reporting maintain llc submit payable office cjsc annual
Cluster 1:
 office llc education responsible send service manager information administrative field higher computer management relevant international open assistant business cjsc design team customer university plus line provide resume email long training
Cluster 2:
 network server database technical administrator security support data sql hardware window administration software service information equipment maintenance responsible infrastructure technology installation computer cjsc install management configuration maintain field operation backup
Cluster 3:
 medical pharmaceutical representative doctor medicine pharmacist product pharmacy visit hospital drug photo organize servier presentation regular laboratoires pay clinic promote marketin