In [96]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from time import time
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as pl
import matplotlib.patches as mpatches
from sklearn import preprocessing
from nltk.stem.porter import *
import nltk
import scipy as sp
from sklearn import random_projection
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import TruncatedSVD
from nltk import ngrams
from google.cloud import translate
import urllib
import time
from google.oauth2 import service_account


import pandas as pd
import numpy as np
import string
from gensim.models import word2vec
import re
import datetime

In [108]:
#BASIC PREPROCESSING
def basic_cleaner(text, language=None, skill_match = None):
    """
    Purpose of this function to clean the appropriate texts 
    with language specific stemming

    Parameters
    ----------
    text : str
        Raw text to clean.
    language : str, optional
        Language code selecting the stemmer: None or 'en' (Porter),
        'sv', 'da' or 'fi' (Snowball).
    skill_match : optional
        When left as None, characters are filtered before tokenizing
        (everything except ASCII letters/whitespace for English, every
        non-word character otherwise). Any other value skips that step.

    Returns
    -------
    str
        Lower-cased, tokenized and stemmed text, tokens joined by single spaces.

    Raises
    ------
    ValueError
        If no stemmer is configured for ``language``.
    """
    snowball_languages = {'sv': 'swedish', 'da': 'danish', 'fi': 'finnish'}
    is_english = language is None or language == 'en'

    if is_english:
        stemmer = PorterStemmer()
    elif language in snowball_languages:
        stemmer = SnowballStemmer(snowball_languages[language])
    else:
        # Previously this fell through with no stemmer bound and failed later
        # with an UnboundLocalError; fail early with an explicit message instead.
        raise ValueError(
            'basic_cleaner: unsupported language {!r}; expected one of {}'.format(
                language, ['en'] + sorted(snowball_languages)))

    if skill_match is None:
        if is_english:
            text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        else:
            text = re.sub(r'[^\w]', ' ', text)

    text = text.lower()
    text = text.strip()
    tokens = wpt.tokenize(text)
    # stem every token (no stopword filtering is applied here)
    tokens = [stemmer.stem(token) for token in tokens]
    # re-create the document from the stemmed tokens
    return ' '.join(tokens)

In [109]:
def english_main_classifier(dataframe, job_title, job_desc):
    """
    Purpose of this function to classify english job roles to the ESCO taxonomy

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to classify; must contain the ``job_title`` and ``job_desc`` columns.
    job_title : str
        Name of the column holding the job title text.
    job_desc : str
        Name of the column holding the job description text.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and the added columns
        ``<job_title>_cln``, ``<job_desc>_cln``, ``prediction`` and ``max_prob``.
        The frame passed in is left unmodified.

    Notes
    -----
    Loads the fitted artifacts 'tf_idf_role', 'tf_idf_desc', 'rp',
    'english_classifier_joblib' and 'label_encoder' from the working directory.
    """
    # drop=True discards the old index instead of materialising it as an 'index'
    # column that has to be deleted again (which failed for a named index or when
    # an 'index' column already existed). It also returns a new frame, so the
    # caller's frame is not mutated by the column assignments below.
    dataframe = dataframe.reset_index(drop=True)
    title_cln = job_title + '_cln'
    desc_cln = job_desc + '_cln'

    #Preprocessing
    print('cleaning of texts started : {}'.format(datetime.datetime.now()))
    dataframe[title_cln] = dataframe[job_title].apply(basic_cleaner)
    dataframe[desc_cln] = dataframe[job_desc].apply(basic_cleaner)
    print('cleaning of texts finished : {}'.format(datetime.datetime.now()))

    #Generating features from job roles - loading already fitted tfidf
    print('vectorization, and random projection started : {}'.format(datetime.datetime.now()))
    tfidf_role = joblib.load('tf_idf_role')
    tf_idf_res_role = tfidf_role.transform(dataframe[title_cln])

    #Generating features from job descriptions - loading already fitted tfidf
    tfidf_desc = joblib.load('tf_idf_desc')
    tf_idf_res_desc = tfidf_desc.transform(dataframe[desc_cln])

    #applying Random Projection, an unsupervised learning method for dimensionality reduction
    rp = joblib.load('rp')
    tfidf_desc_rand_proj = rp.transform(tf_idf_res_desc)

    #combining feature matrixes
    tf_idf_final = sp.sparse.hstack((tf_idf_res_role, tfidf_desc_rand_proj))

    print('vectorization, and random projection finished : {}'.format(datetime.datetime.now()))

    #loading trained classifier and the encoder mapping class ids back to labels
    print('classification started : {}'.format(datetime.datetime.now()))
    clf = joblib.load('english_classifier_joblib')
    label_encoder = joblib.load('label_encoder')

    predicted_labels = label_encoder.inverse_transform(clf.predict(tf_idf_final))

    #Assigning probability to the respective predictions:
    #the highest class probability of each row, computed vectorised
    max_prob = clf.predict_proba(tf_idf_final).max(axis=1)

    results = pd.DataFrame({'prediction': predicted_labels, 'max_prob': max_prob},
                           columns=['prediction', 'max_prob'])

    #Finalizing results - both frames carry a 0..n-1 index, so this is a positional join
    dataframe = pd.merge(dataframe, results, how='left', left_index=True, right_index=True)

    print('classification finished : {}'.format(datetime.datetime.now()))
    return dataframe
    
    


In [110]:
def identify_language(text, delay=1):
    """
    Purpose of this function to detect the foreign languages 
    in the text

    Parameters
    ----------
    text : str
        Text sent to the language detection call of ``translate_client``.
    delay : int or float, optional
        Seconds to sleep after the call (default 1, the previously
        hard-coded value). Pass 0 to disable the pause.

    Returns
    -------
    The ``'language'`` entry of the detection result.
    """
    lang = translate_client.detect_language(text)
    # pause between consecutive calls - presumably to throttle API usage
    if delay:
        time.sleep(delay)
    return lang['language']

In [111]:
def translate_text(text, language):
    """
    Function for using Cloud Based Google translate

    Parameters
    ----------
    text : str
        Text to translate.
    language : str
        Language code of ``text`` (passed as the source language).

    Returns
    -------
    The ``'translatedText'`` entry of the API response (English translation).
    """
    translation = translate_client.translate(
        text,
        source_language=language,
        target_language='en',
        # Treat the input as plain text. The API default is HTML, in which case
        # the result comes back with HTML entities (e.g. &#39; or &amp;) that
        # end up as junk tokens after cleaning.
        format_='text')

    return translation['translatedText']

In [112]:
def multi_lang_classifier(dataframe, job_title, job_desc, language):
    """
    Purpose of this function to classify non english job roles to ESCO taxonomy

    The distinct job titles are translated to English, cleaned, vectorized with
    the fitted 'multi_lang_tfidf' and classified with
    'multi_lang_classifier_joblib'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to classify; must contain the ``job_title`` column.
    job_title : str
        Name of the column holding the job title text.
    job_desc : str
        Not used by this function; kept so the signature mirrors
        ``english_main_classifier``.
    language : str
        Source language code handed to ``translate_text``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and the added columns
        ``<job_title>_translated``, ``<job_title>_translated_cln``,
        ``prediction`` and ``max_prob``.
    """
    translated_col = job_title + '_translated'
    translated_cln_col = job_title + '_translated_cln'

    #translating foreign positions to english - each distinct title only once
    positions = pd.DataFrame(dataframe[job_title].drop_duplicates())
    print('translation started : {}'.format(datetime.datetime.now()))
    positions[translated_col] = positions[job_title].apply(translate_text, language = language)
    print('translation finished : {}'.format(datetime.datetime.now()))

    print('cleaning and vectorization started : {}'.format(datetime.datetime.now()))
    dataframe = pd.merge(dataframe, positions, on=job_title, how='left')

    #Cleaning and creating feature matrixes
    dataframe[translated_cln_col] = dataframe[translated_col].apply(basic_cleaner)

    tf_idf_role = joblib.load('multi_lang_tfidf')
    tf_idf_final = tf_idf_role.transform(dataframe[translated_cln_col])
    print('cleaning and vectorization finished : {}'.format(datetime.datetime.now()))

    #Prediction with english trained model on now translated positions
    print('classification started : {}'.format(datetime.datetime.now()))
    clf = joblib.load('multi_lang_classifier_joblib')
    label_encoder = joblib.load('label_encoder')

    predicted_labels = label_encoder.inverse_transform(clf.predict(tf_idf_final))

    #assigning probabilities: highest class probability per row, computed vectorised
    max_prob = clf.predict_proba(tf_idf_final).max(axis=1)

    results = pd.DataFrame({'prediction': predicted_labels, 'max_prob': max_prob},
                           columns=['prediction', 'max_prob'])

    # drop=True discards the old index rather than creating an 'index' column
    # that must be deleted afterwards (which failed if such a column existed).
    dataframe = dataframe.reset_index(drop=True)

    #joining together results - both frames carry a 0..n-1 index
    dataframe = pd.merge(dataframe, results, how='left', left_index=True, right_index=True)

    print('classification finished : {}'.format(datetime.datetime.now()))

    return dataframe

In [113]:
def _joined_ngrams(tokens, n):
    """Return the set of space-joined n-grams (consecutive runs of n tokens)."""
    return {' '.join(gram) for gram in zip(*(tokens[i:] for i in range(n)))}


def match_skills(dataframe, description, 
                 job_id, skills_one_token,
                 skills_two_token, skills_three_token, 
                 skills_four_token):
    """
    Find the skills that literally occur in one row's description.

    Parameters
    ----------
    dataframe : mapping
        A single row (as passed by ``DataFrame.apply(..., axis=1)``); despite
        the name it is indexed by column name and yields scalar values.
    description : str
        Key of the whitespace-tokenizable description text in the row.
    job_id : str
        Key of the row identifier that is attached to every match.
    skills_one_token, skills_two_token, skills_three_token, skills_four_token : iterable of str
        Skill labels consisting of one, two, three and four tokens.

    Returns
    -------
    list of [job id, skill]
        One pair per matched skill: unigram matches first, then bigrams,
        trigrams and fourgrams, each group sorted alphabetically so the
        output order is reproducible. Empty list when nothing matches.
    """
    #Matching ESCO skills with same token length
    tokens = dataframe[description].split()  # split once, reuse for every n
    row_id = dataframe[job_id]
    skills_by_length = (skills_one_token, skills_two_token,
                        skills_three_token, skills_four_token)

    all_skill = []
    for n, skills in enumerate(skills_by_length, start=1):
        common = _joined_ngrams(tokens, n).intersection(skills)
        all_skill.extend([row_id, skill] for skill in sorted(common))

    return all_skill

In [114]:
def match_skills_base(dataframe,job_id, job_desc, language):
    """
    Extract the skills mentioned in the job descriptions of one language.

    The descriptions and the skill labels read from 'skills_check_04.xlsx'
    (rows with need == 1) are cleaned with ``basic_cleaner``; every skill
    label that occurs as a 1- to 4-token sequence in a description is
    attached to that row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to process; must contain the ``job_id`` and ``job_desc`` columns.
        The frame passed in is not modified.
    job_id : str
        Name of the row identifier column (used as the join key).
    job_desc : str
        Name of the column holding the job description text.
    language : str or None
        Language code. 'en' (or None, as in ``basic_cleaner``) uses the
        'preferred_skill_label' column, any other code uses
        'preferred_skill_label_<language>'.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` plus ``<job_desc>_cln``, left-joined with the matched
        skills ('skill_desc_cln', the label column(s) and 'skillUri'); a row
        appears once per matched skill.

    Raises
    ------
    KeyError
        If the skills workbook has no label column for ``language``.
    """
    ###put in language specific cleaning for descriptions
    print('cleaning started : {}'.format(datetime.datetime.now()))

    # work on a copy so the caller's frame does not silently gain a column
    dataframe = dataframe.copy()
    desc_cln = job_desc + '_cln'
    dataframe[desc_cln] = dataframe[job_desc].apply(basic_cleaner, language=language, skill_match=True)
    print('cleaning finished : {}'.format(datetime.datetime.now()))

    print('skill preparation started : {}'.format(datetime.datetime.now()))

    skills_need= pd.read_excel('skills_check_04.xlsx')
    skills_need = skills_need[skills_need['need']==1]

    is_english = language is None or language == 'en'
    label_col = 'preferred_skill_label' if is_english else 'preferred_skill_label_' + language
    if label_col not in skills_need.columns:
        raise KeyError('skills_check_04.xlsx has no column {!r} for language {!r}'.format(
            label_col, language))

    skills_need_list = pd.DataFrame(list(skills_need[label_col]), columns=['skill_desc'])

    # skills without a label in this language are marked with 'mv-9' and dropped
    skills_need_list = skills_need_list.fillna('mv-9')
    skills_need_list = skills_need_list[skills_need_list['skill_desc']!='mv-9'].copy()

    skills_need_list['skill_desc_cln'] = skills_need_list['skill_desc'].apply(
        basic_cleaner, language=language, skill_match=True)
    skills_need_list['token_count'] = skills_need_list['skill_desc_cln'].apply(
        lambda text: len(wpt.tokenize(text)))

    def skills_with_tokens(count):
        # sets make the per-row intersection a hash lookup instead of a scan
        has_count = skills_need_list['token_count'] == count
        return set(skills_need_list.loc[has_count, 'skill_desc_cln'])

    skills_one_token = skills_with_tokens(1)
    skills_two_token = skills_with_tokens(2)
    skills_three_token = skills_with_tokens(3)
    skills_four_token = skills_with_tokens(4)

    print('skill preparation finished : {}'.format(datetime.datetime.now()))

    print('skill matching started : {}'.format(datetime.datetime.now()))
    # one [job id, cleaned skill] pair per match; iterating the rows directly
    # also yields an empty list (not column names) for an empty frame
    flat_list = [
        pair
        for _, row in dataframe.iterrows()
        for pair in match_skills(row, description=desc_cln,
                                 job_id=job_id,
                                 skills_one_token=skills_one_token,
                                 skills_two_token=skills_two_token,
                                 skills_three_token=skills_three_token,
                                 skills_four_token=skills_four_token)
    ]
    print('skill matching finished : {}'.format(datetime.datetime.now()))

    print('finalization  started : {}'.format(datetime.datetime.now()))
    all_skills_complete = pd.DataFrame(flat_list, columns=[job_id, 'skill_desc_cln'])

    # map the cleaned label back to the original label ...
    all_skills_complete = pd.merge(all_skills_complete, skills_need_list[['skill_desc', 'skill_desc_cln']],
                               on='skill_desc_cln', how='left')
    all_skills_complete = all_skills_complete.rename(columns={'skill_desc' : label_col})

    # ... and from there to the skill URI (plus the English label for other languages)
    if is_english:
        uri_columns = ['preferred_skill_label', 'skillUri']
    else:
        uri_columns = [label_col, 'preferred_skill_label', 'skillUri']
    all_skills_complete = pd.merge(all_skills_complete, skills_need[uri_columns],
                               on=label_col, how='left')

    data_skill_matched = pd.merge(dataframe, all_skills_complete, on=job_id, how='left')
    print('finalization  finished : {}'.format(datetime.datetime.now()))

    return data_skill_matched

In [116]:
def swedish_location_correcter(df):
    """
    Pick the location part of one split 'company' row.

    ``df`` is a single row indexed by 'text2' and 'text3'. The third part is
    returned unless it holds the missing-value marker 'mv-9', in which case
    the second part is returned instead.
    """
    third_part = df['text3']
    return df['text2'] if third_part == 'mv-9' else third_part

In [2]:
# tokenizer shared by the cleaning and skill matching functions
wpt = nltk.WordPunctTokenizer()

# language specific stopwords for multilingual matching;
# the English list is extended with a few job-posting specific words
stop_words_en = nltk.corpus.stopwords.words('english') + ['job', 'title', 'position', 'description']
stop_words_sv = nltk.corpus.stopwords.words('swedish')
stop_words_fi = nltk.corpus.stopwords.words('finnish')

In [73]:
# Service account key and OAuth scope for the cloud based Google Translate
SERVICE_ACCOUNT_KEY_FILE = "My First Project-fab01784b0d3.json"
CLOUD_PLATFORM_SCOPES = ['https://www.googleapis.com/auth/cloud-platform']

credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY_FILE)
scoped_credentials = credentials.with_scopes(CLOUD_PLATFORM_SCOPES)

In [74]:
#instantiating translator with the cloud-platform scoped credentials
#(scoped_credentials was created for this purpose but never passed on)
translate_client = translate.Client(credentials=scoped_credentials)

In [58]:
#Read in input data: one workbook per country, in the order FI, SE, DA
data_fi, data_sv, data_dk = (
    pd.read_excel(workbook)
    for workbook in ('FI_FULL.xlsx', 'SE_FULL2018-10-30.xlsx', 'DA_FULL_2018_11_01.xlsx')
)

In [59]:
#swedish data preparation: derive 'location' from the '-' separated 'company' text
# NOTE(review): the column assignment below requires the split to produce
# exactly three parts - confirm this holds for new input files.
splitted = data_sv['company'].str.split('-', expand=True)
splitted.columns=['text1', 'text2', 'text3']
splitted = splitted.fillna('mv-9')
splitted['location'] = splitted.apply(swedish_location_correcter, axis=1)
# Assign by index instead of merging: re-running this cell then overwrites
# 'location' rather than producing location_x / location_y duplicates.
data_sv['location'] = splitted['location']

In [64]:
# Joining multi language data: stack the three country frames into one;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat([data_fi, data_sv, data_dk], ignore_index=True)

In [66]:
#assigning id for roles, built from position and url
# Missing parts are filled first: NaN + str gives NaN, and the later
# fillna('mv-9') would then collapse all such rows onto the single id 'mv-9'.
data['job_id'] = data['position'].fillna('mv-9') + ':' + data['url'].fillna('mv-9')

In [67]:
#handling NULL values: every missing cell becomes the 'mv-9' marker
data = data.fillna('mv-9')

In [92]:
#rationalising description length for language identification:
#only the first 100 characters of each description are kept
data['description_reduced'] = data['description'].map(lambda text: text[:100])

In [98]:
#calling language identifier API
print('lang identification started : {}'.format(datetime.datetime.now()))
# Detect every distinct snippet only once and map the result back to all rows:
# each call costs an API request plus the pause inside identify_language.
unique_snippets = data['description_reduced'].drop_duplicates()
snippet_language = pd.Series(unique_snippets.apply(identify_language).values,
                             index=unique_snippets.values)
data['language'] = data['description_reduced'].map(snippet_language)
print('lang identification finished : {}'.format(datetime.datetime.now()))

lang identification started : 2018-11-01 15:41:19.600228
lang identification finished : 2018-11-01 16:18:28.565160


In [100]:
# Load in the validated language identification results.
# This rebinds `data` to the contents of the workbook; the cells that follow
# read its 'language', 'position', 'description' and 'job_id' columns.
data = pd.read_excel('datamodel_check_lang_detection_2018_11_01_v2.xlsx')

In [103]:
#unique list of languages in data, in order of first appearance
languages = data['language'].drop_duplicates().tolist()

In [104]:
###Language specific classification process
classified_frames = []

for lang in languages:
    print('current language {}'.format(lang))

    data_need = data[data['language']==lang].copy()
    if lang == 'en':
        # english rows are classified directly on title + description
        data_class = english_main_classifier(data_need, 'position', 'description')
    else:
        # other languages are translated to english first
        data_class = multi_lang_classifier(data_need, 'position', 'description', lang)
    classified_frames.append(data_class)

# Concatenate once instead of growing a frame inside the loop;
# fall back to an empty frame when there is nothing to classify.
class_all = pd.concat(classified_frames) if classified_frames else pd.DataFrame()
        

current language fi
translation started : 2018-11-01 17:25:37.630153
translation finished : 2018-11-01 17:26:47.799157
cleaning and vectorization started : 2018-11-01 17:26:47.800156
cleaning and vectorization finished : 2018-11-01 17:26:48.007573

  if diff:



classification started : 2018-11-01 17:26:48.007573
classification finished : 2018-11-01 17:26:48.216744
current language en
cleaning of texts started : 2018-11-01 17:26:48.222729
cleaning of texts finished : 2018-11-01 17:26:54.750623
vectorization, and random projection started : 2018-11-01 17:26:54.750623
vectorization, and random projection finished : 2018-11-01 17:27:27.030952
classification started : 2018-11-01 17:27:27.031951


  if diff:


classification finished : 2018-11-01 17:27:27.398056
current language da
translation started : 2018-11-01 17:27:27.645904
translation finished : 2018-11-01 17:29:06.744565
cleaning and vectorization started : 2018-11-01 17:29:06.745553
cleaning and vectorization finished : 2018-11-01 17:29:06.954422
classification started : 2018-11-01 17:29:06.954422
classification finished : 2018-11-01 17:29:06.973421
current language sv
translation started : 2018-11-01 17:29:06.983415


  if diff:


translation finished : 2018-11-01 17:35:09.167860
cleaning and vectorization started : 2018-11-01 17:35:09.167860
cleaning and vectorization finished : 2018-11-01 17:35:09.454682
classification started : 2018-11-01 17:35:09.454682
classification finished : 2018-11-01 17:35:09.493658


  if diff:


In [9]:
#class_all.to_excel('live_demo_01.xlsx')
#class_all = pd.read_excel('process_check_01.xlsx')

In [118]:
# Empty starting value for the skill-matched results of all languages
skill_matched_all = pd.DataFrame()

In [119]:
#Language specific skill extraction from job descriptions
skill_frames = []
for lang in languages:
    print(lang)
    data_need = class_all[class_all['language']==lang].copy()
    data_skill_matched = match_skills_base(data_need, 'job_id', 'description', lang)
    skill_frames.append(data_skill_matched)

# Built from the local list in one concat, so re-running this cell does not
# append the same rows to skill_matched_all a second time.
skill_matched_all = pd.concat(skill_frames) if skill_frames else pd.DataFrame()

fi
cleaning started : 2018-11-01 17:57:35.276916
cleaning finished : 2018-11-01 17:57:37.645506
skill preparation started : 2018-11-01 17:57:37.645506
skill preparation finished : 2018-11-01 17:57:37.732459
skill matching started : 2018-11-01 17:57:37.732459
skill matching finished : 2018-11-01 17:57:38.294141
finalization  started : 2018-11-01 17:57:38.294141
finalization  finished : 2018-11-01 17:57:38.313758
en
cleaning started : 2018-11-01 17:57:38.316756
cleaning finished : 2018-11-01 17:57:44.626870
skill preparation started : 2018-11-01 17:57:44.626870
skill preparation finished : 2018-11-01 17:57:44.715810
skill matching started : 2018-11-01 17:57:44.715810
skill matching finished : 2018-11-01 17:57:45.880092
finalization  started : 2018-11-01 17:57:45.880092
finalization  finished : 2018-11-01 17:57:45.893085
da
cleaning started : 2018-11-01 17:57:45.899081
cleaning finished : 2018-11-01 17:57:47.493099
skill preparation started : 2018-11-01 17:57:47.493099
skill preparation f

In [121]:
# Export the processed and enriched data (the skill-matched rows of all
# languages) to an Excel workbook; the frame's index is written as well.
skill_matched_all.to_excel('live_demo_full_01.xlsx')