In [1]:
from selenium import webdriver
from time import sleep
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from selenium.webdriver.chrome.options import Options
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

chrome_options = Options()
chrome_options.add_argument("--headless")


chromedriver = "/home/btan/Documents/chromedriver"
# driver = webdriver.Chrome(chromedriver, chrome_options = chrome_options)
# driver.close()

pd.set_option('mode.chained_assignment', None)

In [2]:
##############################################################
# section 2: determine factors that impact salary
##############################################################

In [3]:
jobs = pd.read_csv('./job_openings_2019-05-05.csv').drop(columns='Unnamed: 0')

In [4]:
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093 entries, 0 to 1092
Data columns (total 16 columns):
job_id              1093 non-null object
job_title           1093 non-null object
company_name        1093 non-null object
company_address     897 non-null object
job_category        1093 non-null object
job_url             1093 non-null object
job_description     914 non-null object
job_requirements    789 non-null object
employment_type     1093 non-null object
seniority           1093 non-null object
job_skills          1093 non-null object
pay_min             1011 non-null object
pay_max             1011 non-null object
pay_type            1011 non-null object
posting_date        1093 non-null object
closing_date        1093 non-null object
dtypes: object(16)
memory usage: 136.7+ KB


In [5]:
jobs.sample(5)

Unnamed: 0,job_id,job_title,company_name,company_address,job_category,job_url,job_description,job_requirements,employment_type,seniority,job_skills,pay_min,pay_max,pay_type,posting_date,closing_date
653,JOB-2019-0087098,"Associate, Financial Crime Analytics – Transac...",DBS BANK LTD.,"MARINA BAY FINANCIAL CENTRE, 12 MARINA BOULEVA...",Banking and Finance,https://www.mycareersfuture.sg/job/associate-f...,"Group Legal, Compliance & Secretariat ensur...",Bachelor Degree or any equivalent work experie...,"Permanent, Full Time",Senior Executive,"['Business Development', 'Business Strategy', ...","$3,000","$6,000",Monthly,24 Apr 2019,24 May 2019
767,JOB-2019-0085156,"Pre-sales Consultant - Big Data (5 days, Orcha...",MACHSPEED HUMAN RESOURCES PTE. LTD.,"GOLDEN WALL CENTRE, 89 SHORT STREET 188216",Information Technology,https://www.mycareersfuture.sg/job/pre-sales-c...,Job Responsibilities,"Minimum Degree/Diploma in Computer Science, En...",Permanent,Professional,"['Business Analysis', 'Business Development', ...","$4,000","$5,000",Monthly,22 Apr 2019,22 May 2019
762,JOB-2019-0093135,Senior Consultant - Data Migration - AX Dynamics,HCL SINGAPORE PTE. LTD.,"AXA TOWER, 8 SHENTON WAY 068811",Information Technology,https://www.mycareersfuture.sg/job/senior-cons...,Responsibilities :,5+ years of work experience with minimum 3+ye...,Permanent,Senior Executive,"['Analysis', 'Business Analysis', 'Business De...","$5,500","$8,000",Monthly,02 May 2019,01 Jun 2019
181,JOB-2019-0076667,Senior / Data Engineer (Data Science Team),M1 LIMITED,10 INTERNATIONAL BUSINESS PARK 609928,"Engineering, Information Technology, Others, T...",https://www.mycareersfuture.sg/job/senior-data...,,Bachelor’s degree in Computer Science/Engineer...,Full Time,"Fresh/entry level, Executive, Senior Executive","['Business Analysis', 'Business Intelligence',...","$3,500","$5,800",Monthly,10 Apr 2019,10 May 2019
106,JOB-2019-0092968,"AVP, Data Center Manager, Chief Technology Org...",ALEXANDER MANN BPO SOLUTIONS (SINGAPORE) PTE. ...,"SGX CENTRE I, 2 SHENTON WAY 068804","Banking and Finance, Information Technology",https://www.mycareersfuture.sg/job/avp-data-ce...,Our purpose as a firm is to make financial liv...,Minimum of 5 years’ experience in a technology...,"Permanent, Full Time",Professional,"['Cabling', 'Cisco Technologies', 'Cloud Compu...","$8,000","$16,000",Monthly,02 May 2019,01 Jun 2019


In [6]:
# as we will be predicting pay with the job description as target,
# items with misisng information (e.g. pay) is not very useful


# remove items without pay (missing target)
jobs.dropna(subset=['pay_min','pay_max','pay_type'], inplace=True)

# compute the average pay to be used as our target as there is min/max pay
jobs['pay_min'] = jobs['pay_min'].map(lambda x: int(x.replace('$','').replace(',','')))
jobs['pay_max'] = jobs['pay_max'].map(lambda x: int(x.replace('$','').replace(',','')))
jobs['avg_pay'] = 0.5 * (jobs['pay_min'] + jobs['pay_max'])

# as we will be predicting monthly pay, we will convert any annual pay into monthly pay
index = jobs[jobs['pay_type']=='Annually'].index 
for line in index:
    jobs.loc[line,'avg_pay'] = int(round(jobs.loc[line,'avg_pay']/12.,0))
jobs['avg_pay'] = jobs['avg_pay'].astype(int)   

In [7]:
# convert the 'employment_type' into dummies, remove original column
jobs['emp_fulltime'] = jobs['employment_type'].map(lambda x: 1 if 'Full Time' in x else 0)
jobs['emp_permanent'] = jobs['employment_type'].map(lambda x: 1 if 'Permanent' in x else 0)
jobs['emp_contract'] = jobs['employment_type'].map(lambda x: 1 if 'Contract' in x else 0)
jobs['emp_temporary'] = jobs['employment_type'].map(lambda x: 1 if 'Temporary' in x else 0)
jobs['emp_freelance'] = jobs['employment_type'].map(lambda x: 1 if 'Freelance' in x else 0)
jobs['emp_internship'] = jobs['employment_type'].map(lambda x: 1 if 'Internship' in x else 0)
jobs['emp_parttime'] = jobs['employment_type'].map(lambda x: 1 if 'Part Time' in x else 0)

jobs.drop(columns='employment_type', inplace=True)

# convert the 'seniority' into dummies, remove original column
jobs['sen_nonexecutive'] = jobs['seniority'].map(lambda x: 1 if 'Non-executive' in x else 0)
jobs['sen_juniorexecutive'] = jobs['seniority'].map(lambda x: 1 if 'Junior Executive' in x else 0)
jobs['sen_executive'] = jobs['seniority'].map(lambda x: 1 if 'Executive' in x else 0)
jobs['sen_seniorexecutive'] = jobs['seniority'].map(lambda x: 1 if 'Senior Executive' in x else 0)
jobs['sen_freshentrylevel'] = jobs['seniority'].map(lambda x: 1 if 'Fresh/entry level' in x else 0)
jobs['sen_professional'] = jobs['seniority'].map(lambda x: 1 if 'Professional' in x else 0)
jobs['sen_manager'] = jobs['seniority'].map(lambda x: 1 if 'Manager' in x else 0)
jobs['sen_middlemanagement'] = jobs['seniority'].map(lambda x: 1 if 'Middle Management' in x else 0)
jobs['sen_seniormanagement'] = jobs['seniority'].map(lambda x: 1 if 'Senior Management' in x else 0)

jobs.drop(columns='seniority', inplace=True)

# convert the 'job_category' into dummies, remove original column
jobs['cat_informationtechnology'] = jobs['job_category'].map(lambda x: 1 if 'Information Technology' in x else 0)
jobs['cat_engineering'] = jobs['job_category'].map(lambda x: 1 if 'Engineering' in x else 0)
jobs['cat_bankingandfinance'] = jobs['job_category'].map(lambda x: 1 if 'Banking and Finance' in x else 0)
jobs['cat_adminsecretarial'] = jobs['job_category'].map(lambda x: 1 if 'Admin / Secretarial' in x else 0)
jobs['cat_advertisingmedia'] = jobs['job_category'].map(lambda x: 1 if 'Advertising / Media' in x else 0)
jobs['cat_consulting'] = jobs['job_category'].map(lambda x: 1 if 'Consulting' in x else 0)
jobs['cat_logisticssupplychain'] = jobs['job_category'].map(lambda x: 1 if 'Logistics / Supply Chain' in x else 0)
jobs['cat_others'] = jobs['job_category'].map(lambda x: 1 if 'Others' in x else 0)
jobs['cat_insurance'] = jobs['job_category'].map(lambda x: 1 if 'Insurance' in x else 0)
jobs['cat_generalmanagement'] = jobs['job_category'].map(lambda x: 1 if 'General Management' in x else 0)
jobs['cat_scienceslaboratoryrd'] = jobs['job_category'].map(lambda x: 1 if 'Sciences / Laboratory / R&D' in x else 0)
jobs['cat_professionalservices'] = jobs['job_category'].map(lambda x: 1 if 'Professional Services' in x else 0)
jobs['cat_salesretail'] = jobs['job_category'].map(lambda x: 1 if 'Sales / Retail' in x else 0)
jobs['cat_customerservice'] = jobs['job_category'].map(lambda x: 1 if 'Customer Service' in x else 0)
jobs['cat_educationandtraining'] = jobs['job_category'].map(lambda x: 1 if 'Education and Training' in x else 0)
jobs['cat_publiccivilservice'] = jobs['job_category'].map(lambda x: 1 if 'Public / Civil Service' in x else 0)
jobs['cat_accountingauditingtaxation'] = jobs['job_category'].map(lambda x: 1 if 'Accounting / Auditing / Taxation' in x else 0)
jobs['cat_riskmanagement'] = jobs['job_category'].map(lambda x: 1 if 'Risk Management' in x else 0)
jobs['cat_buildingandconstruction'] = jobs['job_category'].map(lambda x: 1 if 'Building and Construction' in x else 0)
jobs['cat_healthcarepharmaceutical'] = jobs['job_category'].map(lambda x: 1 if 'Healthcare / Pharmaceutical' in x else 0)
jobs['cat_repairandmaintenance'] = jobs['job_category'].map(lambda x: 1 if 'Repair and Maintenance' in x else 0)
jobs['cat_manufacturing'] = jobs['job_category'].map(lambda x: 1 if 'Manufacturing' in x else 0)
jobs['cat_design'] = jobs['job_category'].map(lambda x: 1 if 'Design' in x else 0)
jobs['cat_telecommunications'] = jobs['job_category'].map(lambda x: 1 if 'Telecommunications' in x else 0)
jobs['cat_marketingpublicrelations'] = jobs['job_category'].map(lambda x: 1 if 'Marketing / Public Relations' in x else 0)
jobs['cat_humanresources'] = jobs['job_category'].map(lambda x: 1 if 'Human Resources' in x else 0)
jobs['cat_hospitality'] = jobs['job_category'].map(lambda x: 1 if 'Hospitality' in x else 0)
jobs['cat_fb'] = jobs['job_category'].map(lambda x: 1 if 'F&B' in x else 0)
jobs['cat_legal'] = jobs['job_category'].map(lambda x: 1 if 'Legal' in x else 0)
jobs['cat_environmenthealth'] = jobs['job_category'].map(lambda x: 1 if 'Environment / Health' in x else 0)
jobs['cat_architectureinteriordesign'] = jobs['job_category'].map(lambda x: 1 if 'Architecture / Interior Design' in x else 0)
jobs['cat_traveltourism'] = jobs['job_category'].map(lambda x: 1 if 'Travel / Tourism' in x else 0)
jobs['cat_generalwork'] = jobs['job_category'].map(lambda x: 1 if 'General Work' in x else 0)
jobs['cat_purchasingmerchandising'] = jobs['job_category'].map(lambda x: 1 if 'Purchasing / Merchandising' in x else 0)

jobs.drop(columns='job_category', inplace=True)

# process the job_skills column -> change from text into a list of skills
jobs['job_skillslist'] = jobs['job_skills'].map(lambda x: x[1:-1].replace("'",'').replace(' ','').lower().split(','))
# get dummy for the skills

cvect = CountVectorizer(stop_words='english')
cvect.fit(jobs['job_skills'])
X_train = pd.DataFrame(cvect.transform(jobs['job_skills']).todense(),
                       columns=cvect.get_feature_names())
word_counts = X_train.sum(axis=0)
skills = cvect.get_feature_names()
# word_counts.sort_values(ascending = False).head(20)

jobs.reset_index(drop=True, inplace=True)
for row in range(len(jobs)):
    for word in skills:
        if word in jobs['job_skillslist'][row]:
            jobs.loc[row,'skill_{}'.format(word)] = 1
        else:
            jobs.loc[row,'skill_{}'.format(word)] = 0

In [8]:
jobs['emp_internship'].value_counts()

0    1006
1       5
Name: emp_internship, dtype: int64

In [9]:
jobs['emp_temporary'].value_counts()

0    1008
1       3
Name: emp_temporary, dtype: int64

In [10]:
jobs['emp_freelance'].value_counts()

0    1010
1       1
Name: emp_freelance, dtype: int64

In [11]:
jobs['emp_parttime'].value_counts()

0    1010
1       1
Name: emp_parttime, dtype: int64

In [12]:
# internship, temporary and freelance will affect the pay estimation
# since occurance are only a few (1-5) of 1055 entries, 
# remove the records with flags 'emp_internship', 'emp_temporary', 'emp_freelance'

remove_list = jobs[jobs['emp_internship']==1].index\
            .append(jobs[jobs['emp_temporary']==1].index)\
            .append(jobs[jobs['emp_freelance']==1].index)\
            .append(jobs[jobs['emp_parttime']==1].index)
jobs.drop(index = remove_list, inplace=True)
jobs.drop(columns = ['emp_internship','emp_temporary','emp_freelance','emp_parttime'], inplace=True)
jobs.reset_index(drop=True, inplace=True)

In [13]:
jobs.columns.values

array(['job_id', 'job_title', 'company_name', 'company_address',
       'job_url', 'job_description', 'job_requirements', 'job_skills',
       'pay_min', 'pay_max', 'pay_type', 'posting_date', 'closing_date',
       'avg_pay', 'emp_fulltime', 'emp_permanent', 'emp_contract',
       'sen_nonexecutive', 'sen_juniorexecutive', 'sen_executive',
       'sen_seniorexecutive', 'sen_freshentrylevel', 'sen_professional',
       'sen_manager', 'sen_middlemanagement', 'sen_seniormanagement',
       'cat_informationtechnology', 'cat_engineering',
       'cat_bankingandfinance', 'cat_adminsecretarial',
       'cat_advertisingmedia', 'cat_consulting',
       'cat_logisticssupplychain', 'cat_others', 'cat_insurance',
       'cat_generalmanagement', 'cat_scienceslaboratoryrd',
       'cat_professionalservices', 'cat_salesretail',
       'cat_customerservice', 'cat_educationandtraining',
       'cat_publiccivilservice', 'cat_accountingauditingtaxation',
       'cat_riskmanagement', 'cat_buildingandcons

In [14]:
########################################################
# END OF DATA CLEANING
########################################################

In [15]:
approach_results = pd.DataFrame()

In [16]:
########################################################
# Approach 1: Use employment type, seniority and 
# category to predict salary
########################################################

In [17]:
# create predictors and target

cols = [x for x in jobs.columns.values if ('emp_' in x) | ('sen_' in x) | ('cat_' in x)]
X1 = jobs[cols]
y1 = jobs['avg_pay']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)

logreg1 = LogisticRegression(solver='lbfgs', multi_class='auto').fit(X1_train, y1_train)
y1_pred = logreg1.predict(X1_test)
y1_error = y1_pred - y1_test
highlow = ['higher' if np.mean(y1_error)>0 else 'lower'][0]

print('Estimated is on average ${} {} than actual.\nMax: ${}, Min: ${}, std: ${}'.format(
    round(np.abs(np.mean(y1_error)),2), highlow, np.max(y1_error),
    np.min(y1_error), round(np.std(y1_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y1_test, y1_pred)) )

approach_results = approach_results.append([[1, round(metrics.accuracy_score(y1_test, y1_pred),6), 
                         round(np.abs(np.mean(y1_error)),2), np.max(y1_error),
                         np.min(y1_error), round(np.std(y1_error),2)]])

Estimated is on average $740.49 lower than actual.
Max: $11000, Min: $-15000, std: $3483.65
Accuracy: 0.112957


In [18]:
########################################################
# Approach 2: Group the data according to low/med/high 
# salary, then use employment type, seniority and 
# category to predict salary, use classification
########################################################

In [19]:
# create predictors and target

cols_x = [x for x in jobs.columns.values if ('emp_' in x) | ('sen_' in x) | ('cat_' in x)]
cols_xy = [x for x in jobs.columns.values if ('emp_' in x) | ('sen_' in x) | ('cat_' in x)]
cols_xy.append('avg_pay')

data2 = jobs[cols_xy]
data2['pay_cat'] = pd.qcut(jobs['avg_pay'], 3, labels=['low','medium','high'])

# create the predictor and target matrix
y2 = data2['avg_pay']
X2 = data2.drop(columns='avg_pay')
# split them into 3 datasets according to the pay category
data2_low = data2[data2['pay_cat']=='low'].drop(columns='pay_cat')
data2_med = data2[data2['pay_cat']=='medium'].drop(columns='pay_cat')
data2_high = data2[data2['pay_cat']=='high'].drop(columns='pay_cat')

## low pay 
# create predictor and target matrix
X2_low = data2_low.iloc[:,:-2]
y2_low = data2_low.iloc[:,-1]
# create training set and test set
X2l_train, X2l_test, y2l_train, y2l_test = train_test_split(X2_low, y2_low, test_size = 0.3, random_state = 42)
# fit train data with a logistic regression model
logreg2_low = LogisticRegression(solver='lbfgs', multi_class='auto').fit(X2l_train, y2l_train)
# predict the test dataset
y2l_pred = logreg2_low.predict(X2l_test)
# find out the error
y2l_error = y2l_pred - y2l_test

highlow = ['higher' if np.mean(y2l_error)>0 else 'lower'][0]

print('Estimated for LOW is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y2l_error)),2), highlow, np.max(y2l_error),
    np.min(y2l_error), round(np.std(y2l_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y2l_test, y2l_pred)) )

## medium pay
# create predictor and target matrix
X2_med = data2_med.iloc[:,:-2]
y2_med = data2_med.iloc[:,-1]
# create training set and test set
X2m_train, X2m_test, y2m_train, y2m_test = train_test_split(X2_med, y2_med, test_size = 0.3, random_state = 42)
# fit train data with a logistic regression model
logreg2_med = LogisticRegression(solver='lbfgs', multi_class='auto').fit(X2m_train, y2m_train)
# predict the test dataset
y2m_pred = logreg2_med.predict(X2m_test)
# find out the error
y2m_error = y2m_pred - y2m_test

highlow = ['higher' if np.mean(y2m_error)>0 else 'lower'][0]
print('-'*20)
print('Estimated for MED is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y2m_error)),2), highlow, np.max(y2m_error),
    np.min(y2m_error), round(np.std(y2m_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y2m_test, y2m_pred)) )

## high pay
# create predictor and target matrix
X2_high = data2_high.iloc[:,:-2]
y2_high = data2_high.iloc[:,-1]
# create training set and test set
X2h_train, X2h_test, y2h_train, y2h_test = train_test_split(X2_high, y2_high, test_size = 0.3, random_state = 42)
# fit train data with a logistic regression model
logreg2_high = LogisticRegression(solver='lbfgs', multi_class='auto').fit(X2h_train, y2h_train)
# predict the test dataset
y2h_pred = logreg2_high.predict(X2h_test)
# find out the error                          
y2h_error = y2h_pred - y2h_test

highlow = ['higher' if np.mean(y2h_error)>0 else 'lower'][0]
print('-'*20)
print('Estimated for HIGH is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y2h_error)),2), highlow, np.max(y2h_error),
    np.min(y2h_error), round(np.std(y2h_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y2h_test, y2h_pred)) )

# find out the overall error: i.e. combine all errors - low med high
y2_test_results = pd.concat([pd.Series(y2l_test),
                             pd.concat([pd.Series(y2m_test),
                                        pd.Series(y2h_test)], axis = 0)
                             ], axis = 0)

y2_pred_results = pd.concat([pd.Series(y2l_pred),
                             pd.concat([pd.Series(y2m_pred),
                                        pd.Series(y2h_pred)], axis = 0)
                            ], axis = 0)  
y2_error = pd.concat([pd.concat([pd.DataFrame(y2l_error, columns=[0]), y2m_error], axis=0), y2h_error], axis=0)[0]

highlow = ['higher' if np.mean(y2_error)>0 else 'lower'][0]
print('-'*20)
print('Overall Estimated is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y2_error)),2), highlow, np.max(y2_error),
    np.min(y2_error), round(np.std(y2_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y2_test_results, y2_pred_results)) )

approach_results = approach_results.append([[2, round(metrics.accuracy_score(y2_test_results, y2_pred_results),6), 
                         round(np.abs(np.mean(y2_error)),2), np.max(y2_error),
                         np.min(y2_error), round(np.std(y2_error),2)]])

Estimated for LOW is on average $31.41 higher than actual.
Max: $2500.00, Min: $-1750.00, std: $1018.85
Accuracy: 0.129630
--------------------
Estimated for MED is on average $62.81 higher than actual.
Max: $1750.00, Min: $-1500.00, std: $819.34
Accuracy: 0.212766
--------------------
Estimated for HIGH is on average $1627.24 lower than actual.
Max: $6250.00, Min: $-28500.00, std: $4718.63
Accuracy: 0.140000
--------------------
Overall Estimated is on average $808.35 lower than actual.
Max: $6250.00, Min: $-28500.00, std: $3537.75
Accuracy: 0.158940


In [20]:
########################################################
# Approach 3: Group the data according to low/med/high 
# salary, then use employment type, seniority and 
# category to predict salary, use regression
########################################################

In [21]:
# create predictors and target

cols_x = [x for x in jobs.columns.values if ('emp_' in x) | ('sen_' in x) | ('cat_' in x)]
cols_xy = [x for x in jobs.columns.values if ('emp_' in x) | ('sen_' in x) | ('cat_' in x)]
cols_xy.append('avg_pay')

data3 = jobs[cols_xy]
data3['pay_cat'] = pd.qcut(jobs['avg_pay'], 3, labels=['low','medium','high'])

# create the predictor and target matrix
y3 = data3['avg_pay']
X3 = data3.drop(columns='avg_pay')
# split them into 3 datasets according to the pay category
data3_low = data3[data3['pay_cat']=='low'].drop(columns='pay_cat')
data3_med = data3[data3['pay_cat']=='medium'].drop(columns='pay_cat')
data3_high = data3[data3['pay_cat']=='high'].drop(columns='pay_cat')

## low pay 
# create predictor and target matrix
X3_low = data3_low.iloc[:,:-2]
y3_low = data3_low.iloc[:,-1]
# create training set and test set
X3l_train, X3l_test, y3l_train, y3l_test = train_test_split(X3_low, y3_low, test_size = 0.3, random_state = 42)
# fit train data with a logistic regression model
linreg3_low = LinearRegression().fit(X3l_train, y3l_train)
# predict the test dataset
y3l_pred = linreg3_low.predict(X3l_test)
# find out the error
y3l_error = y3l_pred - y3l_test

highlow = ['higher' if np.mean(y3l_error)>0 else 'lower'][0]

print('Estimated for LOW is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y3l_error)),2), highlow, np.max(y3l_error),
    np.min(y3l_error), round(np.std(y3l_error),2)))
print('R2: {:.6f}'.format(metrics.r2_score(y3l_test, y3l_pred)) )

## medium pay
# create predictor and target matrix
X3_med = data3_med.iloc[:,:-2]
y3_med = data3_med.iloc[:,-1]
# create training set and test set
X3m_train, X3m_test, y3m_train, y3m_test = train_test_split(X3_med, y3_med, test_size = 0.3, random_state = 42)
# fit train data with a logistic regression model
linreg3_med = LinearRegression().fit(X3m_train, y3m_train)
# predict the test dataset
y3m_pred = linreg3_med.predict(X3m_test)
# find out the error
y3m_error = y3m_pred - y3m_test

highlow = ['higher' if np.mean(y3m_error)>0 else 'lower'][0]
print('-'*20)
print('Estimated for MED is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y3m_error)),2), highlow, np.max(y3m_error),
    np.min(y3m_error), round(np.std(y3m_error),2)))
print('R2: {:.6f}'.format(metrics.r2_score(y3m_test, y3m_pred)) )

## high pay
# create predictor and target matrix
X3_high = data3_high.iloc[:,:-2]
y3_high = data3_high.iloc[:,-1]
# create training set and test set
X3h_train, X3h_test, y3h_train, y3h_test = train_test_split(X3_high, y3_high, test_size = 0.3, random_state = 42)
# fit train data with a logistic regression model
linreg3_high = LinearRegression().fit(X3h_train, y3h_train)
# predict the test dataset
y3h_pred = linreg3_high.predict(X3h_test)
# find out the error
y3h_error = y3h_pred - y3h_test

highlow = ['higher' if np.mean(y3h_error)>0 else 'lower'][0]
print('-'*20)
print('Estimated for HIGH is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y3h_error)),2), highlow, np.max(y3h_error),
    np.min(y3h_error), round(np.std(y3h_error),2)))
print('R2: {:.6f}'.format(metrics.r2_score(y3h_test, y3h_pred)) )

# find out the overall error: i.e. combine all errors - low med high
y3_test_results = pd.concat([pd.Series(y3l_test),
                             pd.concat([pd.Series(y3m_test),
                                        pd.Series(y3h_test)], axis = 0)
                             ], axis = 0)

y3_pred_results = pd.concat([pd.Series(y2l_pred),
                             pd.concat([pd.Series(y3m_pred),
                                        pd.Series(y3h_pred)], axis = 0)
                            ], axis = 0)  

y3_error = pd.concat([pd.concat([pd.DataFrame(y3l_error, columns=[0]), y3m_error], axis=0), y3h_error], axis=0)[0]


highlow = ['higher' if np.mean(y3_error)>0 else 'lower'][0]
print('-'*20)
print('Overall Estimated is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y3_error)),2), highlow, np.max(y3_error),
    np.min(y3_error), round(np.std(y3_error),2)))
print('R2: {:.6f}'.format(metrics.r2_score(y3_test_results, y3_pred_results)) )

approach_results = approach_results.append([[3, round(metrics.r2_score(y3_test_results, y3_pred_results),6), 
                         round(np.abs(np.mean(y3_error)),2), round(np.max(y3_error),2),
                         round(np.min(y3_error),2), round(np.std(y3_error),2)]])

Estimated for LOW is on average $27.05 lower than actual.
Max: $2427.75, Min: $-1877.60, std: $848.75
R2: 0.019657
--------------------
Estimated for MED is on average $59.29 higher than actual.
Max: $3053.47, Min: $-1749.86, std: $675.14
R2: -0.417074
--------------------
Estimated for HIGH is on average $820.97 lower than actual.
Max: $4714.61, Min: $-28673.52, std: $4220.75
R2: 0.060525
--------------------
Overall Estimated is on average $394.45 lower than actual.
Max: $4714.61, Min: $-28673.52, std: $3097.94
R2: 0.604061


In [22]:
########################################################
# Approach 4: Use the job requirements data to determine 
# salary, using the CountVectorizer with MultinomialNB
########################################################

In [23]:
# copy out the dataset, drop those rows with empty job requirements

X4 = jobs.dropna(subset = ['job_requirements'])
print('{} fields were removed. current dataset contains {} rows'.format(
        len(jobs)-len(X4), len(X4)))
# create predictor and target matrix
y4 = X4['avg_pay']
X4 = X4['job_requirements']

X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.3, random_state=42)

# find out the best parameters
score = []
for min_d in range(1,50):
    
    cvect = CountVectorizer(stop_words='english', 
                ngram_range=(1,2), 
                max_features=10000,
                min_df=min_d)
    # create document-term matrices
    X4_train_dtm = cvect.fit_transform(X4_train)
    X4_test_dtm  = cvect.transform(X4_test)

    # use MultinomialNB
    mnb = MultinomialNB()
    mnb.fit(X4_train_dtm, y4_train)
    y4_pred = mnb.predict(X4_test_dtm)
    y4_error = y4_pred - y4_test  
    
    score.append([min_d, round(metrics.accuracy_score(y4_test, y4_pred),6)])

score = pd.DataFrame(score)
score.columns = ['min_df','R2_score']

best_df = score.loc[score[score['R2_score'] == score['R2_score'].max()].index[0], 'min_df']
best_score = score['R2_score'].max()
print('-'*20)
print('Best min_df is {} with a R2_score of {}'.format(best_df, best_score))

## predict using the best parameters

# create model
model4 = make_pipeline( CountVectorizer(stop_words='english', 
                        ngram_range=(1,10), 
                        max_features=1000,
                        min_df=best_df),
                      
                        MultinomialNB()
                      )
# fit & predict data
model4.fit(X4_train, y4_train)
y4_pred = model4.predict(X4_test)
y4_error = y4_pred - y4_test

# present results
highlow = ['higher' if np.mean(y4_error)>0 else 'lower'][0]
print('-'*20)
print('Overall Estimated is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y4_error)),2), highlow, np.max(y4_error),
    np.min(y4_error), round(np.std(y4_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y4_test, y4_pred)) )

approach_results = approach_results.append([[4, round(metrics.accuracy_score(y4_test, y4_pred),6), 
                         round(np.abs(np.mean(y4_error)),2), np.max(y4_error),
                         np.min(y4_error), round(np.std(y4_error),2)]])

275 fields were removed. current dataset contains 727 rows
--------------------
Best min_df is 5 with a R2_score of 0.159817
--------------------
Overall Estimated is on average $740.33 lower than actual.
Max: $11000.00, Min: $-26000.00, std: $3916.36
Accuracy: 0.127854


In [24]:
########################################################
# Approach 5: Use the job requirements data to determine 
# salary, using the CountVectorizer with LogisticRegression
########################################################

In [25]:
# copy out the dataset, drop those rows with empty job requirements

X5 = jobs.dropna(subset = ['job_requirements'])
print('{} fields were removed. current dataset contains {} rows'.format(
        len(jobs)-len(X5), len(X5)))
# create predictor and target matrix
y5 = X5['avg_pay']
X5 = X5['job_requirements']

X5_train, X5_test, y5_train, y5_test = train_test_split(X5, y5, test_size=0.3, random_state=42)

# find out the best parameters
score = []
for min_d in range(1,50):
    
    cvect = CountVectorizer(stop_words='english', 
                ngram_range=(1,2), 
                max_features=10000,
                min_df=min_d)
    # create document-term matrices
    X5_train_dtm = cvect.fit_transform(X5_train)
    X5_test_dtm  = cvect.transform(X5_test)

    # use MultinomialNB
    mnb = MultinomialNB().fit(X5_train_dtm, y5_train)
    y5_pred = mnb.predict(X5_test_dtm)
    y5_error = y5_pred - y5_test  
    
    score.append([min_d, round(metrics.accuracy_score(y5_test, y5_pred),6)])

score = pd.DataFrame(score)
score.columns = ['min_df','R2_score']

best_df = score.loc[score[score['R2_score'] == score['R2_score'].max()].index[0], 'min_df']
best_score = score['R2_score'].max()
print('-'*20)
print('Best min_df is {} with a R2_score of {}'.format(best_df, best_score))

## predict using the best parameters

# create model
model5 = make_pipeline( CountVectorizer(stop_words='english', 
                        ngram_range=(1,10), 
                        max_features=1000,
                        min_df=best_df),
                      
                        LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
                      )
# fit & predict data
model5.fit(X5_train, y5_train)
y5_pred = model5.predict(X5_test)
y5_error = y5_pred - y5_test

# present results
highlow = ['higher' if np.mean(y5_error)>0 else 'lower'][0]
print('-'*20)
print('Overall Estimated is on average ${:.2f} {} than actual.\nMax: ${:.2f}, Min: ${:.2f}, std: ${:.2f}'.format(
    round(np.abs(np.mean(y5_error)),2), highlow, np.max(y5_error),
    np.min(y5_error), round(np.std(y5_error),2)))
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y5_test, y5_pred)) )

approach_results = approach_results.append([[5, round(metrics.accuracy_score(y5_test, y5_pred),6), 
                         round(np.abs(np.mean(y5_error)),2), np.max(y5_error),
                         np.min(y5_error), round(np.std(y5_error),2)]])

275 fields were removed. current dataset contains 727 rows
--------------------
Best min_df is 5 with a R2_score of 0.159817
--------------------
Overall Estimated is on average $1067.73 lower than actual.
Max: $5450.00, Min: $-26000.00, std: $3550.84
Accuracy: 0.155251


In [26]:
approach_results.columns = ['approach','score','mean_error','min_error','max_error','std']
approach_results.reset_index(drop=True, inplace=True)
approach_results

Unnamed: 0,approach,score,mean_error,min_error,max_error,std
0,1,0.112957,740.49,11000.0,-15000.0,3483.65
1,2,0.15894,808.35,6250.0,-28500.0,3537.75
2,3,0.604061,394.45,4714.61,-28673.52,3097.94
3,4,0.127854,740.33,11000.0,-26000.0,3916.36
4,5,0.155251,1067.73,5450.0,-26000.0,3550.84


In [27]:
best_approach_index = approach_results[approach_results['score']==approach_results['score'].max()].index[0]
print('The best approach used is approach {} with an accuracy score of {}. \nMean Error: {}, Min Error: {}, Max Error: {}, std: {}'.format(
   approach_results.approach[best_approach_index], approach_results['score'][best_approach_index],
   approach_results.mean_error[best_approach_index], approach_results.min_error[best_approach_index],
   approach_results.max_error[best_approach_index], approach_results['std'][best_approach_index]))

The best approach used is approach 3 with an accuracy score of 0.604061. 
Mean Error: 394.45, Min Error: 4714.61, Max Error: -28673.52, std: 3097.94


In [28]:
results1 = pd.DataFrame([linreg3_low.coef_])
results1.columns = X3l_train.columns.values
results1 = results1.transpose()
results1.columns = ['coef']
results1['abs_coef'] = np.abs(results1['coef'])
results1['weightage'] = results1.coef / np.sum(results1.coef)
results1['abs_weightage'] = results1.abs_coef / np.sum(results1.abs_coef)
# results.sort_values(by='abs_coef', ascending=False)
results1.sort_values(by='coef', ascending=False, inplace=True)

In [29]:
results1.shape

(45, 4)

In [30]:
print('Top 10 factors that affect pay (in order of importance): \n{}'.format(
    results1.head(10).transpose().columns.values))

Top 10 factors that affect pay (in order of importance): 
['sen_seniormanagement' 'sen_middlemanagement' 'cat_bankingandfinance'
 'sen_manager' 'cat_consulting' 'sen_seniorexecutive' 'cat_engineering'
 'cat_design' 'sen_professional' 'sen_nonexecutive']


In [31]:
results1.head(10)

Unnamed: 0,coef,abs_coef,weightage,abs_weightage
sen_seniormanagement,1094.718253,1094.718253,-0.118781,0.051477
sen_middlemanagement,955.977089,955.977089,-0.103727,0.044953
cat_bankingandfinance,749.909475,749.909475,-0.081368,0.035263
sen_manager,604.60781,604.60781,-0.065602,0.028431
cat_consulting,445.405145,445.405145,-0.048328,0.020944
sen_seniorexecutive,311.872514,311.872514,-0.033839,0.014665
cat_engineering,304.0225,304.0225,-0.032987,0.014296
cat_design,257.529361,257.529361,-0.027943,0.01211
sen_professional,242.419047,242.419047,-0.026303,0.011399
sen_nonexecutive,235.601586,235.601586,-0.025564,0.011079


In [32]:
##############################################################
# section 3: determine factors that distinguish job category
##############################################################

In [33]:
##############################################################
# section 3a: predicting components of a job posting that 
# distinguish data scientist from other data jobs
##############################################################

In [34]:
# extract the "data scientist" index and the non 'data scientist' index
jobs.reset_index(drop=True, inplace=True)
sci_index = jobs[jobs['job_title'].str.contains('Scientist') &
                 jobs['job_title'].str.contains('Data')].index
non_sci_index = [i for i in jobs.index if i not in sci_index]

# copy out the dataframe
df3a = jobs.copy()
df3a['is_ds'] = 0
df3a.loc[sci_index, 'is_ds'] = 1

# as the data is inbalanced, we extract an equal number of samples from each class
np.random.seed(42)
df3a_index = np.append(np.random.choice(sci_index,size=500, replace=True), 
                       np.random.choice(non_sci_index, size=500, replace=True))
# create the balanced dataframe
df3a = df3a.loc[df3a_index,:].reset_index(drop=True)

In [35]:
# extract out the columns - skills, and is_ds
cols = [i for i in df3a.columns if ( ('skill_' in i) )]
# create predictor and target matrix, training set and testing set
X3a = df3a[cols]
y3a = df3a['is_ds']
X3a_train, X3a_test, y3a_train, y3a_test = train_test_split(X3a, y3a, test_size=0.2, random_state=42)
# use logistic regression to predict
logreg3a = LogisticRegression(solver='lbfgs').fit(X3a_train, y3a_train)
y3a_pred = logreg3a.predict(X3a_test)
# get accuracy of prediction
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y3a_test, y3a_pred)) )

# get the top 10 impactful columns
results_3a = pd.DataFrame([logreg3a.coef_[0]], columns=cols).transpose()
null_cols = results_3a[results_3a[0]==0].index
results_3a = results_3a.transpose()
results_3a.drop(columns=null_cols, inplace=True)

results_3a = results_3a.transpose()
results_3a.columns = ['coef']
results_3a['abs_coef'] = np.abs(results_3a.coef)
results_3a['weightage'] = results_3a.coef / np.sum(results_3a.coef)
results_3a['abs_weightage'] = results_3a.abs_coef / np.sum(results_3a.abs_coef)
# results_3a.sort_values(by='abs_weightage', ascending = False, inplace = True )
results_3a.sort_values(by='weightage', ascending = False, inplace = True)
num_results = 10
# print(results_3a.head(num_results))

print('The top {} characteristics that makes Data Scientist different from others are: \n{}'.format(
      num_results, results_3a.head(num_results).index.values))

Accuracy: 0.840000
The top 10 characteristics that makes Data Scientist different from others are: 
['skill_mpi' 'skill_crm' 'skill_spss' 'skill_ssis' 'skill_strategy'
 'skill_telecommunications' 'skill_html' 'skill_gcp' 'skill_edc'
 'skill_valuation']


In [36]:
results_3a.head(10)

Unnamed: 0,coef,abs_coef,weightage,abs_weightage
skill_mpi,3.088211,3.088211,0.643639,0.039287
skill_crm,1.63451,1.63451,0.340661,0.020794
skill_spss,1.579409,1.579409,0.329178,0.020093
skill_ssis,1.50009,1.50009,0.312646,0.019084
skill_strategy,1.320348,1.320348,0.275185,0.016797
skill_telecommunications,1.184688,1.184688,0.246911,0.015071
skill_html,1.103462,1.103462,0.229982,0.014038
skill_gcp,1.103362,1.103362,0.229961,0.014037
skill_edc,1.103362,1.103362,0.229961,0.014037
skill_valuation,1.032028,1.032028,0.215093,0.013129


In [37]:
##############################################################
# section 3b: what features are important for distinguishing
# junior vs senior positions
##############################################################

In [38]:
sen_cols = [i for i in jobs.columns if 'sen_' in i]
print('{} rows with 1 seniority type'.format(len(jobs[jobs[sen_cols].sum(axis=1)==1])))
print('{} rows with 2 seniority type'.format(len(jobs[jobs[sen_cols].sum(axis=1)==2])))
print('{} rows with 3 seniority type'.format(len(jobs[jobs[sen_cols].sum(axis=1)==3])))
print('{} rows with 4 seniority type'.format(len(jobs[jobs[sen_cols].sum(axis=1)==4])))
print('{} rows with 5 seniority type'.format(len(jobs[jobs[sen_cols].sum(axis=1)==5])))
print('{} rows with 6 seniority type'.format(len(jobs[jobs[sen_cols].sum(axis=1)==6])))

777 rows with 1 seniority type
174 rows with 2 seniority type
36 rows with 3 seniority type
13 rows with 4 seniority type
2 rows with 5 seniority type
0 rows with 6 seniority type


In [39]:
# for ease of classification, we only consider jobs with only 1 seniority type
ind = jobs[jobs[sen_cols].sum(axis=1)==1].index
df3b = jobs.loc[ind,:]
df3b.reset_index(drop=True, inplace=True)
df3b['job_seniority']=0
for index in range(len(df3b)):
    df3b.loc[index,'job_seniority'] = 'junior' if df3b.loc[index,'sen_nonexecutive'] == 1 else (
                                  'junior' if df3b.loc[index,'sen_juniorexecutive'] == 1 else (
                                  'junior' if df3b.loc[index,'sen_executive'] == 1 else (
                                  'junior' if df3b.loc[index,'sen_seniorexecutive'] == 1 else (
                                  'junior' if df3b.loc[index,'sen_freshentrylevel'] == 1 else (
                                  'junior' if df3b.loc[index,'sen_professional'] == 1 else (
                                  'senior' if df3b.loc[index,'sen_manager'] == 1 else (
                                  'senior' if df3b.loc[index,'sen_middlemanagement'] == 1 else (
                                  'senior' if df3b.loc[index,'sen_seniormanagement'] == 1 else (0
                                  )))))))))

print('baseline accuracy: {:.6f}'.format(np.max(df3b.job_seniority.value_counts())/len(df3b)))

baseline accuracy: 0.703990


In [40]:
# using skills required to predict job seniority
cols = [i for i in df3a.columns if ( ('skill_' in i) )]
X3b = df3b[cols]
y3b = df3b['job_seniority']
X3b_train, X3b_test, y3b_train, y3b_test = train_test_split(X3b, y3b, test_size=0.2, random_state=42)
# use logistic regression to predict
logreg3b = LogisticRegression(solver='lbfgs', multi_class='auto').fit(X3b_train, y3b_train)
y3b_pred = logreg3b.predict(X3b_test)
# get accuracy of prediction
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y3b_test, y3b_pred)) )
# score of 0.776 is better than baseline (0.704). model is useful!

Accuracy: 0.775641


In [41]:
# using job title to predict job seniority
X3b = df3b['job_title']
y3b = df3b['job_seniority']

X3b_train, X3b_test, y3b_train, y3b_test = train_test_split(X3b, y3b, test_size=0.3, random_state=42)

# create model
model3b = make_pipeline( CountVectorizer(stop_words='english', 
                        ngram_range=(1,5), 
                        max_features=1000,
                        min_df=1),
                      
                        MultinomialNB()
                      )
# fit & predict data
model3b.fit(X3b_train, y3b_train)
y3b_pred = model3b.predict(X3b_test)

# present results
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y3b_test, y3b_pred)) )
# score of 0.846 is better than previous model score (0.776). this model is better.

Accuracy: 0.846154


In [42]:
results_3b = pd.DataFrame([model3b.steps[1][1].coef_[0]],
                         columns = model3b.steps[0][1].get_feature_names()).transpose()

null_cols = results_3b[results_3b[0]==0].index
results_3b = results_3b.transpose()
results_3b.drop(columns = null_cols, inplace=True)

results_3b = results_3b.transpose()
results_3b.columns = ['coef']
results_3b['abs_coef'] = np.abs(results_3b.coef)
results_3b['weightage'] = results_3b.coef / np.sum(results_3b.coef)
results_3b['abs_weightage'] = results_3b.abs_coef / np.sum(results_3b.abs_coef)

# results_3b.sort_values(by='abs_weightage', ascending = False, inplace = True )
results_3b.sort_values(by='abs_weightage', ascending = False, inplace = True)
num_results = 30
# print(results_3b.head(num_results))


print('The top {} characteristics in a job title \
that makes Senior role different from others roles are: \n{}'.format(num_results,
                                                                     results_3b.head(num_results).index.values))

The top 30 characteristics in a job title that makes Senior role different from others roles are: 
['hadoop' 'principal business' 'principal infocomm'
 'principal infocomm specialist' 'principal infocomm specialist data'
 'principal infocomm specialist data analytics'
 'manager 3500 5500 days river' 'manager 3500 5500 days'
 'manager 3500 5500' 'manager 3500' 'principal software'
 'principal software engineering' 'principal software engineering lead'
 'processing' 'processing executive' 'procurement' 'machine learning'
 'machine' 'procurement business' 'procurement business intelligence'
 'procurement business intelligence 19001366' 'procurement executive'
 'level data scientist' 'level data engineer' 'level data' 'level'
 'lecturer' 'learning i2r star' 'learning i2r' 'learning engineer']


In [43]:
results_3b.head(10)

Unnamed: 0,coef,abs_coef,weightage,abs_weightage
hadoop,-7.749322,7.749322,0.001076,0.001076
principal business,-7.749322,7.749322,0.001076,0.001076
principal infocomm,-7.749322,7.749322,0.001076,0.001076
principal infocomm specialist,-7.749322,7.749322,0.001076,0.001076
principal infocomm specialist data,-7.749322,7.749322,0.001076,0.001076
principal infocomm specialist data analytics,-7.749322,7.749322,0.001076,0.001076
manager 3500 5500 days river,-7.749322,7.749322,0.001076,0.001076
manager 3500 5500 days,-7.749322,7.749322,0.001076,0.001076
manager 3500 5500,-7.749322,7.749322,0.001076,0.001076
manager 3500,-7.749322,7.749322,0.001076,0.001076


In [44]:
##############################################################
# section 3c: do the requirements for titles vary significantly
# with industry (e.g. healthcare vs government)
##############################################################

In [45]:
# extract the indices of rows with "manager" in job title
jobs.reset_index(drop=True, inplace=True)
manager_index = jobs[jobs['job_title'].str.contains('Manager')].index

# copy out the dataframe
df3c = jobs.loc[manager_index,:]

healthcare_index = df3c[df3c['cat_healthcarepharmaceutical']==1].index
gov_index = df3c[df3c['cat_publiccivilservice']==1].index

# as the data is inbalanced, we extract an equal number of samples from each class
np.random.seed(42)
df3c_index = np.append(np.random.choice(healthcare_index,size=500, replace=True), 
                       np.random.choice(gov_index, size=500, replace=True))
# create the balanced dataframe
df3c = df3c.loc[df3c_index,:].reset_index(drop=True)

# check if there is any jobs that falls under both category
if (df3c[['cat_publiccivilservice','cat_healthcarepharmaceutical']].sum(axis=1).value_counts()[1]) != 1000:
    print('ERROR: Data contains jobs with two classes')
    
print('baseline accuracy: {:.6f}'.format(np.max(df3c.cat_publiccivilservice.value_counts())/len(df3c)))

baseline accuracy: 0.500000


In [46]:
# compare the skill required for a manager in public service vs healthcare
# if the manager skills can determine if public service or healthcare,
# then the requirement of titles does differ

In [47]:
# create predictor and target matrix
skill_cols = [col for col in df3c.columns if 'skill_' in col]
X3c = df3c[skill_cols]
y3c = df3c['cat_publiccivilservice']

X3c_train, X3c_test, y3c_train, y3c_test = train_test_split(X3c, y3c, test_size=0.3, random_state = 42)

logreg3c = LogisticRegression(solver = 'lbfgs').fit(X3c_train, y3c_train)
y3c_pred = logreg3c.predict(X3c_test)
print('Accuracy: {:.6f}'.format(metrics.accuracy_score(y3c_test, y3c_pred)) )
# model score of 0.897 is better than baseline score of 0.5. model is good!

Accuracy: 0.896667


In [48]:
results_3c = pd.DataFrame([logreg3c.coef_[0]], columns=skill_cols).transpose()
null_cols = results_3c[results_3c[0]==0].index
results_3c = results_3c.transpose()
results_3c.drop(columns=null_cols, inplace=True)

results_3c = results_3c.transpose()
results_3c.columns = ['coef']
results_3c['abs_coef'] = np.abs(results_3c.coef)
results_3c['weightage'] = results_3c.coef / np.sum(results_3c.coef)
results_3c['abs_weightage'] = results_3c.abs_coef / np.sum(results_3c.abs_coef)
# results_3c.sort_values(by='abs_weightage', ascending = False, inplace = True )
results_3c.sort_values(by='weightage', ascending = False, inplace = True)
num_results = 10
# print(results_3c.head(num_results))

print('The top {} characteristics that makes manager working for the Government (Civil/Public Service) \
different from manager working for Healthcare are: \n{}'.format(
      num_results, results_3c.head(num_results).index.values))

The top 10 characteristics that makes manager working for the Government (Civil/Public Service) different from manager working for Healthcare are: 
['skill_budgets' 'skill_construction' 'skill_sql' 'skill_databases'
 'skill_edc' 'skill_research' 'skill_gcp' 'skill_access' 'skill_html'
 'skill_visio']


In [49]:
results_3c.head(num_results)

Unnamed: 0,coef,abs_coef,weightage,abs_weightage
skill_budgets,-1.325162,1.325162,0.228439,0.052966
skill_construction,-1.325162,1.325162,0.228439,0.052966
skill_sql,-1.247226,1.247226,0.215004,0.049851
skill_databases,-1.247226,1.247226,0.215004,0.049851
skill_edc,-1.197099,1.197099,0.206363,0.047848
skill_research,-1.197099,1.197099,0.206363,0.047848
skill_gcp,-1.197099,1.197099,0.206363,0.047848
skill_access,-0.946142,0.946142,0.163102,0.037817
skill_html,-0.946142,0.946142,0.163102,0.037817
skill_visio,-0.946142,0.946142,0.163102,0.037817
