In [2]:
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from scipy import stats
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.cross_validation import cross_val_predict, cross_val_score

from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

  from numpy.core.umath_tests import inner1d



## Business Case Overview

You're working as a data scientist for a contracting firm that's rapidly expanding. Now that they have their most valuable employee (you!), they need to leverage data to win more contracts. Your firm offers technology and scientific solutions and wants to be competitive in the hiring market. Your principal has two main objectives:

   1. Determine the industry factors that are most important in predicting the salary amounts for these data.
   2. Determine the factors that distinguish job categories and titles from each other. For example, can required skills accurately predict job title?

To limit the scope, your principal has suggested that you *focus on data-related job postings*, e.g. data scientist, data analyst, research scientist, business intelligence, and any others you might think of. You may also want to decrease the scope by *limiting your search to a single region.*

Hint: Aggregators like [Indeed.com](https://www.indeed.com) regularly pool job postings from a variety of markets and industries. 

**Goal:** Scrape your own data from a job aggregation tool like Indeed.com in order to collect the data to best answer these two questions.

---

In [None]:

job_titles = []
job_headers = []
job_contents = []
job_salary = []


for start in range(0, 1000, 10):
    driver = webdriver.Chrome("./chromedriver 2")
    webpage = "https://www.indeed.com.sg/jobs?q=data+scientist%2C+data+analyst&l=Singapore"+'&start='+str(start)
    
    
    driver.get(webpage)

    links = driver.find_elements_by_xpath("//div[contains(@class, 'row result clickcard')]")


    for link in links:
        job_searches = link.find_elements_by_tag_name('a')
        for job_search in job_searches:# Gettting the jobtitle
            if job_search.get_attribute('target') == '_blank' and job_search.get_attribute('class').find('jobtitle turnstileLink') != -1:
                if (len(job_search.text) != 0):
                    job_titles.append(job_search.text)

                        # Open a new window to load the job details                    
                    sec_driver = webdriver.Chrome("./chromedriver 2")
                    sec_driver.get(job_search.get_attribute('href'))


                        # Wait for the Page to Load
                    time.sleep(3)
                   
                    #Getting the company and location
                    header = sec_driver.find_element_by_class_name('jobsearch-InlineCompanyRating')
                    job_headers.append(header.text
                                       
                    #getting the job description and required skills
                    content = sec_driver.find_element_by_class_name('jobsearch-JobComponent-description')
                    job_contents.append(content.text)
                    
                    # Getting the salary if there is available    
                     try:
                        salary = sec_driver.find_element_by_class_name('jobsearch-JobMetadataHeader-item ')
                        job_salary.append(salary.text)
                    
                    except NoSuchElementException:  #spelling error making this code not work as expected
                        job_salary.append('No Salary listed')


                    sec_driver.close()
    result = pd.DataFrame(list(zip(job_titles, job_headers ,job_contents,job_salary)), columns = ['title', 'header', 'content', 'salaryrange'])

    
    #clicking the next page      
    next_page = driver.find_element_by_class_name('np')
    next_page.click()
    time.sleep(3)
    
    
   
    
    driver.close()


I have attempted to scrap Indeed.com for the US job pool at first, retrieve close to 5000 rows but after data cleaning, only 200 rows have salaries and considering that the data may not be a good indicator for predicting the Singapore job market, i decide to scape Singapore region.

However, while attempting to scrape singapore region using indeed.com, found out that the salaries are even lesser then the US region. Due to time constraint, i decided to use a dataset that is scrap through Career Future webpage.

### QUESTION 1: Factors that impact salary

To predict salary you will be building either a classification or regression model, using features like the location, title, and summary of the job. If framing this as a regression problem, you will be estimating the listed salary amounts. You may instead choose to frame this as a classification problem, in which case you will create labels from these salaries (high vs. low salary, for example) according to thresholds (such as median salary).

You have learned a variety of new skills and models that may be useful for this problem:
- NLP
- Unsupervised learning and dimensionality reduction techniques (PCA, clustering)
- Ensemble methods and decision tree models
- SVM models

Whatever you decide to use, the most important thing is to justify your choices and interpret your results. *Communication of your process is key.* Note that most listings **DO NOT** come with salary information. You'll need to able to extrapolate or predict the expected salaries for these listings.



After looking at the business problem, i have decided to build a classification model on the salary.
Steps for my approach
- Data cleaning
    - Remove duplicates 
    - split the salary range to low and high and create an Average column to represent the salary for each individual row
    - Create a High salary column which i will find the median of the Average column and if the salary is high 
      then median, i will classify 1 in High salary if not 0.
        
- Build a basic modelling using just 1 column(Industry).
    -Examine the accuracy score, look at the features importance 
- Build a second modelling using 2 columns(Industry and Seniority)
    - Does the accuracy score increase significantly?
    - What are the features that give a higher weights to model?
        
- Build a last model on 4 columns(Industry, Seniority, JD and Req)
    - Use Random forest and logistic regression 
    - Optimize the model using Grid search


In [3]:
df = pd.read_csv('jobsearch.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Company,Location,Range,Monthly/Annual,Seniority,JD,Req,Industry
0,0,"Business Analytics Manager, E-commerce (Google...",COMPANY UNDISCLOSED,undisclosed,undisclosed,,Manager,Manages a team of business analysts executives...,Requirements\r\nMin. Bachelor Degree in Busine...,Marketing / Public Relations
1,1,Manufacturing Data Specialist,ARCSTONE PTE. LTD.,"ALEXANDRA 38, 38 ALEXANDRA TERRACE 119932",undisclosed,,"Professional, Junior Executive, Senior Executive",Why Arcstone?\r\nArcstone is a fast-growing st...,Requirements\r\nWhat are the requirements of t...,"Consulting , Engineering, Information Technolo..."
2,2,Business Relationship Manager,ALGOMERCHANT PTE. LTD.,"PAYA LEBAR SQUARE, 60 PAYA LEBAR ROAD 409051","$2,500to$3,500",Monthly,Executive,You would be responsible to maintain and grow ...,Requirements\r\nBSc/Ba in Banking and Finance ...,"Banking and Finance, Education and Training, I..."
3,3,Data Analyst,SANDBOX CONSULTING PTE. LTD.,"TRIVEX, 8 BURN ROAD 369977","$6,000to$6,500",Monthly,Professional,Collaborate with stakeholders to understand th...,"Requirements\r\nUnderstand key concepts, techn...",Information Technology
4,4,"strategic planning and projects, director",BLUECHIP PLATFORMS ASIA PTE. LTD.,undisclosed,undisclosed,,Manager,Our client is a boutique Corporate Bank and th...,Requirements\r\n**Apply here**\r\nhttps://www....,"Banking and Finance, Information Technology"


In [4]:
# Convert all to lowercase to remove duplicates
df = df.apply(lambda x: x.astype(str).str.lower())


In [5]:
# drop duplicates if the following 4 subets are the same
df.drop_duplicates(subset=['Title','Company', 'Location','Range'], inplace=True)

In [7]:
df

Unnamed: 0.1,Unnamed: 0,Title,Company,Location,Range,Monthly/Annual,Seniority,JD,Req,Industry
0,0,"business analytics manager, e-commerce (google...",company undisclosed,undisclosed,undisclosed,,manager,manages a team of business analysts executives...,requirements\r\nmin. bachelor degree in busine...,marketing / public relations
1,1,manufacturing data specialist,arcstone pte. ltd.,"alexandra 38, 38 alexandra terrace 119932",undisclosed,,"professional, junior executive, senior executive",why arcstone?\r\narcstone is a fast-growing st...,requirements\r\nwhat are the requirements of t...,"consulting , engineering, information technolo..."
2,2,business relationship manager,algomerchant pte. ltd.,"paya lebar square, 60 paya lebar road 409051","$2,500to$3,500",monthly,executive,you would be responsible to maintain and grow ...,requirements\r\nbsc/ba in banking and finance ...,"banking and finance, education and training, i..."
3,3,data analyst,sandbox consulting pte. ltd.,"trivex, 8 burn road 369977","$6,000to$6,500",monthly,professional,collaborate with stakeholders to understand th...,"requirements\r\nunderstand key concepts, techn...",information technology
4,4,"strategic planning and projects, director",bluechip platforms asia pte. ltd.,undisclosed,undisclosed,,manager,our client is a boutique corporate bank and th...,requirements\r\n**apply here**\r\nhttps://www....,"banking and finance, information technology"
5,5,"senior product manager (analytics, ai products...",company undisclosed,undisclosed,"$5,000to$10,000",monthly,"middle management, manager, professional",- prepare product business case\r\n- work clos...,"requirements\r\n- bsc/ba in computer science, ...",information technology
6,6,data scientist,company undisclosed,undisclosed,"$6,000to$8,000",monthly,executive,collaborate with stakeholders to understand th...,"requirements\r\nunderstand key concepts, techn...",information technology
7,7,software consultant,company undisclosed,undisclosed,"$6,000to$8,500",monthly,senior executive,total experience of 8 to 11 years which includ...,requirements\r\nmust have completed completed ...,information technology
8,8,"executive, digital analytics",carat media services singapore pte ltd,"guoco tower, 1 wallich street 078881","$3,500to$7,000",monthly,executive,"background:\r\niprospect is a global, award-wi...",requirements\r\nstrong logic & quantitative sk...,advertising / media
9,9,"manager / senior manager, digital analytics",carat media services singapore pte ltd,"guoco tower, 1 wallich street 078881","$5,000to$10,000",monthly,manager,"background:\r\niprospect is a global, award-wi...",requirements\r\nstrong logic & quantitative sk...,advertising / media


In [8]:
df.drop(columns ='Unnamed: 0', inplace=True)


In [9]:
#Examine any null values, left 645 rows after removing duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 645 entries, 0 to 693
Data columns (total 9 columns):
Title             645 non-null object
Company           645 non-null object
Location          645 non-null object
Range             645 non-null object
Monthly/Annual    645 non-null object
Seniority         645 non-null object
JD                645 non-null object
Req               645 non-null object
Industry          645 non-null object
dtypes: object(9)
memory usage: 50.4+ KB


In [10]:
# I am working on only 577 Salary rows, may not be sufficient but will work on it first
(df.Range != 'undisclosed').sum()

577

In [11]:
#Drop undisclosed salary range
df= df[df['Range'] != 'undisclosed']
 

In [13]:
# I want to create 2 columns for the range of salaries, low and high

df['Range'] = df['Range'].replace({'\$':''}, regex = True)
df = df.join(df['Range'].str.split('to', expand=True).rename(columns = {0:'Low', 1:'High'}))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
df.Low = df.Low.str.replace(',','').astype(int)
df.High = df.High.str.replace(',','').astype(int)

In [15]:
# creating an Average salary column as the salary for individual job.
df = df.assign(Average = df[['Low', 'High']].mean(axis=1))

In [16]:
# Converting the monthly salary to annually

df.loc[df['Monthly/Annual']== 'monthly', 'Low']= df.loc[df['Monthly/Annual']== 'monthly', 'Low'] *12
df.loc[df['Monthly/Annual']== 'monthly', 'High']= df.loc[df['Monthly/Annual']== 'monthly', 'High'] *12
df.loc[df['Monthly/Annual']== 'monthly', 'Average']= df.loc[df['Monthly/Annual']== 'monthly', 'Average'] *12

In [19]:
df = df.reset_index(drop=True)

In [20]:
df.drop(columns='Range', inplace= True)

In [21]:
# converting the JD and Req columns to continuos sentence instead of mutiple lines
df['JD'].replace('\r\n', '',inplace= True, regex = True)
df['Req'].replace('\r\n', ' ',inplace= True, regex = True)


In [52]:
df.JD[0]

'you would be responsible to maintain and grow the business by paying attention detail from business support to sales. it is important to have passion in the business, and love to communicate with people. a successful business relationship manager in our company uses business management, investing and marketing knowledge to meet our business kpi. you should also advise our management team on potential growth and strategic decisions in alignment with business objectives. ultimately, you should recommend, design and execute short-term and long-term business strategies for our company.therefore there are various aspects, you need to able to cover:50% business & customer supportassisting customer enquiriesconducting workshopconducting cost and benchmarking analysis and report30% salesorganizing seminars and workshopconducting face to face saleconducting tele-salesgenerating new business leadgenerating new customer lead10% marketing (online/offline)do bloggingpost facebook and google adsana

In [23]:
# Notice the high value has a $840000 annual salary, looks weird, have to investigate if not it will skew the result
df.describe()

Unnamed: 0,Low,High,Average
count,577.0,577.0,577.0
mean,70358.960139,116679.708839,93519.334489
std,31299.753901,60178.513895,43166.000658
min,4800.0,10200.0,7500.0
25%,48000.0,80400.0,65730.0
50%,60000.0,108000.0,84000.0
75%,84000.0,144000.0,117000.0
max,180000.0,840000.0,444000.0


In [24]:
df[df.High == 840000]

Unnamed: 0,Title,Company,Location,Monthly/Annual,Seniority,JD,Req,Industry,Low,High,Average
554,manager,social room concepts pte. ltd.,59 ubi avenue 1 408938,monthly,"senior management, manager",the manager/senior manager uses data analytics...,requirements ~ analyse data for business insig...,f&b,48000,840000,444000.0


In [25]:
# Manually change the high and average salary due to typo error
df.iloc[554, 9] = 84000


In [26]:
df.iloc[554, 10] = 66000

In [27]:
df.describe()

Unnamed: 0,Low,High,Average
count,577.0,577.0,577.0
mean,70358.960139,115369.483536,92864.221837
std,31299.753901,52089.04011,40631.641877
min,4800.0,10200.0,7500.0
25%,48000.0,80400.0,65730.0
50%,60000.0,108000.0,84000.0
75%,84000.0,144000.0,117000.0
max,180000.0,360000.0,270000.0


In [28]:
# Lowest salary seems fine as it is a intern position
df[df.High == 10200]

Unnamed: 0,Title,Company,Location,Monthly/Annual,Seniority,JD,Req,Industry,Low,High,Average
139,product development intern,invigor asia pte. ltd.,79 ayer rajah crescent 139955,monthly,fresh/entry level,product development intern: part-timeinvigor i...,requirements bout you: you are passionate abou...,information technology,4800,10200,7500.0


In [29]:
# Highest salary seems fine as it is a senior director position
df[df.High == 360000]

Unnamed: 0,Title,Company,Location,Monthly/Annual,Seniority,JD,Req,Industry,Low,High,Average
150,"senior director, global security",company undisclosed,undisclosed,monthly,non-executive,"as the senior director of global security, you...",requirements key skills and competencies •seas...,"engineering, manufacturing",180000,360000,270000.0
558,"svp, head of regional risk & control, institut...",dbs bank ltd.,undisclosed,monthly,"senior management, manager",job purpose responsible for overall ops risk c...,requirements university degree with at least 1...,banking and finance,180000,360000,270000.0


In [30]:
# Creating a binary High Salary column as the target variable
median_salary = df['Average'].median()
median_salary

84000.0

In [31]:
df['high_salary'] = [1 if s > median_salary else 0 for s in df.Average]

In [32]:
df['high_salary'].value_counts()

0    301
1    276
Name: high_salary, dtype: int64

In [34]:
print('Baseline accuracy: {} '.format(1 - np.mean(df.high_salary)))

Baseline accuracy: 0.5216637781629117 


Baseline accuracy is 52% to predict the salary will be below or above the median salary of $81000

Building my first model using dummy variable of Industry column

In [35]:
df_industry_dummy = pd.get_dummies(df['Industry'])
df_industry_dummy.head()

Unnamed: 0,accounting / auditing / taxation,"accounting / auditing / taxation, banking and finance","accounting / auditing / taxation, consulting , banking and finance, insurance","accounting / auditing / taxation, education and training, professional services","accounting / auditing / taxation, engineering, logistics / supply chain, others","accounting / auditing / taxation, information technology","accounting / auditing / taxation, marketing / public relations",admin / secretarial,"admin / secretarial, human resources",advertising / media,...,others,"others, professional services",personal care / beauty,professional services,public / civil service,purchasing / merchandising,risk management,sales / retail,sciences / laboratory / r&d,telecommunications
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
X = df_industry_dummy
y = df['high_salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=88)

In [37]:

model = RandomForestClassifier(n_estimators=100, oob_score=True)

model.fit(X_train, y_train)
predicted = model.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print('Out-of-bag score estimate: {} '.format(model.oob_score_))
print('Accuracy score: {}'.format(accuracy))

Out-of-bag score estimate: 0.6010362694300518 
Accuracy score: 0.6178010471204188


Industy feature seems to be predicting better then the baseline score, accuracy increase from 52% to 61%.


In [38]:
industry_importance = pd.DataFrame(model.feature_importances_, index = X_train.columns,
                                   columns = ['importance']).sort_values('importance', ascending = False)
industry_importance.head(10)

Unnamed: 0,importance
banking and finance,0.113569
sciences / laboratory / r&d,0.111101
consulting,0.078692
information technology,0.047668
education and training,0.033813
advertising / media,0.031434
f&b,0.031326
general management,0.01928
"accounting / auditing / taxation, banking and finance",0.017877
human resources,0.017522


In [None]:
The features importance matrix shows that banking and finance, sciences, laboratory and r&d carry higher importance
in this prediction. 

In [39]:
df.Seniority.value_counts()

professional                                                     161
executive                                                        105
manager                                                           93
senior executive                                                  47
middle management                                                 38
non-executive                                                     28
senior management                                                 22
junior executive                                                  17
middle management, manager                                        11
fresh/entry level                                                 10
manager, professional                                              6
executive, junior executive                                        4
senior management, manager                                         4
executive, senior executive                                        3
undisclosed                       

In [40]:
# For the Seniority column, some of them have multiple names, i will retain only the first.
df['Seniority'] = df['Seniority'].apply(lambda x: x.split(',')[0])


In [41]:
df.Seniority.value_counts()

professional         164
executive            113
manager              103
middle management     54
senior executive      47
senior management     32
non-executive         28
junior executive      18
fresh/entry level     15
undisclosed            3
Name: Seniority, dtype: int64

In [42]:
seniority_features = pd.get_dummies(df['Seniority'])
seniority_features.head()

Unnamed: 0,executive,fresh/entry level,junior executive,manager,middle management,non-executive,professional,senior executive,senior management,undisclosed
0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0


Building the second model using adding Seniority to industry dummy variables, accuracy score only increase slightly from 61% to 63%. It seems Seniority does not help in predicting the Salary. Perhaps the Seniority columns need to do more feature engineering, like combining the all executives as 1, likewise for management. 

In [43]:
#After adding the seniority features, the accuracy did not increase much

model2 = RandomForestClassifier(n_estimators=100, oob_score=True)

X_features = pd.concat([df_industry_dummy, seniority_features], axis=1)
y = df['high_salary']

X_fea_train, X_fea_test, y_fea_train, y_fea_test = train_test_split(X_features, y, test_size=0.33, random_state=100)
model2.fit(X_fea_train, y_fea_train)


ind_seniority_pred = model2.predict(X_fea_test)
accuracy = accuracy_score(y_fea_test, ind_seniority_pred)
print('Out-of-bag score estimate: {} '.format(model2.oob_score_))
print('Accuracy score: {}'.format(accuracy))

Out-of-bag score estimate: 0.6787564766839378 
Accuracy score: 0.6387434554973822


In [44]:
ind_seniority_importance = pd.DataFrame(model2.feature_importances_, index = X_fea_train.columns,
                                   columns = ['importance']).sort_values('importance', ascending = False)
ind_seniority_importance.head(10)

Unnamed: 0,importance
manager,0.073993
banking and finance,0.072229
sciences / laboratory / r&d,0.054534
information technology,0.053369
middle management,0.050868
executive,0.050125
senior management,0.038962
consulting,0.038473
marketing / public relations,0.034381
fresh/entry level,0.029989


In [None]:
Model 3 will require to count vectorize Job Title and Job description , i will set the max features as 30.
I will use logistic regression and random forest model

In [45]:
cvec_title = CountVectorizer(stop_words='english', max_features=30, ngram_range=(2,2))
title_vec = cvec_title.fit_transform(df['Title'].values)

df_vec  = pd.DataFrame(title_vec.todense(), columns=cvec_title.get_feature_names())
print(df_vec.shape)
df_vec.head()


(577, 30)


Unnamed: 0,assistant manager,associate engineer,banking big,big data,business analyst,consumer banking,data analyst,data analytics,data engineer,data scientist,...,principal engineer,product manager,research engineer,senior data,senior manager,senior product,senior software,software engineer,tech lead,vp avp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
lr = LogisticRegression()
lr = lr.fit(df_vec, y)
scores= cross_val_score(lr, df_vec, y, cv=10)
print('Cross-validated scores:', scores)
print('Average score:', scores.mean())
print('Standard deviation of score:', scores.std())


Cross-validated scores: [0.69491525 0.55172414 0.55172414 0.77586207 0.79310345 0.74137931
 0.56140351 0.54385965 0.64912281 0.59649123]
Average score: 0.645958555066802
Standard deviation of score: 0.09371202888506953


In [None]:
Building a Logistic Regression model on jobtitle alone to see if any job title can give me insights on predicting
salary. The accuracy score is quite good, the score of 64% is equivalent to Industry and Seniority combined.

In [48]:
cvec_title.get_feature_names()

['assistant manager',
 'associate engineer',
 'banking big',
 'big data',
 'business analyst',
 'consumer banking',
 'data analyst',
 'data analytics',
 'data engineer',
 'data scientist',
 'deep learning',
 'engineer android',
 'engineer backend',
 'engineer frontend',
 'engineer fullstack',
 'engineer ios',
 'group consumer',
 'i2r star',
 'intern associate',
 'marketing executive',
 'principal engineer',
 'product manager',
 'research engineer',
 'senior data',
 'senior manager',
 'senior product',
 'senior software',
 'software engineer',
 'tech lead',
 'vp avp']

In [49]:
lr.coef_[0]

array([-0.76646457, -2.01611692,  0.36798343,  0.86039079, -0.57169021,
        1.18169489, -0.11530631,  0.2865813 , -1.02960305,  0.11288433,
       -0.18706298,  0.46394555,  0.27480815,  0.46394555,  0.46394555,
        0.46394555,  0.29222498, -1.57381383, -0.70255528, -1.29147422,
        1.34462508,  0.80312218, -0.86007192,  1.46210483,  1.2322598 ,
       -0.28423417,  0.80511955,  1.14382091,  1.74718927,  0.77756859])

In [53]:
title_coef= list(zip(cvec_title.get_feature_names(), lr.coef_[0]))
title_coef

[('assistant manager', -0.7664645652268465),
 ('associate engineer', -2.01611691979756),
 ('banking big', 0.3679834300448122),
 ('big data', 0.8603907897965788),
 ('business analyst', -0.5716902074055272),
 ('consumer banking', 1.1816948891407628),
 ('data analyst', -0.11530630822198913),
 ('data analytics', 0.2865813002124379),
 ('data engineer', -1.0296030454244718),
 ('data scientist', 0.11288433238382552),
 ('deep learning', -0.1870629823740055),
 ('engineer android', 0.4639455498265709),
 ('engineer backend', 0.2748081545902683),
 ('engineer frontend', 0.4639455498265709),
 ('engineer fullstack', 0.4639455498265709),
 ('engineer ios', 0.4639455498265709),
 ('group consumer', 0.2922249779589898),
 ('i2r star', -1.5738138333852736),
 ('intern associate', -0.7025552785816193),
 ('marketing executive', -1.291474218618601),
 ('principal engineer', 1.3446250774553572),
 ('product manager', 0.8031221764481857),
 ('research engineer', -0.86007191852812),
 ('senior data', 1.462104826395172

By looking at the coefficents , the positive coefficients indicate that the job being in
big data, product manager, senior data, senior software and tech lead will be more likely to being a job 
paying over our median level, whilst the negative coefficients of associate engineer, data engineer, i2r star,
marketing executive, research engineer indicate that the job is more likely to be below our median level.

The reason of some coefficients to be abnormally large perhaps is due to the small in sample size. 

In [54]:
cvec_jd = CountVectorizer(stop_words='english', max_features=30)
jd_vec = cvec_jd.fit_transform(df['JD']).toarray()
df_jd  = pd.DataFrame(jd_vec, columns=cvec_jd.get_feature_names())
df_jd.head()

Unnamed: 0,analytics,business,clients,customer,data,design,develop,development,digital,ensure,...,provide,research,role,services,solutions,support,team,teams,technology,work
0,0,12,0,3,0,1,0,0,0,0,...,0,0,0,0,0,1,2,0,0,0
1,1,0,0,0,3,1,0,0,0,0,...,0,0,0,0,2,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
3,1,0,0,0,3,1,0,0,0,0,...,0,0,0,0,2,1,0,0,0,0
4,1,7,0,0,9,4,2,3,0,1,...,1,0,0,0,1,1,2,1,0,2


In [55]:
X_final = pd.concat([df_vec, df_jd, X_features], axis=1)
X_final.shape

(577, 151)

In [56]:
param_grid = {'n_estimators': [50, 100, 200],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [10, 20, 30, 40, 50, 60],
               'min_samples_split': [ 2, 4],
               'min_samples_leaf': [2, 5, 10]}

In [60]:
# Using grid search to optimize the random forest model, the final score is 74% which has increase a lot
#compared to the baseline score.

rf = RandomForestClassifier()
y = df['high_salary']

X_final_train, X_final_test, y_final_train, y_final_test = train_test_split(X_final, y, test_size=0.33, random_state=99)

rf_gridsearch = GridSearchCV(estimator = rf, param_grid = param_grid, 
                             cv = 3, n_jobs = -1)

rf_gridsearch.fit(X_final_train, y_final_train)


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60], 'min_samples_split': [2, 4], 'min_samples_leaf': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [61]:
prediction = rf_gridsearch.predict(X_final_test)
accuracy = accuracy_score(y_final_test, prediction)

print('Accuracy score: {}'.format(accuracy))

Accuracy score: 0.743455497382199


In [94]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_final_test, prediction)
print(confusion_matrix)
print(classification_report(y_final_test, prediction))


[[80 29]
 [20 62]]
             precision    recall  f1-score   support

          0       0.80      0.73      0.77       109
          1       0.68      0.76      0.72        82

avg / total       0.75      0.74      0.74       191



In [None]:
From the confusion matrix, it shows the model captured most of the True positive and True Negative which means
the model is able to predict most of the salary that is higher or lower then the median salary. 
Precision and Recall is above 0.74 which also another indicator that the model is doing good.

# QUESTION 2: Factors that distinguish job category

Using the job postings you scraped for part 1 (or potentially new job postings from a second round of scraping), identify features in the data related to job postings that can distinguish job titles from each other. There are a variety of interesting ways you can frame the target variable, for example:
- What components of a job posting distinguish data scientists from other data jobs?
- What features are important for distinguishing junior vs. senior positions?
- Do the requirements for titles vary significantly with industry (e.g. healthcare vs. government)?

You may end up making multiple classification models to tackle different questions. Be sure to clearly explain your hypotheses and framing, any feature engineering, and what your target variables are. The type of classification model you choose is up to you. Be sure to interpret your results and evaluate your models' performance.

I intend to combine data analysis, analytics and scientist as my target variable due to insufficent data. 
I will use 'Req' which represent the skills for the job title as my predictor variable to predict the job title.

In [96]:
df['data_analyst'] = df['Title'].str.contains('analyst|analytics|scientist').apply(lambda x:1 if x == True else 0)

In [97]:
df['data_analyst'].value_counts()

0    416
1    161
Name: data_analyst, dtype: int64

In [98]:
print('Baseline accuracy: {} '.format(1 -np.mean(df.data_analyst)))

Baseline accuracy: 0.7209705372616985 


In [99]:
df.head(1)

Unnamed: 0,Title,Company,Location,Monthly/Annual,Seniority,JD,Req,Industry,Low,High,Average,high_salary,data_analyst
0,business relationship manager,algomerchant pte. ltd.,"paya lebar square, 60 paya lebar road 409051",monthly,executive,you would be responsible to maintain and grow ...,requirements bsc/ba in banking and finance or ...,"banking and finance, education and training, i...",30000,42000,36000.0,0,0


In [100]:
cvec_req = CountVectorizer(stop_words='english', max_features= 100)
req_vec = cvec_req.fit_transform(df['Req']).toarray()
df_req  = pd.DataFrame(req_vec, columns=cvec_req.get_feature_names())
df_req.head()

Unnamed: 0,ability,able,advantage,agile,analysis,analytical,analytics,applications,bachelor,background,...,time,tools,understanding,using,verbal,web,work,working,written,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#Building a Random Forest model using the skils requirment to predict the job title 

In [126]:
rf = RandomForestClassifier(class_weight="balanced")
X_req = df_req 
y_req = df['data_analyst'].values

X_req_train, X_req_test, y_req_train, y_req_test = train_test_split(X_req, y_req, test_size=0.33, random_state=77)

rf.fit(X_req_train, y_req_train)


req_pred = rf.predict(X_req_test)
accuracy = accuracy_score(y_req_test, req_pred)

print('Accuracy score: {}'.format(accuracy))


Accuracy score: 0.8115183246073299


In [112]:
skills_importance = pd.DataFrame(rf.feature_importances_, index = X_req_train.columns,
                                   columns = ['importance']).sort_values('importance', ascending = False)
skills_importance.head(10)

Unnamed: 0,importance
analysis,0.055445
machine,0.048129
analytics,0.047039
python,0.043042
data,0.036707
solving,0.027564
experience,0.02268
learning,0.020939
years,0.020121
tools,0.020016


In [113]:
lr = LogisticRegression()
lr = lr.fit(X_req_train, y_req_train)
print('Cross-validated scores:', scores)
print('Average score:', scores.mean())
print('Standard deviation of score:', scores.std())


Cross-validated scores: [0.69491525 0.55172414 0.55172414 0.77586207 0.79310345 0.74137931
 0.56140351 0.54385965 0.64912281 0.59649123]
Average score: 0.645958555066802
Standard deviation of score: 0.09371202888506953


Summary:
    - I have used Random forest throughout the modelling because it is also robust to outliers,
    it works well with classification problem and do not need to normalize the numerical features.
    It could handle inbalanced class problem by introducing class weight= 'balanced' to give bigger weights
    to minority class. Overall, it give a good accuracy and quite dynamic.
    
    - As for the features that give higher importance to the prediction are skills 
    like analysis, machine, python and data, it gives higher probability of classifying the job title
    of data analayst/data scientist
    
    