In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
df = pd.read_csv('final_df')
df.reset_index(drop=True, inplace=True)
df.drop(columns=['Unnamed: 0'], inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2104 entries, 0 to 2103
Data columns (total 6 columns):
job_category        2104 non-null object
job_title           2104 non-null object
company_name        2104 non-null object
location            2078 non-null object
summary             2104 non-null object
salary_high_tier    2104 non-null int64
dtypes: int64(1), object(5)
memory usage: 98.7+ KB


In [3]:
df

Unnamed: 0,job_category,job_title,company_name,location,summary,salary_high_tier
0,data_scientist,data scientist,indeed,,significant prior success as a data scientist ...,1
1,data_scientist,data scientist,capita singapore,,data scientist data scientist needed to impr...,1
2,data_scientist,growth strategy operations strategic planning...,wework,,ensuring data quality minimum years of experi...,0
3,data_scientist,data scientist,gateway search pte ltd,,assess the effectiveness and accuracy of new d...,1
4,data_scientist,data scientists machine learning,biofourmis singapore,singapore,knowledge in big data technologies including c...,0
5,data_scientist,data scientist aml group customer analytics ...,ocbc bank,singapore,data scientist aml group customer analytics ...,1
6,data_scientist,data engineer data science,twitter,singapore,data engineers work alongside data scientists ...,0
7,data_scientist,data scientist,zyllem,singapore,interpreting data analyzing results using stat...,1
8,data_scientist,data scientist,lenddoefl,singapore,proven experience in data manipulation as a da...,1
9,data_scientist,data scientist,cxa group pte. limited,singapore,leverage data visualization techniques and too...,1


In [4]:
data_jobs = df[df.job_title.str.contains('data')]
non_data_jobs = df[~df.job_title.str.contains('data')]
data_jobs.shape

(667, 6)

In [5]:
ana_jobs = non_data_jobs[non_data_jobs.job_title.str.contains('analyst')]
non_ana_jobs = non_data_jobs[~non_data_jobs.job_title.str.contains('analyst')]
ana_jobs.shape

(1109, 6)

In [6]:
eng_jobs = non_ana_jobs[(non_ana_jobs.job_title.str.contains('engineer')) | (non_ana_jobs.job_title.str.contains('database'))]
non_eng_jobs = non_ana_jobs[(~non_ana_jobs.job_title.str.contains('engineer')) & (~non_ana_jobs.job_title.str.contains('database'))]
eng_jobs.shape

(54, 6)

In [7]:
man_jobs = non_eng_jobs[non_eng_jobs.job_title.str.contains('manager')]
non_man_jobs = non_eng_jobs[~non_eng_jobs.job_title.str.contains('manager')]
man_jobs.shape

(38, 6)

In [30]:
df_title = pd.DataFrame()
data_jobs.job_title = data_jobs.job_title.map(lambda x: 'data_jobs')
ana_jobs.job_title = ana_jobs.job_title.map(lambda x: 'analyst_jobs')
eng_jobs.job_title = eng_jobs.job_title.map(lambda x: 'engineer_jobs')
man_jobs.job_title = man_jobs.job_title.map(lambda x: 'manager_jobs')
non_man_jobs.job_title = non_man_jobs.job_title.map(lambda x: 'other_jobs')

In [31]:
df_title['job_titles'] = pd.concat([data_jobs.job_title,ana_jobs.job_title,
                        eng_jobs.job_title,man_jobs.job_title,
                        non_man_jobs.job_title], ignore_index=True)

In [32]:
final_df = pd.concat([df[['company_name','summary','salary_high_tier']], df_title.job_titles], axis=1)

In [33]:
# convert job_title into numbers
final_df.job_titles = final_df.job_titles.map(lambda x: 1 if x == 'data_jobs' else
                                             2 if x == 'analyst_jobs' else
                                             3 if x == 'engineer_jobs' else
                                             4 if x == 'manager_jobs' else 5)

In [34]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2104 entries, 0 to 2103
Data columns (total 4 columns):
company_name        2104 non-null object
summary             2104 non-null object
salary_high_tier    2104 non-null int64
job_titles          2104 non-null int64
dtypes: int64(2), object(2)
memory usage: 65.8+ KB


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [36]:
# Get TFIDF for company name
job_company_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_company_tvec.fit(final_df.company_name)
job_company_tvec_df = pd.DataFrame(job_company_tvec.transform(final_df.company_name).todense(),
                       columns=['company_[' + f + ']' for f in job_company_tvec.get_feature_names()])

In [37]:
# Get TFIDF for job summary
job_summary_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_summary_tvec.fit(final_df.summary)
job_summary_tvec_df = pd.DataFrame(job_summary_tvec.transform(final_df.summary).todense(),
                       columns=['summary_[' + f + ']' for f in job_summary_tvec.get_feature_names()])

In [38]:
X = pd.concat([final_df[['salary_high_tier']], job_company_tvec_df, job_summary_tvec_df], axis=1)
y = final_df['job_titles'].values.ravel()

In [39]:
# Get training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
# Standardize predictors
X_train_ss = StandardScaler().fit_transform(X_train)
X_test_ss = StandardScaler().fit_transform(X_test)

In [41]:
X_train_ss = pd.DataFrame(X_train_ss, columns=X_train.columns)
X_test_ss = pd.DataFrame(X_test_ss, columns=X_train.columns)

In [42]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C
parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train_ss, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))

GRID SEARCH:
Best parameters set:
	C: 0.0774263682681127
	penalty: 'l1'
	solver: 'liblinear'


In [43]:
print ("Logistic Regression with best parameter:")
clf = LogisticRegression(**lr_best_parameters)
clf.fit(X_train_ss, y_train)
lr_gs_pred = clf.predict(X_test_ss)
print(metrics.classification_report(y_test, lr_gs_pred, labels=[1,2,3,4,5],
                                    target_names=['data jobs','analyst jobs','engineer jobs','manager jobs','other jobs']))

Logistic Regression with best parameter:
               precision    recall  f1-score   support

    data jobs       0.71      0.22      0.33       219
 analyst jobs       0.54      0.96      0.69       318
engineer jobs       0.00      0.00      0.00        17
 manager jobs       0.00      0.00      0.00         8
   other jobs       0.00      0.00      0.00        70

  avg / total       0.52      0.56      0.46       632



  'precision', 'predicted', average, warn_for)


In [44]:
from sklearn.tree import DecisionTreeClassifier

In [45]:
# gridsearch params
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

# set the gridsearch
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_params, cv=5, verbose=1)

In [46]:
# use the gridsearch C model to fit the data
dtc_gs.fit(X_train_ss, y_train)

Fitting 5 folds for each of 385 candidates, totalling 1925 fits


[Parallel(n_jobs=1)]: Done 1925 out of 1925 | elapsed:   11.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 2, 3, 4], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [47]:
# Best Estimator
dtc_best = dtc_gs.best_estimator_
print(dtc_gs.best_params_)
print(dtc_gs.best_score_)

{'max_depth': 4, 'max_features': None, 'min_samples_split': 10}
0.5998641304347826


In [48]:
pred = dtc_best.predict(X_test_ss)
print(metrics.classification_report(y_test, pred, labels=[1,2,3,4,5], target_names=['data jobs','analyst jobs','engineer jobs','manager jobs','other jobs']))

               precision    recall  f1-score   support

    data jobs       0.68      0.22      0.34       219
 analyst jobs       0.54      0.95      0.69       318
engineer jobs       0.00      0.00      0.00        17
 manager jobs       0.00      0.00      0.00         8
   other jobs       1.00      0.03      0.06        70

  avg / total       0.62      0.56      0.47       632



  'precision', 'predicted', average, warn_for)


In [49]:
fi = pd.DataFrame({
        'feature':X_train_ss.columns,
        'importance':dtc_best.feature_importances_
    })

fi.sort_values('importance', ascending=False, inplace=True)
fi.head(10)

Unnamed: 0,feature,importance
44,summary_[scientist],0.543288
27,summary_[analyst],0.127383
43,summary_[science],0.126711
33,summary_[data analytics],0.065149
24,company_[technologies],0.052376
35,summary_[experience],0.036833
29,summary_[business],0.020528
41,summary_[requirements],0.018492
32,summary_[data analyst],0.009241
36,summary_[financial],0.0


In [50]:
coef = lr_grid_search.best_estimator_.coef_

lr_coef_data = pd.DataFrame({'coef':coef[0],
                    'mag':np.abs(coef[0]),
                    'pred':X_test.columns})

lr_coef_analyst = pd.DataFrame({'coef':coef[1],
                    'mag':np.abs(coef[1]),
                    'pred':X_test.columns})

lr_coef_engineer = pd.DataFrame({'coef':coef[2],
                    'mag':np.abs(coef[2]),
                    'pred':X_test.columns})

lr_coef_manager = pd.DataFrame({'coef':coef[3],
                    'mag':np.abs(coef[3]),
                    'pred':X_test.columns})

lr_coef_other = pd.DataFrame({'coef':coef[4],
                    'mag':np.abs(coef[4]),
                    'pred':X_test.columns})

lr_coef_data.sort_values('mag', ascending=False, inplace=True)
lr_coef_analyst.sort_values('mag', ascending=False, inplace=True)
lr_coef_engineer.sort_values('mag', ascending=False, inplace=True)
lr_coef_manager.sort_values('mag', ascending=False, inplace=True)
lr_coef_other.sort_values('mag', ascending=False, inplace=True)

In [51]:
# Top predictors for data jobs
lr_coef_data.head(10)

Unnamed: 0,coef,mag,pred
44,0.710262,0.710262,summary_[scientist]
27,-0.38563,0.38563,summary_[analyst]
43,0.264716,0.264716,summary_[science]
29,0.199496,0.199496,summary_[business]
9,-0.157887,0.157887,company_[google]
33,0.155794,0.155794,summary_[data analytics]
39,-0.121689,0.121689,summary_[management]
46,-0.073316,0.073316,summary_[systems]
37,-0.064041,0.064041,summary_[insights]
19,0.057494,0.057494,company_[singapore]


In [52]:
# Top predictors for analyst jobs
lr_coef_analyst.head(10)

Unnamed: 0,coef,mag,pred
44,-0.665745,0.665745,summary_[scientist]
27,0.179863,0.179863,summary_[analyst]
43,-0.178767,0.178767,summary_[science]
9,0.075656,0.075656,company_[google]
37,0.070916,0.070916,summary_[insights]
31,0.05344,0.05344,summary_[data analysis]
39,0.049047,0.049047,summary_[management]
16,-0.044687,0.044687,company_[pte]
33,-0.044463,0.044463,summary_[data analytics]
47,-0.041786,0.041786,summary_[team]


In [53]:
# Top predictors for engineer jobs
lr_coef_engineer.head(10)

Unnamed: 0,coef,mag,pred
0,0.0,0.0,salary_high_tier
38,0.0,0.0,summary_[looking]
28,0.0,0.0,summary_[analytics]
29,0.0,0.0,summary_[business]
30,0.0,0.0,summary_[business analyst]
31,0.0,0.0,summary_[data analysis]
32,0.0,0.0,summary_[data analyst]
33,0.0,0.0,summary_[data analytics]
34,0.0,0.0,summary_[data scientist]
35,0.0,0.0,summary_[experience]


In [54]:
# Top predictors for manager jobs
lr_coef_manager.head(10)

Unnamed: 0,coef,mag,pred
4,0.19995,0.19995,company_[bank]
38,0.052768,0.052768,summary_[looking]
9,0.017682,0.017682,company_[google]
2,0.007842,0.007842,company_[asia]
0,0.0,0.0,salary_high_tier
39,0.0,0.0,summary_[management]
29,0.0,0.0,summary_[business]
30,0.0,0.0,summary_[business analyst]
31,0.0,0.0,summary_[data analysis]
32,0.0,0.0,summary_[data analyst]


In [55]:
# Top predictors for other jobs
lr_coef_other.head(10)

Unnamed: 0,coef,mag,pred
29,-0.188175,0.188175,summary_[business]
32,0.139338,0.139338,summary_[data analyst]
16,0.138465,0.138465,company_[pte]
14,0.108968,0.108968,company_[limited]
33,-0.090046,0.090046,summary_[data analytics]
24,0.085544,0.085544,company_[technologies]
30,0.085246,0.085246,summary_[business analyst]
44,-0.081667,0.081667,summary_[scientist]
1,0.078941,0.078941,company_[ambition]
42,-0.067197,0.067197,summary_[research]


In [None]:
# That's all I can muster up, I feel very inadequate hahaha, not enough competency to do this within the time. :(