In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
import statsmodels.formula.api as sm
import patsy
import itertools
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, GridSearchCV,learning_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.pipeline import Pipeline

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from string import printable

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yesplum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Ingestion

In [4]:
import glob, os

extension = 'csv' 
src_path = '../data_src/'
output_path = '../output/'

all_filenames = [i for i in glob.glob('{}*.{}'.format(src_path,extension))] 
print(all_filenames)

#combine all files in the list 

df_raw = pd.concat([pd.read_csv(f, encoding='unicode escape',skiprows=0) for f in all_filenames ]) 
df_raw.reset_index(inplace=True) 
df_raw = df_raw.drop(columns=['index','Unnamed: 0']) 

['../data_src/DS_27Mar2020a.csv', '../data_src/DS_27Mar2020b.csv', '../data_src/ML_27Mar2020b.csv', '../data_src/ML_27Mar2020a.csv', '../data_src/AI_26Mar2020.csv']


## Data Cleaning

In [5]:
def data_clean(df):
    
    #remove duplicate based on Job ID
    job_clean = df_raw.drop_duplicates(subset='Job_Id', keep='first')
    #drop Salary_Type due to only one unique value 'Monthly'
    job_clean = job_clean.drop(columns='Salary_Type')
    #remove job without title
    job_no_title = job_clean['Job_Title'] == ''
    job_clean = job_clean[~job_no_title]
    #remove row with all NaN value
    job_clean[job_clean.isnull().any(axis=1)]
    job_clean = job_clean.dropna()
    
    #perform data cleaning on every row and columms
    clean_list = "(\[|\]|b'|Requirements|'|amp;|xa0|\\\|xe2x80x93|\\n|div class=|div class=|span class=|dib|lh-solid|/span|f5-5 i fw4 gray|f5 fw4 ph1|<|>|/div|\")"
    for col in job_clean.columns.difference(['Requirements']):
        job_clean[col]=job_clean[col].str.replace(clean_list, "")

    #space remain for Requirements column    
    job_clean['Requirements']=job_clean['Requirements'].str.replace(clean_list, " ")

    #remove all non-ascii char except punctuation, digits, ascii_letters and whitespace
    job_clean['Requirements'] = job_clean['Requirements'].apply(lambda y: ''.join(filter(lambda x: x in printable, y)))
    
    #further remove job with same data from all columns
    job_clean = job_clean.drop_duplicates(subset=job_clean.columns, keep='first') 
        
    #further filter on job title with specific keywords
    title_key = ['DATA', 'MACHINE','ANALYST','MACHINE LEARNING','ANALYTICS', "SCIENCE", '4.0','APPLICATION'
             'DEEP LEARNING','RESEARCH','NLP', 'ARTIFICIAL', "INTELLIGENT", 'AI', 'SCIENTIST','SYSTEM'
             'Industry', 'IOT', 'FINANCE', 'FINTECH', 'SOFTWARE', 'ENGINEER', 'ENGINEERING','PROFESSOR'
             'BUSINESS', 'DEVELOPER', 'INDUSTRIAL','AUTOMATION', 'CLOUD','SOLUTION','ARCHITECT',
             'MANAGER','VP','PRESIDENT', 'TECHNOLOGY', 'SPECIALIST', 'TECHNICAL','LEAD','TECHNOLOGIST']
    key = '|'.join(title_key)
    data_job = job_clean['Job_Title'].str.upper().str.contains(key)
    job_clean = job_clean[data_job]

    #remove job title with unwanted keywords
    title_key = ['PHYSIOTHERAPIST','ACCOUNT','AUDIT','COUNSEL','EXECUTIVE','SALES','GENERAL','MARKET',
                 'ELECTRICAL','BUSINESS','ADMIN','CUSTOMER','OFFICER','OPERATION', 'MECHANICAL','CHEMICAL',
                 'COORDINATOR','LECTURER','TECHNICIAN']
    key = '|'.join(title_key)
    non_data_job = job_clean['Job_Title'].str.upper().str.contains(key)
    job_clean = job_clean[~non_data_job]
    
    #remove job with multiple category
    cat_list = "(/|and)"
    job_clean['Category']=job_clean['Category'].str.replace(cat_list, ",")
    job_clean['Cat_num'] = job_clean['Category'].str.count(',')
    
    multiple_cat = job_clean['Cat_num']>5
    job_clean = job_clean[~multiple_cat]
    job_clean = job_clean.drop(columns='Cat_num')
    
    #remove job with no or multiple seniority
    senior_rule = (job_clean['Seniority'].str.count(',')>=1) | (job_clean['Seniority']=='')
    job_clean = job_clean[~senior_rule]

    
    #remove job cat with specific keywords
    rare_cat_key = ['HUMAN','SOCIAL','THERAPY','TAXATION','CUSTOMER','INTERIOR', 'ADMIN','BUILDING',
                    'SECRETARIAL','INVESTIGATION', 'AUDITING', 'ENVIRONMENT','SALES', 'MARKETING',
                    'ADVERTISING','CONSTRUCTION', 'DESIGN','LEGAL','HOSPITALITY','PROFESSIONAL']
    key = '|'.join(rare_cat_key)
    rare_cat = job_clean['Category'].str.upper().str.contains(key)
    job_clean = job_clean[~rare_cat]
    
    #remove row without salary
    no_salary = job_clean['Salary_Range'].str.contains('Salary undisclosed')
    df_salary = job_clean[~no_salary]
    df_no_salary = job_clean[no_salary]
    df_salary = df_salary.reset_index(drop=True)
    
    req_empty = []

    for i in range (len(df_salary)):
    
        if((len(df_salary['Requirements'][i]))<5):
            req_empty.append(i)
           
    #clean & remove row without requirements
    df_salary['Requirements']=df_salary['Requirements'].str.replace('(\n)', "")
    df_salary = df_salary.drop(req_empty)
    df_salary = df_salary.reset_index(drop=True)

    return df_salary

In [6]:
df = data_clean(df_raw)
df.shape

(458, 10)

## Feature Engineering

In [7]:
def salary_feature(df):
    
    #extract salary columns due to contain multiple information
    salary_range = df["Salary_Range"].str.split("to", n = 2, expand = True) 

    #Give columns name to the dataframe
    salary_range = salary_range.rename({0:'Min_Salary',1:'Max_Salary'}, axis='columns')

    #removed $ and , from salary 
    for col in salary_range.columns:
        salary_range[col]=salary_range[col].str.replace('(\$|,)', '')

    #convert from ojbect to float for statistical infomation
    salary_range['Min_Salary'] = salary_range['Min_Salary'].astype('float64')
    salary_range['Max_Salary'] = salary_range['Max_Salary'].astype('float64')
    
    #concat min_max salary dataframe with salary range dataframe
    df_salary1 = pd.concat([df, salary_range], axis=1)
    df_salary1 = df_salary1.drop(columns='Salary_Range')  
    
    #create a condition to check for high outliers
    abovemean_min = round(10*np.mean(df_salary1['Min_Salary']),0)
    abovemean_max = round(10*np.mean(df_salary1['Max_Salary']),0)
    
    #convert yearly salary into monthly salary

    df_salary1['Min_Salary'] = np.where((df_salary1['Min_Salary'] > abovemean_min),
                                    round((df_salary1['Min_Salary']/12),0), df_salary1['Min_Salary'])

    df_salary1['Max_Salary'] = np.where((df_salary1['Max_Salary'] > abovemean_min),
                                    round((df_salary1['Max_Salary']/12),0), df_salary1['Max_Salary'])
    
    #drop unrealistic min and max monthly salary range (which is more than 10 times)
    min_max_abnormal = (df_salary1['Max_Salary']>10*df_salary1['Min_Salary'])
    df_salary1 = df_salary1[~min_max_abnormal]
    
    #drop job with max salary less than 2500, assuming data entry/admin/operator job
    low_sal = ((df_salary1['Min_Salary']<=1800) | (df_salary1['Max_Salary']<=2500))
    df_salary1 = df_salary1[~low_sal]
    
    #create new feature for average salary
    df_salary1['Avg_Salary'] = (df_salary1['Min_Salary'] + df_salary1['Max_Salary']) / 2
    
    #drop job with outlier salary
    salary_outlier = ((df_salary1['Avg_Salary']>20000) | (df_salary1['Avg_Salary']<3000))
    df_salary1 = df_salary1[~salary_outlier]
    
    #bin salary into 4 groups:
    #3000 to 5500 - Low
    #5500 to 8000 - Med
    #8000 and above - High

    bins = [3000, 4500, 6000, np.inf]
    names = ['Low', 'Med', 'High']

    df_salary1['Salary_range'] = pd.cut(df_salary1['Avg_Salary'], bins, labels=names)
    df_salary1 = df_salary1.reset_index(drop=True)
    
    return df_salary1

In [8]:
df1 = salary_feature(df)
df1.shape

(457, 13)

In [9]:
def emp_type(df):

    
    #remove others employment type
    type_key = ['PART TIME','TEMPORARY','INTERNSHIP','FLEXI','FREELANCE']
    key = '|'.join(type_key)
    non_type = df['Emp_Type'].str.upper().str.contains(key)
    df = df[~non_type]
    df = df.reset_index(drop=True)

    #consolidate employment type
    consolidate = "(Full Time|Permanent, Full Time)"
    df['Emp_Type']=df['Emp_Type'].str.replace(consolidate, "Permanent")

    consolidate = "(Contract, Full Time)"
    df['Emp_Type']=df['Emp_Type'].str.replace(consolidate, "Contract")

    consolidate = "(Contract, Permanent, Full Time)"
    df['Emp_Type']=df['Emp_Type'].str.replace(consolidate, "Cont_Perm")
    
    return df

In [10]:
df1 = emp_type(df1)
df1.shape

(453, 13)

In [11]:
def seniority(df):
    
    #consolidate seniority from 9 groups to 4 groups
    
    df['Seniority'] = np.where((df['Seniority'] == 'Fresh/entry level') | (df['Seniority'] == 'Non-executive') | (df['Seniority'] == 'Junior Executive'),
                                 'Jr Executive', df['Seniority'])
    df['Seniority'] = np.where((df['Seniority'] == 'Executive') | (df['Seniority'] == 'Senior Executive'),
                                 'Sr Executive', df['Seniority'])
    df['Seniority'] = np.where((df['Seniority'] == 'Manager') | (df['Seniority'] == 'Middle Management') | (df['Seniority'] == 'Senior Management'),
                                 'Management', df['Seniority'])   
    return df

In [12]:
#df1 = seniority(df1)
#df1.shape

In [13]:
def cat_name(df):
    
    stacked = pd.DataFrame(df['Category'].str.split(',').tolist()).stack()
    cat_count = pd.DataFrame(stacked.value_counts(), columns=['Count']).reset_index()
    cat_count1 = []

    for i in range (len(cat_count)):
        cat_count1.append(cat_count['index'][i].lstrip())
    
    cat_name = list(dict.fromkeys(cat_count1))
    return cat_name

In [14]:
print(cat_name(df1))

['Laboratory ', 'R&D', 'Information Technology', 'Sciences ', 'Engineering', 'Finance', 'Banking ', 'Civil Service', 'Public ', 'Consulting', 'Telecommunications', 'Others', 'Manufacturing', 'Pharmaceutical', 'Healthcare ', 'Education ', 'Training', 'Insurance', 'General Management', 'Supply Chain', 'Logistics ', 'Risk Management']


In [15]:
def clean_kie(df):

    #extract only number from string
    df['Year_Experience'] = df['Year_Experience'].str.extract('(\d+)')
    
    #remove comma from cell with string
    clean_list = "(,|;|â||¦|®|)"
    for col in df.columns.difference(['Year_Experience','Min_Salary','Max_Salary','Avg_Salary']):
        df[col]=df[col].str.replace(clean_list, "")
        
    #remove extra whitespace between string
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    
    #fill NaN in year of experience with 0
    #df['Year_Experience'] = df['Year_Experience'].fillna(0)
    return df

In [16]:
df_clean = clean_kie(df1)
print(df_clean.shape)
df_clean.head(2)

(453, 13)


Unnamed: 0,Job_Id,Emp_Type,Job_Title,Company,Date_Posted,Year_Experience,Seniority,Category,Requirements,Min_Salary,Max_Salary,Avg_Salary,Salary_range
0,MCF-2020-0045240,Contract Permanent,Scientist (Machine Intellection) I2R,A*STAR RESEARCH ENTITIES,Posted 27 Mar 2020,,Professional,Sciences Laboratory R&D,Developing enhancing automating and managing a...,4500.0,9000.0,6750.0,High
1,MCF-2020-0045289,Contract Permanent,Scientist (Machine Intellection) I2R,A*STAR RESEARCH ENTITIES,Posted 27 Mar 2020,,Professional,Sciences Laboratory R&D,Protein nodes discovering in cancer Biomarker ...,4500.0,9000.0,6750.0,High


In [17]:
def job_cat(df):
    
    cat_list = ['Information Technology', 'Telecommunications', 'Engineering','Sciences', 'Finance',
                'Healthcare','Management','Consulting','Logistics', 'Civil', 'Others']
    
    for cat in cat_list:
        df[cat] = np.where(df['Category'].str.contains(cat),1,0)
    
    return df

In [18]:
def edu_cat(df):
    
    df['Requirements'] = df['Requirements'].str.lower()
    
    edu_list = ['phd','doctor','master','degree','computer science','engineering','statistic','math',
               'computer engineering','business','ph.d']
    
    for edu in edu_list:
        df[edu] = np.where(df['Requirements'].str.contains(edu),1,0)
    
    return df

In [19]:
def skill_cat(df):
    
    df['Requirements'] = df['Requirements'].str.lower()
    
    skills_list = [ 'python','java','scala','hadoop','sql','spark','tensorflow','scikit','linux','pytorch','theano','caffe'
                   ,'Matlab','perl','deep learning','nlp','apache','mapreduce','aws','azure','container','kafka','cassandra', 'c\++'
                   ,'julia','jupyter','nltk','tableau','power bi','sas','pandas','git','hive','impala','agile','machine learning','bash'
                   ,'natural language','oracle','cloud','flask','golang','optimization','c#','opencv','computer vision','api','jira'
                   ,'unix','bash','docker','keras', 'qlik','gcp','scrum', 'airflow','.net','d3.js'
                  ]
    
    for skill in skills_list:
        df[skill] = np.where(df['Requirements'].str.contains(skill),skill,0)
    
    return df, skills_list

In [20]:
df_clean = job_cat(df_clean)

In [21]:
df_clean, skills_list = skill_cat(df_clean)

In [22]:
def skill_extract(df, skills_llist):
    
    sum_elements = [f"df['{col}']" for col in skills_list]
    to_eval = "+ '_' + ".join(sum_elements)
    df['New_Skills'] = eval(to_eval)
    clean_list = "(0|_0|0_|_0_)"
    df['New_Skills']=df['New_Skills'].str.replace(clean_list, "")
    df['New_Skills']=df['New_Skills'].str.replace('_', '%')
    
    return df

In [25]:
df3 = skill_extract(df_clean, skills_list) # 504 ms

Unnamed: 0,Job_Id,Emp_Type,Job_Title,Company,Date_Posted,Year_Experience,Seniority,Category,Requirements,Min_Salary,...,unix,docker,keras,qlik,gcp,scrum,airflow,.net,d3.js,New_Skills
1,MCF-2020-0045289,Contract Permanent,Scientist (Machine Intellection) I2R,A*STAR RESEARCH ENTITIES,Posted 27 Mar 2020,,Professional,Sciences Laboratory R&D,protein nodes discovering in cancer biomarker ...,4500.0,...,0,0,0,0,0,0,0,0,0,python%java%tensorflow%pytorch%theano%caffe%pe...
2,MCF-2020-0045282,Contract Permanent,Scientist (Machine Intellection) I2R,A*STAR RESEARCH ENTITIES,Posted 27 Mar 2020,,Professional,Sciences Laboratory R&D,protein nodes discovering in cancer biomarker ...,4500.0,...,0,0,0,0,0,0,0,0,0,python%java%tensorflow%pytorch%theano%caffe%pe...
5,MCF-2020-0072360,Contract Permanent,Senior Research Engineer (Computer Science),NANYANG TECHNOLOGICAL UNIVERSITY,Posted 26 Mar 2020,5,Manager,Information Technology,data mining from massive data from documents t...,5500.0,...,0,0,0,0,0,0,0,0,0,python%c\++%machine learning
6,MCF-2020-0072378,Contract Permanent,Research Engineer (Deep Learning 2.0) I2R,A*STAR RESEARCH ENTITIES,Posted 26 Mar 2020,,Fresh/entry level,Sciences Laboratory R&D,minimum bachelor in computer science statistic...,3900.0,...,0,0,0,0,0,0,0,0,0,python%java%theano%caffe%perl%deep learning%c\...
7,MCF-2020-0071969,Contract Permanent,Senior Research Engineer (Computer Science),NANYANG TECHNOLOGICAL UNIVERSITY,Posted 26 Mar 2020,5,Manager,Information Technology,design and implement natural language processi...,5500.0,...,0,0,0,0,0,0,0,0,0,python%nlp%c\++%natural language
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,MCF-2020-0059189,Permanent,Software Engineer (Mapping / localization),ST ENGINEERING LAND SYSTEMS LTD.,Posted 10 Mar 2020,1,Professional,Information Technology,design and implement mapping localization and ...,4000.0,...,0,0,0,0,0,0,0,0,0,%linux%c\++%computer vision
440,MCF-2020-0058649,Permanent,Senior Software Engineer (C++ / Python),MANPOWER STAFFING SERVICES (SINGAPORE) PTE LTD,Posted 09 Mar 2020,5,Senior Executive,Information Technology,design real-time distributed applications for ...,4000.0,...,0,0,0,0,0,0,0,0,0,python%sql%linux%c\++%git%agile
444,MCF-2020-0052725,Permanent,Systems Engineer,NUTONOMY ASIA PTE. LTD.,Posted 03 Mar 2020,4,Executive,Information Technology,define model and simulate the behavior of the ...,7000.0,...,0,0,0,0,0,0,0,0,0,python%c\++%computer vision
446,MCF-2020-0050095,Permanent,DevOps Engineer,UCARE.IO PTE. LTD.,Posted 28 Feb 2020,2,Senior Executive,Information Technology,develop and lead the code deployment process. ...,4000.0,...,0,0,0,0,0,0,0,0,0,python%java%perl%c\++%git%cloud%c#%jira


In [27]:
#word count function
def word_count(df_col):

    str_counts = 0
    sum_str = 0

    for i in range (len(df_col)):    
        str_counts = len(df_col[i].split())
        sum_str = sum_str + str_counts

    print(sum_str)

In [28]:
#number of word found in Requirements column before clean
word_count(df_clean['Requirements'])

78225


In [29]:
def freq_words(word_count, features):

    num_word = np.asarray(word_count.sum(axis=0)).reshape(-1)
    most_count = num_word.argsort()[::-1]
    key_word = pd.Series(num_word[most_count], 
                           index=features[most_count])

    return key_word

In [30]:
def stop_word_fil(df):
    
    #stop words were added to filter some generic recurring business terms.
    stop = stopwords.words('english')
    stop += ['regret','shortlisted', 'candidates','notified','etc', 'take', 'hands','added','able','writting',
             'year','years','least', 'related','using', 'and', 'ability','work','skills','advantage','written'
            'develop','good','team','design','knowledge','experience','following','areas', 'ability','and','in','to']
    
    #most common words for requirements
    cvt = CountVectorizer(lowercase=True, strip_accents='unicode',max_features=80000, min_df=1, max_df=0.9,
                          stop_words=stop, ngram_range=(1,2))
    vect_word = cvt.fit_transform(df['Requirements'])
    features = np.array(cvt.get_feature_names()) 

    key_word = freq_words(vect_word, features)
    
    #update stop_word with common words
    new_stop = key_word[key_word<5].index
    stop.extend(new_stop)
    
    pat = r'\b(?:{})\b'.format('|'.join(stop))
    df['Requirements'] = df['Requirements'].str.replace(pat, " ")
    df['Requirements'] = df['Requirements'].map(lambda x: x.strip())
    df['Requirements'] = df['Requirements'].replace({' +':" "},regex=True)
    
    return df

In [61]:
#data_df = stop_word_fil(df_clean)
data_df = df_clean.copy()
#number of word found in Requirements column after clean
word_count(data_df['Requirements'])

78225


In [62]:
data_df.shape

(453, 82)

In [63]:
data_df = data_df.drop(columns=data_df[skills_list].columns)
data_df['New_Skills'] = data_df['New_Skills'].str.replace("\\", "")
data_df.shape

(453, 25)

In [34]:
#save output file skills_list

data_df.to_csv('{}JOB_DATA_v14.csv'.format(output_path), index=False, encoding='utf-8')

## Machine Learning

In [29]:
data_df = data_df.dropna(subset=['Year_Experience']).reset_index(drop=True)
data_df.shape

(352, 71)

In [30]:
#Dummified Seniority columns to use as predictor features
seniority_cat=data_df['Seniority'].str.get_dummies()
emp_cat=data_df['Emp_Type'].str.get_dummies()
df = pd.concat([data_df, seniority_cat, emp_cat], axis=1)
df.shape

(352, 83)

In [31]:
#CountVectorizer job requirements columns
#min_df=10, max_df=0.6, max_features=500, ngram_range=(1,2), f1 = 0.61
stop_ml = stopwords.words('english')

stop_ml += ['possess','work','back','ability','communication', 'participate', 'like', 'tools','distributed',
            'contribute','proven','engage','understanding','excellent', 'teams','experienced', 'familiarity',
            'partners', 'study', 'well','preferably','user','field','experience','english', 'level','sets',
            'delivery','implementation','relevant','state','exposure','record','problems','define','open',
            'proficient','understand']

cvec = CountVectorizer(lowercase=True, strip_accents='unicode',
                       max_features=500, min_df=10, max_df=0.6, 
                       stop_words=stop_ml,ngram_range=(1,2))
cvec.fit(df['Requirements'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.6, max_features=500, min_df=10,
                ngram_range=(1, 2), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [32]:
#creating predictor and target dataset
model_data = df.drop(columns=['Job_Title','Company','Seniority','Category','Min_Salary',
                               'Max_Salary','Emp_Type','Avg_Salary', 'Job_Id', 'Date_Posted'])

nlp = pd.DataFrame(cvec.transform(model_data['Requirements']).todense(),columns=cvec.get_feature_names())

senior_nlp = pd.concat([model_data, nlp], axis=1)

In [33]:
X = model_data.drop(columns=['Salary_range','Requirements'])
X_nlp = senior_nlp.drop(columns=['Salary_range','Requirements'])
y = senior_nlp['Salary_range'].values
X.shape

(352, 71)

In [34]:
#Data with dummified 'seniority' and countvectorized 'requirements'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
#Data with countvectorized 'requirements' only
X_train_nlp, X_test_nlp, y_train_nlp, y_test_nlp = train_test_split(X_nlp, y, test_size=0.3, random_state=42)

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
dtc = DecisionTreeClassifier(max_depth=4, random_state=42)
dtc = dtc.fit(X_train , y_train)

dtc1 = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc_nlp = dtc1.fit(X_nlp , y)

In [38]:
print(classification_report(y_test,dtc.predict(X_test),target_names=["Low", "Med", "High"]))

              precision    recall  f1-score   support

         Low       0.83      0.70      0.76        69
         Med       0.33      0.18      0.24        11
        High       0.43      0.69      0.53        26

    accuracy                           0.64       106
   macro avg       0.53      0.52      0.51       106
weighted avg       0.68      0.64      0.65       106



In [39]:
pd.DataFrame(confusion_matrix(y_test,dtc.predict(X_test)),
             index=['Actual Low','Actual Med', 'Actual High'],
             columns=['Pred Low','Pred Med','Pred High'])

Unnamed: 0,Pred Low,Pred Med,Pred High
Actual Low,48,4,17
Actual Med,2,2,7
Actual High,8,0,18


In [40]:
print(classification_report(y,dtc_nlp.predict(X_nlp),target_names=["Low", "Med", "High"]))

              precision    recall  f1-score   support

         Low       0.93      0.82      0.87       226
         Med       0.65      0.70      0.68        37
        High       0.58      0.73      0.64        89

    accuracy                           0.78       352
   macro avg       0.72      0.75      0.73       352
weighted avg       0.81      0.78      0.79       352



In [41]:
pd.DataFrame(confusion_matrix(y,dtc_nlp.predict(X_nlp)),
             index=['Actual Low','Actual Med', 'Actual High'],
             columns=['Pred Low','Pred Med','Pred High'])

Unnamed: 0,Pred Low,Pred Med,Pred High
Actual Low,185,3,38
Actual Med,1,26,10
Actual High,13,11,65


In [42]:
features = np.array(X_nlp.columns)
dt_coefs = pd.DataFrame({'coef':dtc_nlp.feature_importances_, 'abs coef':abs(dtc_nlp.feature_importances_)},index=features)
dt_coefs = dt_coefs.sort_values('coef',ascending=False)
dt_coefs.head(15)

Unnamed: 0,coef,abs coef
Year_Experience,0.405679,0.405679
Sciences,0.098422,0.098422
java scala,0.083142,0.083142
Non-executive,0.066195,0.066195
phd,0.061634,0.061634
python java,0.046882,0.046882
algorithms,0.043642,0.043642
science engineering,0.043628,0.043628
opencv,0.037369,0.037369
science computer,0.033332,0.033332


In [43]:
from sklearn.tree.export import export_text

tree_rules = export_text(dtc, feature_names=list(X_train))

In [44]:
from sklearn import tree

#clf = tree.DecisionTreeClassifier(max_leaf_nodes=n)
#clf_ = clf.fit(X, data_y)

feature_names = X_nlp.columns
class_name = dtc_nlp.classes_.astype(str)

def output_pdf(model):
    from sklearn import tree
    from sklearn.externals.six import StringIO
    import pydot_ng as pydot
    dot_data = StringIO()
    tree.export_graphviz(model, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=class_name,
                         filled=True, rounded=True,
                         special_characters=True,
                          node_ids=1,)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("{}DT.pdf".format(output_path))

output_pdf(dtc_nlp)