In [1]:
import glob
import re
from collections import Counter

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
from mlxtend.preprocessing import DenseTransformer
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression, SGDRegressor, BayesianRidge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, Imputer, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor

# Show up to 200 rows and 200 columns when a dataframe is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [2]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects part of its input by key.

    ``transform`` returns ``data_dict[self.keys]``, so ``keys`` may be
    anything the input object accepts inside ``[]`` (in this notebook: a
    single column name or a list of column names of a dataframe).
    """

    def __init__(self, keys):
        # Stored unchanged under the same name as the constructor argument.
        self.keys = keys

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, data_dict):
        """Look up ``self.keys`` in ``data_dict`` and return the result."""
        selected = data_dict[self.keys]
        return selected

In [3]:
# Read "data-prep.csv" and discard its "Unnamed: 0" column in one step:
jobs_df = pd.read_csv("data-prep.csv").drop(columns="Unnamed: 0")

# Preview the first rows to confirm the file loaded successfully:
jobs_df.head()
# - Should have 1167 columns

Unnamed: 0,details,experience,cat_construction,cat_consultant,cat_pharmaceutical,cat_media - journalism - newspaper,cat_facilities,cat_science,cat_health care,cat_sales,cat_manufacturing,cat_insurance,cat_qa - quality control,cat_banking,cat_accounting,cat_biotech,cat_telecommunications,cat_admin - clerical,cat_general labor,cat_installation - maint - repair,cat_business opportunity,cat_supply chain,cat_real estate,cat_nonprofit - social services,cat_human resources,cat_business development,cat_transportation,cat_skilled labor - trades,cat_nurse,cat_purchasing - procurement,cat_executive,cat_strategy - planning,cat_training,cat_education,cat_entry level,cat_other,cat_legal,cat_retail,cat_restaurant - food service,cat_hospitality - hotel,cat_inventory,cat_engineering,cat_finance,cat_government,cat_professional services,cat_information technology,cat_customer service,cat_legal admin,cat_automotive,cat_warehouse,cat_marketing,cat_design,cat_research,cat_management,cat_government - federal,cat_grocery,cat_general business,cat_distribution - shipping,indus_building materials,indus_semiconductor,indus_oil refining - petroleum - drilling,indus_printing - publishing,indus_consumer products,indus_mortgage,indus_other,indus_computer hardware,indus_employment - recruiting - staffing,indus_wireless,indus_government - civil service,indus_hotel - resort,indus_airline - aviation,indus_other great industries,indus_transportation,indus_environmental,indus_managed care,indus_social services,indus_credit - loan - collections,indus_hospitality,indus_travel,indus_automotive - motor vehicles - parts,indus_security,indus_restaurant,indus_food,indus_exercise - fitness,indus_advertising,indus_homebuilding,indus_construction,indus_public relations,indus_merchandising,indus_banking - financial services,indus_telecommunications,indus_newspaper,indus_art - photography - journalism,indus_securities,indus_internet - ecommerce,indus_fashion - apparel - 
textile,indus_retail,indus_insurance,indus_real estate - property mgt,indus_manufacturing,...,comp_amsted rail,comp_general dynamics information technology,comp_real staffing,comp_trinity system office,"comp_cgs administrators, llc",comp_biotelemetry,comp_accruepartners,comp_sodexo,comp_3coast,deg_high school,deg_doctorate,deg_4 year degree,deg_graduate degree,deg_none,deg_2 year degree,travel_road warrior,travel_none,travel_some,travel_up to 50%,travel_up to 25%,travel_negligible,state_TN,state_MD,state_UT,state_OR,state_PA,state_TX,state_MA,state_VA,state_OH,state_ME,state_DE,state_GU,state_MI,state_VI,state_MS,state_MO,state_MN,state_IL,state_GA,state_IN,state_KS,state_MT,state_ID,state_IA,state_KY,state_RI,state_CO,state_OK,state_AL,state_VT,state_PR,state_WA,state_CA,state_AK,state_AS,state_AR,state_CT,state_AZ,state_NH,state_NJ,state_NM,state_SC,state_WI,state_NC,state_ND,state_NE,state_LA,state_NY,state_HI,state_DC,state_WY,state_SD,state_FL,state_NV,state_WV,salary_min,salary_avg,salary_max,title_intern,title_lead,title_principal,title_chief,title_senior,title_junior,title_data,title_ml,title_database,title_ai,title_business,title_software,title_sql,title_financial,title_analyst,title_intelligence,title_engineer,title_developer,title_scientist,title_administrator,title_architect
0,"Senior Data Architect (AWS) Optomi, in partne...",11.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130000.0,165000.0,200000.0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,Contract Only Job Summary: This position wi...,6.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Azure Data Architect-8 Months-90% Remote-100-...,3.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200800.0,230920.0,261040.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Ref ID: 03510-9501197370Classification: Datab...,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96384.0,108432.0,120480.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,Highly desired experience with managing and o...,13.5,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,150600.0,150600.0,150600.0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [4]:
# Set aside the rows whose 'salary_min' is missing in a dataframe of their own:
missing_salary_mask = jobs_df["salary_min"].isnull()
no_salary_df = jobs_df[missing_salary_mask].reset_index(drop=True)
no_salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1718 entries, 0 to 1717
Columns: 1167 entries, details to title_architect
dtypes: float64(4), int64(1162), object(1)
memory usage: 15.3+ MB


In [5]:
# Keep only the rows that do have a 'salary_min' value, renumbering the index from 0:
has_salary_mask = jobs_df["salary_min"].notnull()
jobs_df = jobs_df[has_salary_mask].reset_index(drop=True)
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089 entries, 0 to 1088
Columns: 1167 entries, details to title_architect
dtypes: float64(4), int64(1162), object(1)
memory usage: 9.7+ MB


In [6]:
# List the names of the columns that contain at least one null value:
column_has_null = jobs_df.isna().any()
jobs_df.columns[column_has_null].tolist()

# - Will try different methods of imputation in pipeline

['experience']

In [7]:
# Split the column names into targets (name contains "salary") and features (all others):
salary_columns, non_salary_columns = [], []
for column in jobs_df.columns.values:
    if "salary" in column:
        salary_columns.append(column)
    else:
        non_salary_columns.append(column)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    jobs_df[non_salary_columns],
    jobs_df[salary_columns],
    test_size=0.3,
    random_state=42
)

In [8]:
# Feature columns other than the free-text "details" column:
non_text_columns = [name for name in non_salary_columns if not name == "details"]

In [9]:
# POSSIBLE PIPELINE PARAMETERS:
# Vectorizers: CountVectorizer(), TfidfVectorizer()
# Imputers: Imputer(), MICEImputer()
# Standardizers: StandardScaler(), MinMaxScaler()
# Dimensionality reducers: VarianceThreshold(), SelectKBest(), PCA(random_state=42)
# Regression estimators: 
# - LinearRegression(), 
# - BayesianRidge(),
# - Lasso(),
# - RandomForestRegressor(random_state=42),
# - MultiOutputRegressor(GradientBoostingRegressor(random_state=42), n_jobs=-1)
# - MultiOutputRegressor(AdaBoostRegressor(random_state=42), n_jobs=-1)
# - MultiOutputRegressor(SGDRegressor(random_state=42), n_jobs=-1)

# However in the interest of time, I'll just be forming pipelines of the combinations:
# 2 Vectorizers: CountVectorizer(), TfidfVectorizer()
# 1 Imputer: Imputer()
# 1 Standardizer: StandardScaler()
# 2 Dimensionality reducers: PCA(random_state=42), none
# 5 Regressors: 
# - LinearRegression(),
# - Lasso(),
# - RandomForestRegressor(random_state=42),
# - MultiOutputRegressor(GradientBoostingRegressor(random_state=42), n_jobs=-1)
# - MultiOutputRegressor(AdaBoostRegressor(random_state=42), n_jobs=-1)


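# - Total of 2 * 2 * 5 = 20 base pipelines; each one is built below for 2 n-gram ranges and 2 stop-word settings, giving 20 * 2 * 2 = 80 pipelines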
# - Will not gridsearch optimize model hyperparameters

# Registry that the pipeline-building functions below append to.
reg_pipelines = list()

# One instance of each text vectorizer, both with the same document-frequency bounds.
vectorizers = [
    vectorizer_class(min_df = 0.01, max_df = 0.50)
    for vectorizer_class in (CountVectorizer, TfidfVectorizer)
]

# The random forest is used directly; every other estimator is wrapped in
# MultiOutputRegressor (the targets are the several salary columns).
regressors = [RandomForestRegressor(random_state=42)]
regressors.extend(
    MultiOutputRegressor(base_estimator, n_jobs=-1)
    for base_estimator in (
        LinearRegression(),
        Lasso(),
        GradientBoostingRegressor(random_state=42),
        AdaBoostRegressor(random_state=42)
    )
)

# Grid-search optimization to tune hyperparameters was taking a long time, so will just iterate manually:
def reg_with_count_vec_and_pca(ngram_range, stop_words):
    """Append one CountVectorizer + PCA pipeline per regressor to ``reg_pipelines``.

    Each pipeline vectorizes the 'details' column, joins the result with the
    ``non_text_columns`` features, imputes missing values, reduces everything
    to 15 principal components and ends in a regressor.

    Reads the module-level ``regressors`` and ``non_text_columns`` and extends
    the module-level ``reg_pipelines`` list in place.

    Parameters
    ----------
    ngram_range : tuple
        Forwarded to ``CountVectorizer(ngram_range=...)``.
    stop_words : str or None
        Forwarded to ``CountVectorizer(stop_words=...)``.

    Returns
    -------
    None
    """
    for each_reg in regressors:
        text_features = Pipeline([
            ('extract_text', ItemSelector(keys='details')),
            ('vectorize', CountVectorizer(min_df = 0.01, max_df = 0.50,
                                          ngram_range=ngram_range, stop_words=stop_words)),
            ('to_dense', DenseTransformer())
        ])
        reg_pipelines.append(Pipeline([
            ('encode', FeatureUnion([
                ('tokenize', text_features),
                ('combine', ItemSelector(keys=non_text_columns))
            ])),
            ('impute', Imputer()),
            # A ('standardize', StandardScaler()) step was tried here and is left disabled.
            ('reduce_dim', PCA(n_components=15, random_state=42)),
            # clone() gives every pipeline its own unfitted estimator. Without it
            # all pipelines share the instances in `regressors`, so fitting one
            # pipeline would overwrite the fitted model of another.
            ('estimate', clone(each_reg))
        ]))
    
def reg_with_tfidf_vec_and_pca(ngram_range, stop_words):
    """Append one TfidfVectorizer + PCA pipeline per regressor to ``reg_pipelines``.

    Each pipeline vectorizes the 'details' column, joins the result with the
    ``non_text_columns`` features, imputes missing values, reduces everything
    to 15 principal components and ends in a regressor.

    Reads the module-level ``regressors`` and ``non_text_columns`` and extends
    the module-level ``reg_pipelines`` list in place.

    Parameters
    ----------
    ngram_range : tuple
        Forwarded to ``TfidfVectorizer(ngram_range=...)``.
    stop_words : str or None
        Forwarded to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    None
    """
    for each_reg in regressors:
        text_features = Pipeline([
            ('extract_text', ItemSelector(keys='details')),
            ('vectorize', TfidfVectorizer(min_df = 0.01, max_df = 0.50,
                                          ngram_range=ngram_range, stop_words=stop_words)),
            ('to_dense', DenseTransformer())
        ])
        reg_pipelines.append(Pipeline([
            ('encode', FeatureUnion([
                ('tokenize', text_features),
                ('combine', ItemSelector(keys=non_text_columns))
            ])),
            ('impute', Imputer()),
            # A ('standardize', StandardScaler()) step was tried here and is left disabled.
            ('reduce_dim', PCA(n_components=15, random_state=42)),
            # clone() gives every pipeline its own unfitted estimator. Without it
            # all pipelines share the instances in `regressors`, so fitting one
            # pipeline would overwrite the fitted model of another.
            ('estimate', clone(each_reg))
        ]))

def reg_with_count_vec_no_pca(ngram_range, stop_words):
    """Append one CountVectorizer pipeline (no PCA) per regressor to ``reg_pipelines``.

    Each pipeline vectorizes the 'details' column with the vocabulary capped
    at 2500 features, joins the result with the ``non_text_columns`` features,
    imputes missing values and ends in a regressor.

    Reads the module-level ``regressors`` and ``non_text_columns`` and extends
    the module-level ``reg_pipelines`` list in place.

    Parameters
    ----------
    ngram_range : tuple
        Forwarded to ``CountVectorizer(ngram_range=...)``.
    stop_words : str or None
        Forwarded to ``CountVectorizer(stop_words=...)``.

    Returns
    -------
    None
    """
    for each_reg in regressors:
        text_features = Pipeline([
            ('extract_text', ItemSelector(keys='details')),
            ('vectorize', CountVectorizer(min_df = 0.01, max_df = 0.50,
                                          ngram_range=ngram_range, stop_words=stop_words,
                                          max_features=2500)),
            ('to_dense', DenseTransformer())
        ])
        reg_pipelines.append(Pipeline([
            ('encode', FeatureUnion([
                ('tokenize', text_features),
                ('combine', ItemSelector(keys=non_text_columns))
            ])),
            ('impute', Imputer()),
            # The 'standardize' and 'reduce_dim' steps are left out of this variant.
            # clone() gives every pipeline its own unfitted estimator. Without it
            # all pipelines share the instances in `regressors`, so fitting one
            # pipeline would overwrite the fitted model of another.
            ('estimate', clone(each_reg))
        ]))
    
def reg_with_tfidf_vec_no_pca(ngram_range, stop_words):
    """Append one TfidfVectorizer pipeline (no PCA) per regressor to ``reg_pipelines``.

    Each pipeline vectorizes the 'details' column with the vocabulary capped
    at 2500 features, joins the result with the ``non_text_columns`` features,
    imputes missing values and ends in a regressor.

    Reads the module-level ``regressors`` and ``non_text_columns`` and extends
    the module-level ``reg_pipelines`` list in place.

    Parameters
    ----------
    ngram_range : tuple
        Forwarded to ``TfidfVectorizer(ngram_range=...)``.
    stop_words : str or None
        Forwarded to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    None
    """
    for each_reg in regressors:
        text_features = Pipeline([
            ('extract_text', ItemSelector(keys='details')),
            ('vectorize', TfidfVectorizer(min_df = 0.01, max_df = 0.50,
                                          ngram_range=ngram_range, stop_words=stop_words,
                                          max_features=2500)),
            ('to_dense', DenseTransformer())
        ])
        reg_pipelines.append(Pipeline([
            ('encode', FeatureUnion([
                ('tokenize', text_features),
                ('combine', ItemSelector(keys=non_text_columns))
            ])),
            ('impute', Imputer()),
            # The 'standardize' and 'reduce_dim' steps are left out of this variant.
            # clone() gives every pipeline its own unfitted estimator. Without it
            # all pipelines share the instances in `regressors`, so fitting one
            # pipeline would overwrite the fitted model of another.
            ('estimate', clone(each_reg))
        ]))


# param_grid={'encode__tokenize__vectorize__max_features': [None, 2500, 5000],
#             'encode__tokenize__vectorize__ngram_range': [(1, 1), (1, 2)],
#             'encode__tokenize__vectorize__stop_words': [None, 'english']}

In [None]:
# Empty the registry in place first, so re-running this cell rebuilds the
# pipelines instead of appending a second copy of each.
del reg_pipelines[:]

# Call every builder for each stop-word setting and n-gram range, in the order:
# (1, 1)/None, (1, 2)/None, (1, 1)/'english', (1, 2)/'english'.
# Each call appends one pipeline per entry in `regressors`.
pipeline_builders = [
    reg_with_count_vec_and_pca,
    reg_with_tfidf_vec_and_pca,
    reg_with_count_vec_no_pca,
    reg_with_tfidf_vec_no_pca,
]
for build_pipelines in pipeline_builders:
    for stop_words in (None, 'english'):
        for ngram_range in ((1, 1), (1, 2)):
            build_pipelines(ngram_range=ngram_range, stop_words=stop_words)

len(reg_pipelines)

80

In [None]:
# Convert the targets to arrays once, not on every iteration.
y_train_values = np.array(y_train)
y_test_values = np.array(y_test)

# Test-set score of each fitted pipeline, in the same order as `reg_pipelines`.
# Keeping them in a list means the results gathered so far are still available
# if this long-running loop is interrupted.
reg_scores = []
for index, pipeline in enumerate(reg_pipelines):
    print("*** Pipe #{} ***".format(index))
    pipeline.fit(X_train, y_train_values)
    reg_scores.append(pipeline.score(X_test, y_test_values))
    print("Score: {}".format(reg_scores[-1]))
    print("")

*** Pipe #0 ***
Score: 0.151325315007

*** Pipe #1 ***
Score: 0.222697003512

*** Pipe #2 ***
Score: 0.22270045105

*** Pipe #3 ***
Score: 0.220088488622

*** Pipe #4 ***
Score: 0.178375763239

*** Pipe #5 ***
Score: 0.143274642347

*** Pipe #6 ***
Score: 0.199560416175

*** Pipe #7 ***
Score: 0.199563116266

*** Pipe #8 ***
Score: 0.209628472021

*** Pipe #9 ***
Score: 0.16160339899

*** Pipe #10 ***
Score: 0.174131857509

*** Pipe #11 ***
Score: 0.216336908753

*** Pipe #12 ***
Score: 0.216341295969

*** Pipe #13 ***
Score: 0.120762741501

*** Pipe #14 ***
Score: 0.12580545743

*** Pipe #15 ***
Score: 0.170045965227

*** Pipe #16 ***
Score: 0.228737962052

*** Pipe #17 ***
Score: 0.228742338085

*** Pipe #18 ***
Score: 0.237634312284

*** Pipe #19 ***
Score: 0.158358011413

*** Pipe #20 ***
Score: 0.219504679431

*** Pipe #21 ***
Score: 0.237448668524

*** Pipe #22 ***
Score: 0.237487844952

*** Pipe #23 ***
Score: 0.243258104935

*** Pipe #24 ***
Score: 0.176724752327

*** Pipe #25 

Process PoolWorker-267:
Process PoolWorker-266:
Process PoolWorker-264:
Traceback (most recent call last):
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
Traceback (most recent call last):
    self.run()
Traceback (most recent call last):
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
Process PoolWorker-265:
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
Traceback (most recent call last):
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
    self.run()
  File "/Users/jasminetan/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*s

In [None]:
# reg_pipelines[2].fit(X_train, np.array(y_train))
# reg_pipelines[2].score(X_test, np.array(y_test))

In [None]:
# reg_pipelines[3].fit(X_train, np.array(y_train))
# reg_pipelines[3].score(X_test, np.array(y_test))

In [None]:
# reg_pipelines[4].fit(X_train, np.array(y_train))
# reg_pipelines[4].score(X_test, np.array(y_test))

In [None]:
# reg_pipelines[1].fit(X_train, np.array(y_train))
# reg_pipelines[1].score(X_test, np.array(y_test))

In [None]:
# reg_gridsearch_results = {
#     "best_estimator": [],
#     "best_score": [],
#     "best_params" : [],
#     "score_on_test": []
# }

# for each_pipe in reg_pipelines:
#     grid = GridSearchCV(each_pipe,  
#                         param_grid=param_grid,
#                         cv=2,
#                         n_jobs=-1,
#                         verbose=2)
#     grid.fit(X_train, y_train)
#     reg_gridsearch_results["best_estimator"].append(grid.best_estimator_)
#     reg_gridsearch_results["best_score"].append(grid.best_score_)
#     reg_gridsearch_results["best_params"].append(grid.best_params_)
#     reg_gridsearch_results["score_on_test"].append(grid.score(X_test, y_test))

In [None]:
# pd.DataFrame(reg_gridsearch_results)