In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 

%matplotlib inline
pd.options.display.max_columns = 50

### Raw Data

In [2]:
data = pd.read_csv('USA_Job_Indeed_v4.csv')

In [3]:
data.head()

Unnamed: 0,jobId,stateProvince,companyId,city,avgOverallRating,numReviews,normTitle,normTitleCategory,descriptionCharacterLength,descriptionWordCount,experienceRequired,estimatedSalary,supervisingJob,licenseRequiredJob,educationRequirement,postwday,averageWordLength,totalJobAgeDays,sumClicks,sumLocalClicks,sumLocalClickRate,WeightAverageEmploymentRate,WeightAverageIndex,postMonth,class,maxIncreaseRate,minIncreaseRate
0,job0000000,TX,company00000,Dallas-Fort Worth,0.0,0,home health aide,mednurse,834,165,2.0,69300.0,0.0,1.0,,Sun,5.054545,99,37,4,0.108108,4.124521,-1.253637,6,2,-0.333333,-0.416667
1,job0000001,MA,company00001,Boston,4.0,551,senior food service worker,food,3297,566,0.0,22700.0,0.0,0.0,High school,Wed,5.825088,97,54,17,0.314815,4.551215,-0.266086,6,2,0.928571,-0.777778
2,job0000002,FL,company00002,Melbourne,0.0,0,field service engineer,install,2465,373,5.0,66000.0,0.0,0.0,Higher education,Fri,6.608579,95,31,14,0.451613,4.906892,-0.798484,7,2,2.428571,2.428571
3,job0000003,MA,company00003,Plymouth,3.8,73,cashier,retail,5310,828,1.0,14900.0,0.0,0.0,High school,Sat,6.413043,96,56,40,0.714286,4.551215,-0.266086,7,2,1.0,-0.608696
4,job0000005,NV,company00005,Las Vegas,3.9,114,nursing student,mednurse,2918,458,3.0,51800.0,0.0,1.0,Higher education,Tue,6.371179,98,28,3,0.107143,5.936776,-1.360961,7,2,0.8,0.8


In [4]:
state_dummies = pd.get_dummies(data['stateProvince'])
title_dummies = pd.get_dummies(data['normTitleCategory'])
education_dummies = pd.get_dummies(data['educationRequirement'])
postwday_dummies = pd.get_dummies(data['postwday'])
postMonth_dummies = pd.get_dummies(data['postMonth'])

In [41]:
estimatedSalary_log = np.log(data['estimatedSalary']+0.0001)
numReviews_log = np.log(data['numReviews']+0.0001)
descriptionCharacterLength_log = np.log(data['descriptionCharacterLength']+0.0001)

In [51]:
selected_data = data[['avgOverallRating', 'experienceRequired','WeightAverageIndex', 'maxIncreaseRate', 'minIncreaseRate', 'class']]

In [52]:
whole_train = pd.concat([selected_data, descriptionCharacterLength_log, estimatedSalary_log, numReviews_log, title_dummies, education_dummies, postMonth_dummies], axis=1)     

#### get the class

In [53]:
X_train, X_test = train_test_split(whole_train, test_size=0.2, random_state=4990)

In [54]:
train = X_train.loc[X_train['class'].isin([1, 4])]

In [55]:
train.head()

Unnamed: 0,avgOverallRating,experienceRequired,WeightAverageIndex,maxIncreaseRate,minIncreaseRate,class,descriptionCharacterLength,estimatedSalary,numReviews,accounting,admin,agriculture,arch,arts,aviation,care,childcare,construction,customer,driver,education,engchem,engcivil,engelectric,engid,...,sports,tech,techhelp,techinfo,techsoftware,therapy,transport,uncategorized,veterinary,warehouse,0,High school,Higher education,1,2,3,4,5,6,7,8,9,10,11,12
282842,0.0,0.0,-1.253637,11.5,-0.928571,1,8.326517,10.098232,-9.21034,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
10482,0.0,3.0,-1.010264,3.5,-0.777778,1,7.612337,11.239804,-9.21034,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
294212,0.0,5.0,-0.861379,0.332491,-0.833333,4,7.74457,9.883285,-9.21034,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
266889,4.0,4.0,3.338029,5.333333,-0.892857,1,8.18172,11.521885,4.276668,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
251592,4.0,0.5,-1.253637,4.166667,-0.84375,1,7.641084,9.820106,4.007335,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


### Add standardization

In [101]:
# from sklearn.preprocessing import StandardScaler
# train_std = StandardScaler().fit_transform(train.dropna())
# train = pd.DataFrame(train_std, columns=train.columns)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import preprocessing

In [47]:
train_X = train.drop(['class'], axis = 1)
train_X.dropna(inplace=True)
train_Y = train.dropna()['class'].values

test = X_test.loc[X_test['class'].isin([1, 4])]
test_X = test.drop(['class'], axis = 1)
test_X.dropna(inplace=True)
test_Y = test.dropna()['class'].values

train_X_sc = preprocessing.scale(train_X)
test_X_sc = preprocessing.scale(test_X)

### Grid Search for parameter

In [15]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [16]:
from sklearn.linear_model import SGDClassifier

In [49]:
alpha_list = np.arange(0.01,0.05,0.005)
clf__l1_ratio_list = np.arange(0.2,0.35,0.02)

In [50]:
# this are the parameters that we want the grid search to scan through
params = {'clf__alpha': alpha_list, 
          'clf__l1_ratio':clf__l1_ratio_list} # the format is: 'clf_' + parameter

# use grid search cross-validation to fin the best parameters
model = GridSearchCV(estimator = SGDClassifier(loss="log", penalty="elasticnet"), 
                     param_grid = dict(alpha = alpha_list, l1_ratio = clf__l1_ratio_list),
                     n_jobs=-1, cv=5, verbose=1, scoring='accuracy') 
# start fit
model.fit(train_X_sc, train_Y)


Fitting 5 folds for each of 64 candidates, totalling 320 fits


KeyboardInterrupt: 

In [35]:
print(model.best_score_)
print(model.best_params_)

0.7611518857783561
{'alpha': 0.001, 'l1_ratio': 0.29999999999999993}


In [36]:
bestParam = model.best_params_

In [37]:
bestModel = SGDClassifier(loss="log", penalty="elasticnet", alpha = bestParam['alpha'], l1_ratio = bestParam['l1_ratio'])
bestModel.fit(train_X, train_Y)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.29999999999999993,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [38]:
bestModel.coef_

array([[ 1.49454073e+01,  0.00000000e+00, -1.40450860e+01,
        -8.49161215e+00, -1.85403023e+02,  1.16280142e+01,
         6.52743097e+01, -6.39341901e+00,  1.81942722e+00,
         1.19009893e+01,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.43693932e-01,
         3.95328860e+00,  0.00000000e+00,  3.56409345e+00,
         0.00000000e+00, -7.43976528e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -3.79711222e-01,
         0.00000000e+00,  0.00000000e+00, -2.16349453e+00,
         0.00000000e+00,  9.13734393e-01, -2.59842530e+00,
         0.00000000e+00,  0.00000000e+00, -1.45756991e+00,
         5.55697064e-01,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00, -9.03367862e-02,  0.00000000e+00,
         7.11467047e-01, -4.31826318e+00, -4.45217604e-01,
         0.00000000e+00,  0.00000000e+00, -1.96980378e-01,
         0.00000000e+00,  0.00000000e+00,  4.98167816e-01,
         0.00000000e+00, -9.44079065e-01, -2.46590460e+0

In [39]:
print(list(train_X)[0])
print(list(train_X)[2])
print(list(train_X)[3])
print(list(train_X)[4])
print(list(train_X)[5])
print(list(train_X)[6])
print(list(train_X)[-6])
print(list(train_X)[-1])

avgOverallRating
experienceRequired
WeightAverageIndex
maxIncreaseRate
minIncreaseRate
estimatedSalary
7
12


In [160]:
list(train_X)

['avgOverallRating',
 'descriptionCharacterLength',
 'experienceRequired',
 'WeightAverageIndex',
 'maxIncreaseRate',
 'estimatedSalary',
 'numReviews',
 'accounting',
 'admin',
 'agriculture',
 'arch',
 'arts',
 'aviation',
 'care',
 'childcare',
 'construction',
 'customer',
 'driver',
 'education',
 'engchem',
 'engcivil',
 'engelectric',
 'engid',
 'engmech',
 'finance',
 'food',
 'hospitality',
 'hr',
 'install',
 'insurance',
 'legal',
 'management',
 'manufacturing',
 'marketing',
 'math',
 'meddental',
 'meddr',
 'media',
 'medinfo',
 'mednurse',
 'medtech',
 'military',
 'mining',
 'personal',
 'pharmacy',
 'project',
 'protective',
 'realestate',
 'retail',
 'sales',
 'sanitation',
 'science',
 'service',
 'socialscience',
 'sports',
 'tech',
 'techhelp',
 'techinfo',
 'techsoftware',
 'therapy',
 'transport',
 'uncategorized',
 'veterinary',
 'warehouse',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12]

In [40]:
bestModel.score(test_X_sc, test_Y)

0.4884323523968166

In [None]:
trainFinal_X = 

In [None]:
logit = SGDClassifier(loss="log", penalty="elasticnet")
logit.fit(trainFinal_X, train_Y)