In [182]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn_pandas import DataFrameMapper
import sklearn.preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import string

In [183]:
# Read the cleaned job postings and discard the first column of the file.
jobs = pd.read_csv('./datasets/jobsclean.csv')
jobs = jobs.drop(jobs.columns[0], axis='columns')

In [184]:
# Remove every row whose salary is missing, then store salary as float.
missing_salary = jobs['salary_'].isnull()
jobs = jobs.drop(jobs.index[missing_salary], axis=0)
jobs['salary_'] = jobs['salary_'].astype(float)

In [185]:
# Fill missing experience values with the column median.
experience_median = jobs['experience_'].median()
jobs['experience_'] = jobs['experience_'].fillna(experience_median)

In [186]:
# One-hot encode `state` (first level dropped); the encoded `state_*` columns
# replace the original column and are appended after the remaining columns.
jobs = pd.get_dummies(jobs, columns=['state'], prefix='state', drop_first=True)

In [187]:
# One-hot encode `company` with a `coy` prefix (first level dropped) and swap
# the encoded columns in for the original column.
dummy = pd.get_dummies(jobs['company'], prefix='coy', drop_first=True)
jobs = pd.concat([jobs.drop('company', axis=1), dummy], axis=1)

In [188]:
# Frequency of each job title. The Series method is used because the
# top-level `pd.value_counts` helper is deprecated in recent pandas.
jobs['title'].value_counts()

data engineer            231
ml engineer              196
data architect           189
business analyst         146
business intelligence    130
stats                    109
stats programmer          79
data scientist            59
data analyst              44
ml scientist              43
data developer            15
data admin                 7
ai scientist               3
Name: title, dtype: int64

In [190]:
# Rename the target column to upper case.
# NOTE(review): presumably this avoids a clash with a lower-case 'title' token
# column produced by the TF-IDF step later on — confirm.
jobs = jobs.rename(columns={'title': 'TITLE'})

In [191]:
# Binary target: 1 for 'data scientist' postings, 0 for every other title.
# A single vectorised comparison cast to int gives an integer column; writing
# 0/1 into the string column with two .loc assignments leaves it as object
# dtype, which scikit-learn classifiers may reject as an unknown label type.
jobs['TITLE'] = (jobs['TITLE'] == 'data scientist').astype(int)

In [193]:
from sklearn.utils import resample

# Separate majority (TITLE == 0) and minority (TITLE == 1) classes
majority = jobs[jobs['TITLE'] == 0]
minority = jobs[jobs['TITLE'] == 1]

# Upsample the minority class with replacement until it matches the majority
# class. The target size is taken from the data rather than hard-coded, so the
# classes stay balanced if the input file changes.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=0,  # reproducible results
)

# Combine majority class with upsampled minority class.
# NOTE(review): the train/test split happens after this step, so copies of the
# same minority row can land in both train and test and inflate test scores.
# Consider splitting first and upsampling only the training portion.
jobs_upsampled = pd.concat([majority, minority_upsampled])

In [195]:
# Check the class balance after upsampling.
jobs_upsampled['TITLE'].value_counts()

1    1192
0    1192
Name: TITLE, dtype: int64

In [196]:
# English stop words from NLTK. `stop` stays a list, as before; a set copy is
# used for the membership test so each token lookup is O(1) instead of a scan
# of the whole list.
stop = stopwords.words('english')
stop_lookup = set(stop)

# Drop stop words from each posting's whitespace-separated tokens.
# Matching is case-sensitive, exactly as before.
jobs_upsampled['details'] = jobs_upsampled['details'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

In [197]:
# TF-IDF over unigrams and bigrams, keeping terms found in at least 5 documents.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5)
X = vectorizer.fit_transform(list(jobs_upsampled['details']))

# `get_feature_names()` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out()`; use the new method when it exists and fall
# back to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Dense frame with one column per term (default 0..n-1 row index).
counts = pd.DataFrame(X.toarray(), columns=feature_names)

# The raw text column is no longer needed once it has been vectorised.
jobs_upsampled = jobs_upsampled.drop('details', axis=1)

In [198]:
# Order the term columns by their weight in row 0 (highest first) and show
# the first ten rows.
counts.sort_values(by=0, axis=1, ascending=False).head(10)

Unnamed: 0,data,data engineer,big data,practices,big,help,investment,emerging,optimize,multiple,...,experience following,experience financial,experience finance,experience field,experience fast,experience extracting,experience extensive,experience exposure,experience expertise,zookeeper plus
0,0.361145,0.143248,0.141828,0.134581,0.128244,0.118214,0.103249,0.098092,0.087238,0.086023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.207489,0.126194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.151579,0.035457,0.0,0.024984,0.0,0.0,0.0,0.0,0.0,0.021293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.215725,0.093716,0.301559,0.0,0.272677,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047605
4,0.216153,0.131463,0.0,0.030877,0.0,0.027122,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.153409,0.388761,0.057736,0.0,0.052206,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.209683,0.091091,0.0,0.0,0.0,0.02819,0.049242,0.0,0.0,0.027351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.216285,0.087695,0.260478,0.061792,0.264972,0.0,0.0,0.0,0.0,0.052662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.275255,0.20926,0.103593,0.0,0.093671,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.197574,0.05462,0.0,0.076973,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
jobs_upsampled = jobs_upsampled.reset_index(drop=True)

In [200]:
# Append the TF-IDF term columns to the right of the existing features;
# rows are matched on index label.
jobs_upsampled = pd.concat((jobs_upsampled, counts), axis='columns')

In [201]:
# Peek at the first five rows of the combined feature table.
jobs_upsampled.head(n=5)

Unnamed: 0,TITLE,experience_,salary_,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_FL,...,youwhat youll,yrs,yrs experience,zero,zero debt,zip,zip code,zones,zookeeper,zookeeper plus
0,0,5.0,115000.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,6.0,175000.0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,3.0,576045.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.054034,0.056355,0.0,0.0,0.0
3,0,5.0,185000.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047605,0.047605
4,0,5.0,115000.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
# Every column except the target is used as a predictor.
predictors = [column for column in jobs_upsampled.columns if column != 'TITLE']

# X is the feature matrix, selected with the list of predictor column names.
X = jobs_upsampled[predictors]

# y is the target vector. Cast to int so the labels are a numeric array even
# if TITLE is stored as an object column (scikit-learn may reject object
# labels as an unknown label type).
y = jobs_upsampled['TITLE'].astype(int).values

# 70-30 train/test split (test_size=0.3), seeded for reproducibility.
# `train_test_split` is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

In [203]:
# (rows, columns) of the feature matrix.
np.shape(X)

(2384, 20101)

In [204]:
# Shape of the target vector.
np.shape(y)

(2384,)

In [208]:
# Standard Scaling
from sklearn.preprocessing import StandardScaler

# Initialize the scaler.
ss = StandardScaler()

# Learn the per-feature mean and standard deviation from the training data
# only, and scale the training data with them.
Xs = ss.fit_transform(X_train.values)

# Scale the test data with the training statistics. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test would be scaled
# differently and test-set statistics would leak into the evaluation.
Xst = ss.transform(X_test.values)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# L1-penalised logistic regression; the regularisation strength is chosen by
# 5-fold cross-validation over a grid of 100 C values, scored on accuracy.
# random_state is fixed because the liblinear solver shuffles the data, so an
# unseeded fit is not reproducible from run to run.
lrcv = LogisticRegressionCV(
    penalty='l1',
    Cs=100,
    cv=5,
    solver='liblinear',
    scoring='accuracy',
    random_state=0,
)
lrcv.fit(Xs, y_train)

In [None]:
# `lr` is never defined in this notebook, so `lr.fit(...)` raises NameError on
# a fresh kernel. Reuse the cross-validated estimator fitted in the previous
# cell: with the default refit=True it has already been refitted on the full
# training set using the selected C, so fitting again is unnecessary.
model = lrcv

# Predicted class labels and class probabilities for the scaled test set.
predictions = model.predict(Xst)
y_pp = model.predict_proba(Xst)