In [51]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler, LabelEncoder, RobustScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

## Model Building

### Defining the Data Preprocessing pipeline 
1. Categorical Features
  * Impute missing values with Mode
  * One Hot Encoded
  
2. Numerical Features
  * Impute missing values with Median
  * Scaling with RobustScaler 
  
Note that even though our training data does not have missing values, there is no guarantee that real-life data will be complete as well. Defining a default imputation strategy is useful for handling such edge cases, especially when the model is in a production environment.

In [2]:
# Load the cleaned Glassdoor job postings (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/glassdoor_job_cleaned2.csv')

In [10]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' makes categories that were not seen during
# fit encode as all zeros instead of raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill gaps with the median, then scale with RobustScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

In [21]:
# Feature columns are listed explicitly. The dtype-based selection below is
# kept for reference only and is not used:
# numerical = [i for i in df.select_dtypes(include=['number']).columns.values.tolist() if i not in ['min_salary','max_salary', 'avg_salary']]
# categorical = df.select_dtypes(include=['object', 'category']).columns.values.tolist()

# Regression target (kept as a one-element list).
target_label = ['avg_salary']

# Categorical features fed to the model.
# Columns deliberately left out:
#   'Job Title', 'Job Description', 'Company Name',
#   'Location', 'Headquarters'   - high cardinality
#   'Salary Estimate'            - dependent variable
#   'Industry'                   - highly correlated to sector, high cardinality
#   'company_txt'                - high cardinality, parsed from company name
categorical = [
    'Size',
    'Type of ownership',
    'Sector',
    'Revenue',
    'job_state',
    'job_simp',  # derived from Job title
    'seniority',
]

# Numerical features fed to the model.
# 'Founded' is left out; 'comp_age' is derived from it.
numerical = [
    'Rating',
    'hourly',
    'employer_provided',
    'is_loc_hq',
    'comp_age',  # from Founded
    'python_yn',
    'r_yn',
    'sas_yn',
    'excel_yn',
    'pp_yn',
    'sql_yn',
    'spark_yn',
    'aws_yn',
    'desc_len',
]

### Train/Validation Split

In [27]:
# Features (numerical columns first, then categorical) and target.
# target_label is a list, so df_y is a one-column DataFrame rather than a Series.
df_x, df_y = df[numerical + categorical], df[target_label]

# 80/20 shuffled split with a fixed seed so the partition is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [62]:
# Sanity check: shape of the training target once flattened to 1-D.
np.asarray(y_train).ravel().shape

(373,)

### Simple Model - Linear Regression

In [34]:
# Send each column group through its own sub-pipeline; any column not named
# in `categorical` or `numerical` is discarded (remainder='drop').
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical),
        ('num', numeric_transformer, numerical),
    ],
    remainder='drop',
)

In [52]:
# Build one end-to-end pipeline (preprocessing + estimator) per model.
#
# Each pipeline receives its own clone of `preprocessor`: make_pipeline stores
# the step objects it is given without copying them, so handing the same
# instance to both pipelines would make them share a single fitted transformer
# (fitting one pipeline would silently refit the preprocessing of the other).

# make pipeline for linear regression
pipe_reg = make_pipeline(
    clone(preprocessor),
    LinearRegression(),
)

# make pipeline for RF
# random_state is pinned (same seed as the train/validation split) so the
# forest, and therefore its reported scores, are reproducible across runs.
pipe_rf = make_pipeline(
    clone(preprocessor),
    RandomForestRegressor(random_state=42),
)

In [63]:
# Fit both pipelines on the training split.
# The linear model is given the target as-is (one-column DataFrame); the
# random forest is given a flattened 1-D copy of the same target.
model_reg = pipe_reg.fit(X=x_train, y=y_train)
model_rf = pipe_rf.fit(X=x_train, y=np.ravel(y_train))

In [64]:
# Linear regression: predict on the training and validation splits
reg_pred_train, reg_pred_val = (
    model_reg.predict(split) for split in (x_train, x_val)
)

# Root mean squared error (square root of the MSE) for each split
print('RMSE on train data: ', pow(mean_squared_error(y_train, reg_pred_train), 0.5))
print('RMSE on val data: ', pow(mean_squared_error(y_val, reg_pred_val), 0.5))

RMSE on train data:  28.280581353151963
RMSE on val data:  45.033430823692356


In [65]:
# Random forest: predict on the training and validation splits
rf_pred_train, rf_pred_val = (
    model_rf.predict(split) for split in (x_train, x_val)
)

# Root mean squared error (square root of the MSE) for each split
print('RMSE on train data: ', pow(mean_squared_error(y_train, rf_pred_train), 0.5))
print('RMSE on val data: ', pow(mean_squared_error(y_val, rf_pred_val), 0.5))

RMSE on train data:  14.018554355561745
RMSE on val data:  49.9013186651543


In [50]:
# Mean 3-fold cross-validated score of the linear-regression pipeline on the
# training split. The scorer is the *negated* MAE, so values closer to zero
# are better.
cross_val_score(
    model_reg,
    x_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=3,
).mean()

-32.564728211839565

In [41]:
# Validation score of the linear-regression pipeline.
# NOTE(review): despite the variable name this is not a classification
# accuracy; `score` on a regressor pipeline reports R^2. The name is kept
# because later cells may refer to it.
accuracy_score = pipe_reg.score(X=x_val, y=y_val)
print(accuracy_score)

0.3780281977737837


In [44]:
# Inspect the object returned by make_pipeline (exploratory check only).
type(pipe_reg)

sklearn.pipeline.Pipeline