In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor 
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [2]:
# Read the training data and the test features from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('training.csv', 'test_features.csv'))

In [3]:
# Display the dimensions of the raw training frame as (rows, columns).
train.shape

(1000000, 10)

In [4]:
# Keep only the rows with a strictly positive salary, then drop the `jobId` column.
has_positive_salary = train['salary'] > 0
clean_df = train.loc[has_positive_salary].drop(columns='jobId')
clean_df.shape

(999995, 9)

Building Data Frames for the models

In [5]:
# Split the cleaned training data into the model inputs and the regression target.
feature_df = clean_df.drop(columns='salary')
target = clean_df['salary']

# Set the test-set `jobId` values aside before removing the column from the features.
# The guard keeps this cell re-runnable: once `jobId` has been dropped from `test`,
# executing `test['jobId']` a second time would raise a KeyError.
if 'jobId' in test.columns:
    job_id = test['jobId']
    test = test.drop(columns='jobId')

#### Building a ColumnTransformer
One Hot Encoding the categorical variables and passing the numerical values through.

In [6]:
# One-hot encode the categorical columns and pass every other column through unchanged.
# handle_unknown='ignore' encodes a category that was not present when the encoder was
# fitted as an all-zero block instead of raising at transform time, so a rare value that
# only appears in a validation fold or in the held-out test features cannot abort a run.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     ['companyId', 'jobType', 'degree', 'major', 'industry']),
    remainder='passthrough',
)

#### Baseline Model

In [27]:
# Baseline: score a handful of constant predictions (including the truncated mean salary)
# by their mean absolute error against the true salaries.
avg = np.mean(target)
print(F"The average salary is ${avg:.2f}k")
print()
constant = [50,100,int(avg),150,200]
for con in constant:
    # np.full allocates the constant prediction directly as an array, rather than building
    # a Python list with one element per training row on every iteration.
    baseline_pred = np.full(len(target), con)
    mae = mean_absolute_error(target, baseline_pred)
    print(F"The MAE predicting only ${con}k is {mae:.5f}")

The average salary is $116.06k

The MAE predicting only $50k is 66.61364
The MAE predicting only $100k is 32.91118
The MAE predicting only $116k is 31.00767
The MAE predicting only $150k is 43.14076
The MAE predicting only $200k is 84.69683


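The baseline models, which compute the mean absolute error between the target and a constant prediction, are not very accurate: even the best constant tried (the average salary, about $116k) gives an MAE of roughly 31.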

### Linear Regression

In [9]:
# Linear regression on the transformed features, evaluated with k-fold cross-validation.
n_folds_lr = 10
model_lr = make_pipeline(transformer, LinearRegression())
mae_lr = cross_val_score(model_lr, feature_df, target, cv = n_folds_lr, n_jobs = -1, scoring='neg_mean_absolute_error')
# The 'neg_mean_absolute_error' scorer yields negated errors, so the mean is negated to
# report a positive MAE; the standard deviation is unaffected by the sign.
mean_mae_lr = -1 * np.mean(mae_lr)
cv_std_lr = np.std(mae_lr)
print('The {}-fold validation average MAE is {} and the standard devition is {}'.format(n_folds_lr, mean_mae_lr, cv_std_lr))

The 10-fold validation average MAE is 15.846171610897287 and the standard devition is 0.04764638962380082


In [46]:
def sampled_linear(df, target, transformer_col, cv, frac, random_state=6):
    """
    Cross-validate a linear regression on a random subsample of a dataframe.

    The model is a pipeline of a column transformer (one-hot encoding
    ``transformer_col`` and passing every other column through) followed by
    ``LinearRegression``. The mean and standard deviation of the
    cross-validated mean absolute error are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the feature columns and the target column.
    target : str
        Name of the target column in ``df``.
    transformer_col : list of str
        Columns to one-hot encode; all remaining columns are passed through.
    cv : int
        Number of folds for cross validation.
    frac : float
        Fraction of the rows of ``df`` to sample, with ``0 < frac <= 1``.
    random_state : int, default 6
        Seed used when drawing the sample.

    Returns
    -------
    None
        The result is reported with ``print`` only.

    Raises
    ------
    ValueError
        If ``frac`` is not in the interval (0, 1].
    """
    # Fail early with a clear message; an empty or over-sized sample would otherwise
    # surface as a less obvious error further down.
    if not 0 < frac <= 1:
        raise ValueError(F"frac must be in the interval (0, 1], got {frac!r}")

    # handle_unknown='ignore': on small samples a category can be missing from a training
    # fold yet present in its validation fold, which the default setting rejects.
    encoder = OneHotEncoder(handle_unknown='ignore')
    transformer = make_column_transformer((encoder, transformer_col), remainder = 'passthrough')

    sample = df.sample(frac = frac, random_state = random_state)
    features = sample.drop(columns = target)
    # Separate name so the `target` column-name argument is not overwritten by the Series.
    y = sample[target]

    model = make_pipeline(transformer, LinearRegression())
    mae = cross_val_score(model, features, y, cv = cv, n_jobs = -1, scoring='neg_mean_absolute_error')
    # The scorer returns negated MAE values, so negate the mean to report a positive error.
    mean_mae = -1*np.mean(mae)
    cv_std = np.std(mae)
    print(F"With a sample size of {frac*100}% and a {cv}-fold validation, the average MAE is {mean_mae:.5f} and the standard deviation is {cv_std:.5f}")

In [47]:
# Columns handed to sampled_linear for one-hot encoding; here the full data set (frac=1) is used.
trans = [
    'companyId',
    'jobType',
    'degree',
    'major',
    'industry',
]
sampled_linear(clean_df, 'salary', trans, cv=10, frac=1)

With a sample size of 100% and a 10-fold validation, the average MAE is 15.84486 and the standard deviation is 0.03639


In [48]:
# Repeat the 10-fold evaluation on increasingly large samples of the cleaned data.
fraction = [0.25, 0.5, 0.75, 1]
for frac in fraction:
    sampled_linear(clean_df, 'salary', trans, cv=10, frac=frac)

With a sample size of 25.0% and a 10-fold validation, the average MAE is 15.86691 and the standard deviation is 0.06285
With a sample size of 50.0% and a 10-fold validation, the average MAE is 15.83361 and the standard deviation is 0.05270
With a sample size of 75.0% and a 10-fold validation, the average MAE is 15.84573 and the standard deviation is 0.04286
With a sample size of 100% and a 10-fold validation, the average MAE is 15.84486 and the standard deviation is 0.03639


### Gradient Boosting

In [10]:
# Gradient boosting with default hyper-parameters, scored by 5-fold cross-validated MAE.
gbr = make_pipeline(transformer, GradientBoostingRegressor())
mae_gbr = cross_val_score(
    gbr,
    feature_df,
    target,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
# Scores come back as negated MAE; flip the sign of the mean, the spread needs no change.
mean_mae_gbr = -mae_gbr.mean()
cv_std_gbr = mae_gbr.std()
print('The {}-fold validation average MAE is {} and the standard devition is {}'.format(5, mean_mae_gbr, cv_std_gbr))

The 5-fold validation average MAE is 15.65602765339402 and the standard devition is 0.038356460001813746


In [11]:
# Randomised hyper-parameter search over the gradient-boosting pipeline.
# Both the estimator and the search are seeded so that the sampled candidates and the
# fitted models are the same on every run.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt; the search
# fits 10 candidates x 2 folds on the full training data, so expect it to be slow.
rand_gbr_pipe = make_pipeline(transformer, GradientBoostingRegressor(random_state = 6))
param_grid = {
    'gradientboostingregressor__n_estimators': [50,75,100,150,200],
    'gradientboostingregressor__max_depth': [3,5,10,15,20],
    'gradientboostingregressor__learning_rate': [.01,.02,.04,.06,.08],
}
rand_gbr = RandomizedSearchCV(rand_gbr_pipe, param_grid, cv = 2, n_iter = 10, n_jobs = -1,
                              scoring='neg_mean_absolute_error', random_state = 6)
rand_gbr.fit(feature_df, target)
# Despite its name, this holds the best parameter combination found, not a score.
rand_gbr_accuracy = rand_gbr.best_params_
rand_gbr_accuracy

KeyboardInterrupt: 