In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, file_list in os.walk('/kaggle/input'):
    for file_name in file_list:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-science-job-salaries/ds_salaries.csv


In [2]:
import warnings

# Ignore every FutureWarning from this point on (presumably to keep library
# deprecation notices out of the cell outputs).
warnings.simplefilter(action="ignore", category=FutureWarning)

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted
from category_encoders import OneHotEncoder
from sklearn.linear_model import Ridge  # noqa F401

import seaborn as sns

In this notebook, I will apply linear regression to a single categorical feature to predict salary.

# Prepare Data
## 1. Import data

In [3]:
# IP1: Basic wrangle function
def wrangle(filepath):
    """Load the salaries CSV and strip the columns the analysis does not use.

    Parameters
    ----------
    filepath : str or file-like
        Location of the CSV; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the "Unnamed: 0" and "salary" columns.

    Raises
    ------
    KeyError
        If either of the two dropped columns is missing from the file.
    """
    # "Unnamed: 0" is irrelevant; "salary" goes because salary in USD is used instead
    unwanted = ["Unnamed: 0", "salary"]
    return pd.read_csv(filepath).drop(columns=unwanted)

In [4]:
# IP2: Load the dataset through wrangle, then preview the first rows
df = wrangle(filepath='../input/data-science-job-salaries/ds_salaries.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,USD,150000,US,50,US,L


## 2. Explore

Let's explore the data a little to see which features we should use for our model.

In [5]:
# Summarise each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           607 non-null    int64 
 1   experience_level    607 non-null    object
 2   employment_type     607 non-null    object
 3   job_title           607 non-null    object
 4   salary_currency     607 non-null    object
 5   salary_in_usd       607 non-null    int64 
 6   employee_residence  607 non-null    object
 7   remote_ratio        607 non-null    int64 
 8   company_location    607 non-null    object
 9   company_size        607 non-null    object
dtypes: int64(3), object(7)
memory usage: 47.5+ KB


As the dataset is already clean, there's not much left to do
except explore the categorical features.

In [6]:
# Count the distinct values in each object-dtype column
df.select_dtypes("object").nunique()

experience_level       4
employment_type        4
job_title             50
salary_currency       17
employee_residence    57
company_location      50
company_size           3
dtype: int64

For this notebook, I will use the "experience_level" feature to predict salary

## 3. Split

In [7]:
# Target vector: the salary expressed in USD
target = "salary_in_usd"
y_train = df[target]

# Feature matrix: a single categorical column
features = ["experience_level"]
X_train = df[features]

# Build Model
## 1. Baseline
We have finished preparing the data. Now comes the model-building part.
The first step is to build a naive model and measure its MAE.
After that, we will fit our own model and calculate its MAE as well, to see whether it beats the naive model.

In [8]:
# Naive baseline: predict the mean training salary for every row and score it
# with MAE, giving the number the real model has to beat.
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
# The label used to read "Mean apt salary", a leftover from a housing notebook.
print("Mean salary: ", y_mean)
print("Baseline MAE: ", mean_absolute_error(y_train, y_pred_baseline))

Mean apt salary:  112297.86985172982
Baseline MAE:  52584.27064261269


Our naive model has predicted that the average salary (in USD) is around 112,297 USD. And the mean absolute error of the model is about 52,584 USD.

## 2. Iterate

Now, we need to make our model.

Our feature is categorical, which LinearRegression can't process directly, so we need to transform it into numerical data using the OneHotEncoder transformer.

After that, because one-hot encoding every category produces columns that always sum to 1 (and are therefore collinear with the intercept), using the regularized Ridge instead of LinearRegression is a safe bet.

In [9]:
# Two-step pipeline: one-hot encode the categorical feature, then fit a Ridge
# regression. The step names match the ones make_pipeline would generate.
model = Pipeline(
    steps=[
        ("onehotencoder", OneHotEncoder(use_cat_names=True)),
        ("ridge", Ridge()),
    ]
)
model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['experience_level'], use_cat_names=True)),
                ('ridge', Ridge())])

## 3. Evaluate
We now need to evaluate the performance of this newly created model to see whether it beats our naive model.

In [10]:
# Score the fitted pipeline on the same rows it was trained on
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_true=y_train, y_pred=y_pred_training)
print("Training MAE: ", round(mae_training, 2))

Training MAE:  43589.48


Excellent, the training MAE of our model is 43,589 USD while that of the naive model is 52,584 USD. We beat it by about 9,000 USD.

# Communicate Results

In [11]:
# 1. Pull the fitted intercept and coefficients out of the pipeline's Ridge step
ridge_step = model.named_steps["ridge"]
intercept = ridge_step.intercept_.astype(float)
coefficients = ridge_step.coef_.astype(float)
print("intercept:", intercept)
print("list of coefficients:",coefficients)

intercept: 121381.32542574524
list of coefficients: [-33229.2631574   17174.62946901 -59066.79367939  75121.4273678 ]


In [12]:
# 2. Exporting features name
encoder = model.named_steps["onehotencoder"]
# Prefer the scikit-learn style accessor when the installed category_encoders
# provides it; `get_feature_names` is the legacy spelling and is kept only as
# a fallback for older releases.
if hasattr(encoder, "get_feature_names_out"):
    # Normalise to a plain list of str so the printed result looks the same
    # whether the encoder returns a list or an array.
    feature_names = [str(name) for name in encoder.get_feature_names_out()]
else:
    feature_names = encoder.get_feature_names()
print(feature_names)

['experience_level_MI', 'experience_level_SE', 'experience_level_EN', 'experience_level_EX']


In [13]:
# 3. Pair each coefficient with the encoded column it belongs to
feat_imp = pd.Series(data=coefficients, index=feature_names)
feat_imp.head()

experience_level_MI   -33229.263157
experience_level_SE    17174.629469
experience_level_EN   -59066.793679
experience_level_EX    75121.427368
dtype: float64

In [14]:
# 4. Print the fitted equation: the intercept first, then one term per encoded column
print(f"salary (USD) = {intercept.round(2)}")
for column_name, weight in feat_imp.items():
    print(f"+ ({round(weight, 2)} * {column_name})")

salary (USD) = 121381.33
+ (-33229.26 * experience_level_MI)
+ (17174.63 * experience_level_SE)
+ (-59066.79 * experience_level_EN)
+ (75121.43 * experience_level_EX)
