In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file beneath the Kaggle input directory and print its full path,
# one per line, in the order os.walk visits them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Comparing Random Forest and Keras Models on a Categorical Dataset with Continuous Target Variable

## Introduction

In this project, we are comparing the initial performance of a Random Forest model to that of a Keras neural network. We will evaluate the performance of both models using common regression metrics such as the Root Mean Squared Error (RMSE), R-squared, and Out-Of-Bag (OOB) score. By comparing the performance of these models, we aim to determine which one is better suited for the given task and potentially identify areas for improvement in both models.

## Dataset
This dataset is sourced from listings on ai-jobs.net and was updated within a month of this notebook's creation. It can be found at https://www.kaggle.com/datasets/arnabchaki/data-science-salaries-2023

In [None]:
# Read the raw salary listings into a DataFrame and preview the last five rows.
salaries_df = pd.read_csv(
    filepath_or_buffer='../input/data-science-salaries-2023/ds_salaries.csv',
)
salaries_df.tail(5)

## Dropping salary and salary_currency 
We can use the salary_in_usd instead of these two since they are not standardized for all data points 

In [None]:
# Remove the local-currency salary and its currency code in a single pass;
# salary_in_usd stays behind as the target.
salaries_df = salaries_df.drop(columns=['salary', 'salary_currency'])

## Converting categorical data into dummy coding format for model

To use categorical data in our regression model, we need to convert it into numerical format. One way to do this is by using dummy coding. 

We will create dummy variables for the following categorical features:
- `job_title`
- `employment_type`
- `employee_residence`
- `company_location`
- `company_size`
- `experience_level`

Dummy coding involves creating a new binary column for each unique value in a categorical feature. If a row belongs to a certain category, the corresponding binary column will have a value of 1, otherwise it will have a value of 0. 

We will use the `pandas` `get_dummies` function to create our dummy variables. 




In [None]:
# Column -> prefix mapping for the one-hot encoded features. Keeping the pairing in
# one dict avoids two parallel lists that have to be kept in the same order.
dummy_prefixes = {
    'experience_level': 'exp',
    'employment_type': 'time',
    'job_title': 'job',
    'employee_residence': 'resd',
    'company_location': 'loc',
    'company_size': 'size',
}
dummy_df = pd.get_dummies(salaries_df[list(dummy_prefixes)], prefix=dummy_prefixes)
dummy_df.head()

## Clearing categorical features
Now that we have dummy coded our features, we no longer need the original categorical columns in the dataset.

In [None]:
# Strip the raw categorical columns first, then append their one-hot replacements
# column-wise so only numeric features (plus the target) remain.
encoded_source_cols = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
filtered_salaries_df = pd.concat(
    [salaries_df.drop(columns=encoded_source_cols), dummy_df],
    axis=1,
)
filtered_salaries_df.columns

# Random forest model 
Then, we train a random forest regressor model with 800 estimators and out-of-bag scoring enabled (`oob_score=True`), which gives us another performance metric in addition to RMSE and R-squared.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    filtered_salaries_df.drop('salary_in_usd', axis=1),
    filtered_salaries_df['salary_in_usd'],
    random_state=1,
)

# Random forest hyperparameters
n_estimators = 800  # number of trees in the forest
max_depth = 300   # maximum depth of each decision tree
min_samples_split = 2 # minimum number of samples required to split an internal node
min_samples_leaf = 1   # minimum number of samples required to be at a leaf node

# Create the Random Forest model with the desired hyperparameters.
# oob_score=True makes the fitted model expose rf.oob_score_ as an extra metric.
rf = RandomForestRegressor(
    n_estimators=n_estimators,
    oob_score=True,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=42
)
rf.fit(X_train, y_train)

# Get feature importances, most important first
importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
importances = importances.sort_values('Importance', ascending=False)

# Print the top 30 most important features
print(importances.head(30))

print("OOB score:", rf.oob_score_)

# Predict salaries on the test set and calculate RMSE and R-squared.
# RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecated and then removed.
predictions = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r_squared = r2_score(y_test, predictions)
print('RMSE:', rmse)
print('R-squared:', r_squared)

# Performance

## OOB Score:

The out-of-bag (OOB) score is an estimate of the model's performance on unseen data. It is calculated by randomly selecting a subset of the training data to build each decision tree in the random forest model. The remaining data that is not used to build the tree is called the "out-of-bag" data. The model can then be evaluated on this out-of-bag data to estimate its performance on unseen data. A higher OOB score indicates better performance of the model on unseen data.

## Root Mean Squared Error (RMSE):

The root mean squared error (RMSE) is a measure of the difference between the predicted and actual values of the target variable. It is calculated as the square root of the average of the squared differences between the predicted and actual values. RMSE is often used as a metric for regression models. A lower RMSE score indicates better performance of the model in predicting the target variable.

## R-squared:

The R-squared score (also known as the coefficient of determination) is a statistical measure that represents the proportion of variance in the dependent variable that is explained by the independent variables in the model. It typically ranges from 0 to 1, where 0 means the model does not explain any variance in the dependent variable, and 1 means the model explains all the variance in the dependent variable (on held-out data it can also be negative, which means the model predicts worse than always guessing the mean). R-squared is often used as a metric to evaluate the goodness of fit of regression models. A higher R-squared score indicates better performance of the model in explaining the variance in the dependent variable.

## Summary:

Based on the scores, we can evaluate the performance of the random forest model as follows:

- OOB Score: The model has an OOB score of 0.410685359490081, indicating that it may not generalize well to unseen data.

- RMSE: The RMSE score of the model is 0.40192550900112234, which means that, on average, the model's predictions are off by about $50,663.29. This score can be considered moderate to high, depending on the context.

- R-squared: The R-squared score of the model is 0.37415958375830816, indicating that the model explains about 37.4% of the variance in the target variable. This score can be considered moderate, as it suggests that the model is not able to explain a large portion of the variance in the target variable.

It was found that changing the shown hyperparameters did not make much of a difference in performance even at extreme differences in tuning. Overall, while the model has some predictive power, it may not generalize well to unseen data in this circumstance, and there is still room for improvement in terms of reducing the error and increasing the amount of variance in the target variable explained by the model.

# Keras Neural Network
The model has three dense layers with varying numbers of neurons and activation functions. It also includes dropout regularization to reduce overfitting. The model is compiled with mean squared error as the loss function and Adam as the optimizer. The model is then trained on training data and validated on testing data for 500 epochs with a batch size of 500. Finally, the root mean squared error is evaluated on the testing data and displayed. A learning curve plot of the training and validation loss is also generated using the history object obtained from training the model.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras import layers

# Keras needs a single homogeneous numeric array. Newer pandas versions emit bool
# columns from get_dummies, which mix with the integer columns, so cast the
# feature frames to float32 explicitly before handing them to the network.
X_train_nn = X_train.astype('float32')
X_test_nn = X_test.astype('float32')

# Define the model architecture
model = Sequential()
# Input width is taken from the data so the cell keeps working if the number of
# dummy columns changes.
model.add(Dense(128, input_shape=(X_train_nn.shape[1],), activation='relu'))
# Dropout layers must be added to the model; constructing them alone has no effect.
model.add(layers.Dropout(rate=0.2))
model.add(Dense(64, activation='sigmoid'))
model.add(layers.Dropout(rate=0.2))
model.add(Dense(1, activation='linear'))


# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train, using the test split as validation data so both curves can be plotted
history = model.fit(
    X_train_nn, y_train,
    validation_data=(X_test_nn, y_test),
    batch_size=500,
    epochs=500,
)

# evaluate() returns the compiled loss, which is MSE; take the square root so the
# printed value really is an RMSE and is comparable with the random forest result.
mse = model.evaluate(X_test_nn, y_test, verbose=0)
rmse = np.sqrt(mse)
print('RMSE:', rmse)


# Show the learning curves (training vs. validation MSE per epoch)
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot();


## Performance 
It seems that dummy encoding all categorical variables is not an optimal way to handle categorical data; possible improvements include ordinal encoding for ordered features like experience level and company size. Upon review, it seems that the currency a person is paid in also correlates with salary, so that has been added back into the dataset for further analysis.

# Improving performance via feature engineering 
- dummy coding currency type
- ordinal coding company size and employee experience

In [None]:
# Reload the raw data; this time only the local-currency amount is discarded and
# the currency code is kept as a feature.
salaries_df = pd.read_csv('../input/data-science-salaries-2023/ds_salaries.csv').drop(columns='salary')

# Nominal (unordered) features and the prefix used for their one-hot columns.
# company_size and experience_level are left in place here.
nominal_prefixes = {
    'employment_type': 'time',
    'job_title': 'job',
    'employee_residence': 'resd',
    'company_location': 'loc',
    'salary_currency': 'curr',
}
nominal_cols = list(nominal_prefixes)

dummy_df = pd.get_dummies(salaries_df[nominal_cols], prefix=nominal_prefixes)

# Replace the raw nominal columns with their one-hot encoded counterparts
filtered_salaries_df = pd.concat(
    [salaries_df.drop(columns=nominal_cols), dummy_df],
    axis=1,
)
filtered_salaries_df.columns

## Ordinal encoding 
Here we are going to ordinal encode ordered features of company size and experience level to create a better dataset for the models 

In [None]:
from sklearn.preprocessing import OrdinalEncoder


# Encode company size using the explicit category order S, M, L, store the result
# as `size_encoded`, and remove the original text column.
encoder = OrdinalEncoder(categories=[['S', 'M', 'L']])
size_codes = encoder.fit_transform(filtered_salaries_df[['company_size']])
filtered_salaries_df = (
    filtered_salaries_df
    .assign(size_encoded=size_codes[:, 0])
    .drop(columns='company_size')
)
filtered_salaries_df.head()

In [None]:
# Encode experience level using the explicit category order EN, MI, SE, EX, store
# the result as `exp_encoded`, and remove the original text column.
encoder = OrdinalEncoder(categories=[['EN', 'MI', 'SE','EX']])
experience_codes = encoder.fit_transform(filtered_salaries_df[['experience_level']])
filtered_salaries_df = (
    filtered_salaries_df
    .assign(exp_encoded=experience_codes[:, 0])
    .drop(columns='experience_level')
)
filtered_salaries_df.head()

## Random Forest Model 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the re-engineered data into training and testing sets (same seed as before
# so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    filtered_salaries_df.drop('salary_in_usd', axis=1),
    filtered_salaries_df['salary_in_usd'],
    random_state=1,
)

# Random forest hyperparameters
n_estimators = 2000  # number of trees in the forest
max_depth = 300   # maximum depth of each decision tree
min_samples_split = 10 # minimum number of samples required to split an internal node
min_samples_leaf = 3   # minimum number of samples required to be at a leaf node

# Create the Random Forest model with the desired hyperparameters.
# oob_score=True makes the fitted model expose rf.oob_score_ as an extra metric.
rf = RandomForestRegressor(
    n_estimators=n_estimators,
    oob_score=True,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=42
)
rf.fit(X_train, y_train)

# Get feature importances, most important first
importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
importances = importances.sort_values('Importance', ascending=False)

# Print the top 30 most important features
print(importances.head(30))

print("OOB score:", rf.oob_score_)

# Predict salaries on the test set and calculate RMSE and R-squared.
# RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecated and then removed.
predictions = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r_squared = r2_score(y_test, predictions)
print('RMSE:', rmse)
print('R-squared:', r_squared)

# Keras model 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras import layers

# Keras needs a single homogeneous numeric array. Newer pandas versions emit bool
# columns from get_dummies, which mix with the integer/float columns, so cast the
# feature frames to float32 explicitly before handing them to the network.
X_train_nn = X_train.astype('float32')
X_test_nn = X_test.astype('float32')

# Define the model architecture: a funnel of ReLU layers narrowing to one output
model = Sequential()
# Input width is taken from the data so the cell keeps working if the number of
# encoded columns changes.
model.add(Dense(400, input_shape=(X_train_nn.shape[1],), activation='relu'))
# Dropout layers must be added to the model; constructing them alone has no effect.
model.add(layers.Dropout(rate=0.2))
model.add(Dense(200, activation='relu'))
model.add(layers.Dropout(rate=0.2))
model.add(Dense(100, activation='relu'))
model.add(layers.Dropout(rate=0.2))
model.add(Dense(50, activation='relu'))
model.add(layers.Dropout(rate=0.2))
model.add(Dense(10, activation='relu'))
model.add(layers.Dropout(rate=0.2))
model.add(Dense(1, activation='linear'))


# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train, using the test split as validation data so both curves can be plotted
history = model.fit(
    X_train_nn, y_train,
    validation_data=(X_test_nn, y_test),
    batch_size=20,
    epochs=100,
)

# evaluate() returns the compiled loss, which is MSE; take the square root so the
# printed value really is an RMSE and is comparable with the random forest result.
mse = model.evaluate(X_test_nn, y_test, verbose=0)
rmse = np.sqrt(mse)
print('RMSE:', rmse)


# Show the learning curves (training vs. validation MSE per epoch)
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot();


# Conclusion
Based on these experiments, I found that random forest performs well compared to a Keras neural network on this data and that using different encoding methods for categorical features did not significantly affect the performance of Random Forest and Keras models on our dataset. However, it's important to note that the effectiveness of different encoding methods may vary depending on the specific dataset and the type of models used. In general, it's always a good idea to experiment with different encoding methods and model architectures to find the combination that works best for a given problem.