In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Read the survey CSV directly from the Hugging Face Hub via the `hf://` path.
# NOTE(review): presumably this needs the `huggingface_hub` fsspec backend to be
# installed (it is on Colab) — confirm before running elsewhere.
df = pd.read_csv(
    'hf://datasets/nprak26/remote-worker-productivity/remote_work_productivity.csv'
)

# Bare last expression so the notebook renders the (truncated) frame preview
df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,Employee_ID,Age,Years_Experience,WFH_Days_Per_Week,Gender,Education_Level,Marital_Status,Has_Children,Location_Type,Department,...,Quality_Score,Innovation_Score,Efficiency_Rating,Meetings_Per_Week,Commute_Time_Minutes,Job_Satisfaction,Stress_Level,Work_Life_Balance,Survey_Date,Response_Quality
0,EMP0001,39,10,2,Female,Associate Degree,Married,Yes,Urban,Product,...,58.1,52.1,72.1,4,48,55.9,6,8,2024-04-05,Medium
1,EMP0002,33,4,5,Female,Master Degree,Married,No,Urban,Customer Success,...,93.3,77.9,89.5,12,0,96.1,3,8,2024-01-29,High
2,EMP0003,40,3,3,Male,PhD,Single,Yes,Rural,Operations,...,84.7,63.2,95.0,15,24,90.4,5,6,2024-01-18,High
3,EMP0004,48,14,3,Male,Bachelor Degree,Married,Yes,Urban,Finance,...,67.8,82.5,95.0,8,8,100.0,10,5,2024-04-18,High
4,EMP0005,32,6,5,Male,High School,Divorced,Yes,Rural,Engineering,...,86.4,67.5,95.0,10,0,100.0,3,4,2024-02-19,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,EMP1496,53,5,5,Male,Master Degree,Married,No,Urban,HR,...,84.1,58.3,89.4,9,0,71.4,8,5,2024-03-18,High
1496,EMP1497,53,9,3,Non-binary,Master Degree,Married,Yes,Suburban,Finance,...,74.7,54.2,87.2,3,11,95.8,8,8,2024-05-13,High
1497,EMP1498,45,2,4,Male,Associate Degree,Single,Yes,Suburban,Engineering,...,88.4,71.2,95.0,11,34,100.0,4,5,2024-04-15,High
1498,EMP1499,44,12,4,Female,Bachelor Degree,Single,Yes,Suburban,Operations,...,93.9,88.6,95.0,10,6,100.0,3,6,2024-02-21,Medium


In [2]:
# Build the feature matrix by removing the label column and the features
# treated as unnecessary for modelling.
# NOTE(review): the score-like columns that remain (e.g. Efficiency_Rating,
# Quality_Score) dominate the feature importances reported at the end of this
# notebook — confirm they are not components of Productivity_Score (possible
# target leakage).
X = df.drop(
    columns=[
        'Internet_Speed_Category',
        'Response_Quality',
        'Productivity_Score',
        'Employee_ID',
        'Survey_Date',
    ]
)
y = df['Productivity_Score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Partition the training columns by dtype: `object` columns are handled as
# categorical features, every other dtype as numerical features.
# NOTE(review): this relies on text columns being loaded with `object` dtype —
# confirm for the pandas version in use.
cat_features = list(X_train.select_dtypes(include='object').columns)
num_features = list(X_train.select_dtypes(exclude='object').columns)

# Numerical branch: fill missing values with the column median, then standardise
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fitting are ignored instead of raising
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Full model: preprocessing followed by a seeded Random Forest regressor.
# The step names are referenced later ('regressor__*' grid keys, named_steps).
reg = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate Random Forest settings. The `regressor__` prefix routes each
# parameter to the pipeline step named 'regressor'.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
)

# Exhaustive search over the grid with 5-fold cross-validation, ranked by
# negated mean squared error (so a score closer to zero is better)
grid = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score
print(
    "Best hyperparameters for Random Forest Regressor:",
    grid.best_params_,
    grid.best_score_,
    sep="\n",
)

Best hyperparameters for Random Forest Regressor:
{'regressor__max_depth': None, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
-17.101879500208327


In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Take the best pipeline found by the grid search and predict the held-out rows
best_model = grid.best_estimator_
y_test_pred = best_model.predict(X_test)

# Test-set error metrics; RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred))
mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)

RMSE: 3.968202065482554
MAE: 3.100668333333335
R2 Score: 0.923148281565124


# Evaluation Interpretation

## MAE = 3.101
* On average, the predictions are off by about 3 points.
* Given that the productivity score ranges from roughly 35 to 98, an average error of about 3 is small.
* However, MAE weights every error equally, so large mistakes receive no extra penalty.

## RMSE = 3.9682
* Larger errors are penalized, thus large errors increase RMSE score.
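* RMSE is always at least as large as MAE; the gap here (3.97 vs. 3.10) indicates that some predictions are off by noticeably more than the average error, which MAE alone does not reveal.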

## R2 = 0.9231
* The model does a good job of explaining the variance in worker productivity.
* About 92% of the variance in productivity_score on the test set is explained by the worker's features (e.g. education level, WFH days, etc.). Note that this is a proportion of variance explained, not the share of predictions that are correct.

In [6]:
# Pull the fitted steps out of the best pipeline. Note that this rebinds
# `preprocessor` (and the feature lists below) to the fitted versions.
preprocessor = best_model['preprocessor']
regressor = best_model['regressor']

# Column lists as recorded by the fitted ColumnTransformer: the first entry is
# the numerical branch, the second the categorical branch
num_features, cat_features = (spec[2] for spec in preprocessor.transformers_[:2])

# Names of the columns each branch emits (one-hot names come from the encoder step)
num_out = preprocessor.named_transformers_['num'].get_feature_names_out(num_features)
onehot_step = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_out = onehot_step.get_feature_names_out(cat_features)

# Numerical names first, then categorical, mirroring the transformer order
feature_names = [*num_out, *cat_out]

# Importance of each transformed feature according to the fitted forest
importances = regressor.feature_importances_

# Tabulate and rank from most to least important
feature_importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values('Importance', ascending=False)

feature_importances_df


Unnamed: 0,Feature,Importance
7,Efficiency_Rating,0.505085
5,Quality_Score,0.378302
4,Task_Completion_Rate,0.060895
6,Innovation_Score,0.011977
9,Commute_Time_Minutes,0.003161
...,...,...
59,Industry_Non-profit,0.000121
15,Gender_Non-binary,0.000114
70,Manager_Support_Level_Very Low,0.000063
21,Education_Level_Professional Degree,0.000031
