# All Imports

In [13]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error

# Load Dataset from Drive

In [10]:
# Switch the working directory to the Drive folder holding the datasets,
# then read the CSV through a path relative to that folder.
os.chdir('/content/drive/MyDrive/Colab/Datasets/')
df = pd.read_csv(filepath_or_buffer='job_satisfaction.csv')

# Bare last expression: the notebook renders the frame as a rich table.
df

Unnamed: 0,Gender,Education_Level,Age,Years_of_Experience,Hours_Worked_Per_Week,Salary,Job_Satisfaction
0,Male,Master,41,15,42,87446,8
1,Female,Bachelor,36,22,34,73666,10
2,Male,Bachelor,41,14,41,113796,10
3,Male,PhD,16,29,36,53834,14
4,Male,Bachelor,37,18,41,53205,8
...,...,...,...,...,...,...,...
95,Female,Bachelor,30,8,45,23961,7
96,Female,Bachelor,39,21,55,39696,9
97,Female,PhD,44,15,50,97337,7
98,Female,High School,43,10,62,102899,6


# One-Hot Encoding for Gender & Education Level

In [8]:
# One-hot encode the two categorical columns. With drop_first=True the first
# level of each column is dropped, so only the remaining levels get an
# indicator column (e.g. Gender becomes the single column Gender_Male).
categorical_columns = ['Gender', 'Education_Level']
df_encoded = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

# Show the encoded frame.
df_encoded

Unnamed: 0,Age,Years_of_Experience,Hours_Worked_Per_Week,Salary,Job_Satisfaction,Gender_Male,Education_Level_High School,Education_Level_Master,Education_Level_PhD
0,41,15,42,87446,8,True,False,True,False
1,36,22,34,73666,10,False,False,False,False
2,41,14,41,113796,10,True,False,False,False
3,16,29,36,53834,14,True,False,False,True
4,37,18,41,53205,8,True,False,False,False
...,...,...,...,...,...,...,...,...,...
95,30,8,45,23961,7,False,False,False,False
96,39,21,55,39696,9,False,False,False,False
97,44,15,50,97337,7,False,False,False,True
98,43,10,62,102899,6,False,True,False,False


# Data Splitting

In [11]:
# Separate the target from the predictors.
y = df_encoded['Job_Satisfaction']
x = df_encoded.drop(columns='Job_Satisfaction')

# Two-stage split: hold out 20% of the rows first, then halve that hold-out
# into validation and test, giving 80% / 10% / 10% overall.
# The fixed random_state makes both splits repeatable.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42
)

# Sanity check: report the shape of each partition.
print(
    f'Training set: {x_train.shape}',
    f'Validation set: {x_val.shape}',
    f'Test set: {x_test.shape}',
    sep='\n',
)

Training set: (80, 8)
Validation set: (10, 8)
Test set: (10, 8)


# Model Training & Selection

In [19]:
# Candidate penalty (alpha) values for Lasso & Ridge
penalties_lasso = [0.01, 0.2, 0.3, 3, 0.7]
penalties_ridge = [0.1, 1.2, 0.3, 2.3, 1]


def select_best_penalty(model_cls, model_name, penalties, x_fit, y_fit, x_eval, y_eval):
    """Fit one model per penalty and keep the one with the lowest validation MSE.

    Parameters
    ----------
    model_cls : estimator class accepting an ``alpha`` keyword (e.g. Lasso, Ridge).
    model_name : label used in the printed progress lines.
    penalties : iterable of alpha values to try, in the order given.
    x_fit, y_fit : data the model is fitted on.
    x_eval, y_eval : data the MSE is computed on.

    Returns
    -------
    (best_penalty, best_mse, best_model)
        ``best_penalty`` and ``best_model`` are ``None`` and ``best_mse`` is
        ``inf`` when ``penalties`` is empty. On ties the first penalty wins,
        because the comparison is a strict ``<``.
    """
    best_penalty, best_mse, best_model = None, float('inf'), None

    for penalty in penalties:
        model = model_cls(alpha=penalty)
        model.fit(x_fit, y_fit)

        mse = mean_squared_error(y_eval, model.predict(x_eval))
        print(f'{model_name} - Penalty: {penalty}, MSE: {mse:.2f}')

        if mse < best_mse:
            best_penalty, best_mse, best_model = penalty, mse, model

    print(f'Best {model_name} Penalty: {best_penalty}, Best MSE: {best_mse:.2f}')
    return best_penalty, best_mse, best_model


# NOTE(review): the features are passed to the models unscaled. Penalised
# regressions are sensitive to feature scale, so consider standardising the
# predictors before comparing penalties.

# Run the penalties through the Lasso model and find the best penalty.
# `lasso_model` is bound to the model fitted with the winning penalty.
best_penalty_lasso, best_mse_lasso, lasso_model = select_best_penalty(
    Lasso, 'Lasso', penalties_lasso, x_train, y_train, x_val, y_val
)

print()

# Run the penalties through the Ridge model and find the best penalty.
# `ridge_model` is bound to the model fitted with the winning penalty.
best_penalty_ridge, best_mse_ridge, ridge_model = select_best_penalty(
    Ridge, 'Ridge', penalties_ridge, x_train, y_train, x_val, y_val
)

Lasso - Penalty: 0.01, MSE: 1.20
Lasso - Penalty: 0.2, MSE: 1.26
Lasso - Penalty: 0.3, MSE: 1.22
Lasso - Penalty: 3, MSE: 0.96
Lasso - Penalty: 0.7, MSE: 1.08
Best Lasso Penalty: 3, Best MSE: 0.96

Ridge - Penalty: 0.1, MSE: 1.18
Ridge - Penalty: 1.2, MSE: 1.18
Ridge - Penalty: 0.3, MSE: 1.18
Ridge - Penalty: 2.3, MSE: 1.18
Ridge - Penalty: 1, MSE: 1.18
Best Ridge Penalty: 0.1, Best MSE: 1.18
