In [34]:
#importing required libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, KFold, LeaveOneGroupOut
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
df = pd.read_csv('Salary_Data.csv')

In [3]:
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [4]:
df.isnull().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df.isnull().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,6698.0,6698.0,6698.0
mean,33.623022,8.095178,115329.253061
std,7.615784,6.060291,52789.792507
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


In [8]:
X_reg = df.drop("Salary", axis=1)
y_reg = df["Salary"]

In [9]:
# Perform one-hot encoding for categorical variables
data_encoded = pd.get_dummies(df, columns=["Gender", "Job Title", "Education Level"])

In [10]:
# Split into features (X) and target variable (y) for regression
X_reg = data_encoded.drop("Salary", axis=1)
y_reg = data_encoded["Salary"]

In [13]:
# Split into features (X) and target variable (y) for classification
salary_threshold = data_encoded["Salary"].mean()
data_encoded["Salary_Class"] = np.where(data_encoded["Salary"] >= salary_threshold, "High", "Low")
X_cls = data_encoded.drop(["Salary", "Salary_Class"], axis=1)
y_cls = data_encoded["Salary_Class"]

In [14]:
# Train-test split for regression
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

In [16]:
# Decision Tree Regression
reg_model = DecisionTreeRegressor()

In [17]:
# Fit the model
reg_model.fit(X_reg_train, y_reg_train)

DecisionTreeRegressor()

In [18]:
# Predict on test set
y_reg_pred = reg_model.predict(X_reg_test)

In [15]:
# Train-test split for classification
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

In [19]:
# Decision Tree Classification
cls_model = DecisionTreeClassifier()

In [20]:
# Fit the model
cls_model.fit(X_cls_train, y_cls_train)

DecisionTreeClassifier()

In [21]:
# Predict on test set
y_cls_pred = cls_model.predict(X_cls_test)

In [24]:
# Regression

# Train/test split evaluation
reg_mse = mean_squared_error(y_reg_test, y_reg_pred)

In [25]:
# k-Fold Cross-Validation evaluation
kfold_scores = cross_val_score(reg_model, X_reg, y_reg, cv=5, scoring='neg_mean_squared_error')
reg_mse_kfold = -kfold_scores.mean()

In [26]:
# Leave-One-Out Cross-Validation evaluation
loo = LeaveOneOut()
loo_scores = cross_val_score(reg_model, X_reg, y_reg, cv=loo, scoring='neg_mean_squared_error')
reg_mse_loo = -loo_scores.mean()

In [27]:
# Classification

# Train/test split evaluation
cls_report = classification_report(y_cls_test, y_cls_pred)

In [28]:
# k-Fold Cross-Validation evaluation
kfold_scores = cross_val_score(cls_model, X_cls, y_cls, cv=5, scoring='accuracy')
cls_accuracy_kfold = kfold_scores.mean()

In [29]:
# Leave-One-Out Cross-Validation evaluation
loo = LeaveOneOut()
loo_scores = cross_val_score(cls_model, X_cls, y_cls, cv=loo, scoring='accuracy')
cls_accuracy_loo = loo_scores.mean()

In [33]:
# Print the results

print("Regression Model Evaluation:")
print("\nMSE (Train/Test Split):", reg_mse)
print("MSE (k-Fold CV):", reg_mse_kfold)
print("MSE (Leave-One-Out CV):", reg_mse_loo)

print("\nClassification Model Evaluation:")

print("\nClassification Report (Train/Test Split):\n", cls_report)
print("Accuracy Score (k-Fold CV):", cls_accuracy_kfold)
print("Accuracy Score (Leave-One-Out CV):", cls_accuracy_loo)


Regression Model Evaluation:

MSE (Train/Test Split): 63825986.35991665
MSE (k-Fold CV): 804454635.5022757
MSE (Leave-One-Out CV): 58128824.002516374

Classification Model Evaluation:

Classification Report (Train/Test Split):
               precision    recall  f1-score   support

        High       0.98      0.98      0.98       655
         Low       0.98      0.98      0.98       685

    accuracy                           0.98      1340
   macro avg       0.98      0.98      0.98      1340
weighted avg       0.98      0.98      0.98      1340

Accuracy Score (k-Fold CV): 0.8686280695105504
Accuracy Score (Leave-One-Out CV): 0.9825320991340699


Regression Model Evaluation:
- MSE (Train/Test Split): 63825986.35991665
- MSE (k-Fold CV): 804454635.5022757
- MSE (Leave-One-Out CV): 58128824.002516374

The regression model shows a lower mean squared error (MSE) value for the Train/Test Split evaluation compared to the k-Fold CV and Leave-One-Out CV evaluations. This suggests that the model may be overfitting to the training data, resulting in poor performance on unseen data. The higher MSE values in k-Fold CV and Leave-One-Out CV indicate that the model's performance is more consistent across different folds or leave-one-out iterations, providing a more reliable estimate of the model's generalization performance.

Classification Model Evaluation:
- Classification Report (Train/Test Split):
  - Precision: 0.98
  - Recall: 0.98
  - F1-Score: 0.98
  - Accuracy: 0.98

- Accuracy Score (k-Fold CV): 0.8686280695105504
- Accuracy Score (Leave-One-Out CV): 0.9825320991340699

The classification model shows high precision, recall, and F1-score values for both classes (High and Low) in the Train/Test Split evaluation, indicating a good classification performance. The accuracy score of 0.98 also suggests a high overall accuracy of the model. However, the k-Fold CV evaluation shows a lower accuracy score, indicating that the model's performance may vary across different folds. The Leave-One-Out CV evaluation demonstrates high accuracy, suggesting that the model generalizes well to unseen data when only one sample is left out at a time.

In conclusion, the regression model shows signs of overfitting, as it performs better on the training data compared to unseen data. The classification model demonstrates consistent performance across different validation techniques, with the Train/Test Split and Leave-One-Out CV evaluations showing high accuracy and precision. Therefore, the well-balanced model, in this case, is the classification model evaluated using the Train/Test Split and Leave-One-Out CV techniques.