In [17]:
# imports
import pandas as pd
import numpy as np

# Read the salary dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer="Salary_Data.csv")

# Sanity check on the load: report (rows, columns).
print("Shape:", df.shape)

Shape: (6704, 6)


In [18]:
# Preview the first five rows to eyeball column names and value formats.
df.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [19]:

# Remove the experience column from the working frame. errors="ignore" keeps
# this cell re-runnable once the column is already gone.
# NOTE(review): the reason for excluding this feature is not shown here — confirm.
df = df.drop("Years of Experience", axis=1, errors="ignore")
df.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Salary
0,32.0,Male,Bachelor's,Software Engineer,90000.0
1,28.0,Female,Master's,Data Analyst,65000.0
2,45.0,Male,PhD,Senior Manager,150000.0
3,36.0,Female,Bachelor's,Sales Associate,60000.0
4,52.0,Male,Master's,Director,200000.0


In [20]:
# handle missing
from sklearn.impute import SimpleImputer

# Feature columns fed to the model and the regression target.
# NOTE(review): "Age" holds numeric values in the preview above, yet it is
# listed with the categorical features, so the one-hot encoder downstream
# treats every distinct age as its own category — confirm this is intended.
cat_cols = ["Age", "Gender", "Education Level", "Job Title"]
target_col = "Salary"

# Rows without a target cannot be used for supervised training or scoring,
# so they are dropped (the count is printed first for transparency).
missing_salary_count = df[target_col].isna().sum()
print(f"Rows with missing Salary: {missing_salary_count}")
df = df.dropna(subset=[target_col]).copy()

# Missing FEATURE values are deliberately left in place here. Fitting an
# imputer on the full dataset before the train/test split would let test rows
# influence the fill values (data leakage). The preprocessing pipeline defined
# later already contains a most-frequent SimpleImputer, which is fitted on the
# training data only each time the pipeline is fitted.
print("Missing values after dropping rows without Salary:")
print(df.isna().sum())


Rows with missing Salary: 5
Missing values after imputation/dropping:
Age                0
Gender             0
Education Level    0
Job Title          0
Salary             0
dtype: int64


In [21]:
# train test split
from sklearn.model_selection import train_test_split

# Feature matrix and target vector taken from the working frame.
X, y = df[cat_cols], df[target_col]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Confirm the sizes of both partitions.
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (5359, 4) (5359,)
Test shape: (1340, 4) (1340,)


In [22]:
# preprocessing pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Per-column treatment for the categorical features: fill gaps with the most
# frequent value, then one-hot encode. Categories unseen during fit are
# encoded as all zeros instead of raising (handle_unknown="ignore").
impute_step = ("imputer", SimpleImputer(strategy="most_frequent"))
encode_step = ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
categorical_transformer = Pipeline(steps=[impute_step, encode_step])

# Apply that treatment to the columns in cat_cols; any other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, cat_cols)],
    remainder="drop",
)

# Fit on the training split once, purely to inspect the encoded feature count.
X_train_prep = preprocessor.fit_transform(X_train)
print("Transformed X_train shape:", X_train_prep.shape)


Transformed X_train shape: (5359, 219)


In [23]:
# models setup
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Candidate regressors keyed by display name. Insertion order determines the
# order in which later cells evaluate them. Seeds are fixed where the
# estimator accepts one so that results are repeatable.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(random_state=42),
    RandomForest=RandomForestRegressor(random_state=42, n_jobs=-1),
    XGBoost=XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
)

print("Models to evaluate:", [*models])


Models to evaluate: ['LinearRegression', 'Ridge', 'RandomForest', 'XGBoost', 'GradientBoosting']


In [24]:

from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# One shuffled, seeded 5-fold splitter shared by every model, so all of them
# are scored on identical folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn scorers follow "greater is better", hence the negated errors.
scoring = dict(MAE="neg_mean_absolute_error", MSE="neg_mean_squared_error", R2="r2")

results = {}
for name, model in models.items():
    # Preprocessing lives inside the pipeline, so it is refitted on the
    # training portion of every fold.
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    cv_res = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )

    # Undo the sign flip on the error metrics and average across folds.
    # RMSE is the square root of the fold-averaged MSE.
    mae = -cv_res["test_MAE"].mean()
    mse = -cv_res["test_MSE"].mean()
    rmse = np.sqrt(mse)
    r2 = cv_res["test_R2"].mean()

    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}
    print(f"{name}: MAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.3f}")

# One row per model, best (lowest) RMSE first.
import pandas as pd
results_df = (
    pd.DataFrame.from_dict(results, orient="index")[["MAE", "RMSE", "R2"]]
    .sort_values("RMSE")
)
print("\nSummary:\n", results_df)


LinearRegression: MAE=13301.29, RMSE=19067.96, R2=0.869
Ridge: MAE=13684.19, RMSE=19210.71, R2=0.867
RandomForest: MAE=5662.18, RMSE=12278.14, R2=0.946
XGBoost: MAE=7812.27, RMSE=12619.95, R2=0.943
GradientBoosting: MAE=18422.34, RMSE=24119.54, R2=0.790

Summary:
                            MAE          RMSE        R2
RandomForest       5662.180992  12278.142210  0.945567
XGBoost            7812.273137  12619.948579  0.942537
LinearRegression  13301.287971  19067.958277  0.868844
Ridge             13684.186299  19210.714121  0.866863
GradientBoosting  18422.343177  24119.540202  0.790028


In [25]:
# evaluate_all_models_on_test

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Hold-out metrics for every untuned model, keyed by model name.
test_results = {}

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    # Fit on the training split only, then predict the untouched test split.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    test_results[name] = dict(MAE=mae, RMSE=rmse, R2=r2)
    print(f"{name}:  MAE={mae:.2f} | RMSE={rmse:.2f} | R2={r2:.3f}")

# One row per model, lowest test RMSE first.
test_results_df = pd.DataFrame.from_dict(test_results, orient="index").sort_values("RMSE")
print("\n=== Test set comparison ===")
print(test_results_df)


LinearRegression:  MAE=13099.69 | RMSE=18636.26 | R2=0.878
Ridge:  MAE=13383.41 | RMSE=18671.68 | R2=0.877
RandomForest:  MAE=4851.63 | RMSE=10292.76 | R2=0.963
XGBoost:  MAE=6893.90 | RMSE=10902.53 | R2=0.958
GradientBoosting:  MAE=18285.82 | RMSE=23925.58 | R2=0.799

=== Test set comparison ===
                           MAE          RMSE        R2
RandomForest       4851.633762  10292.759522  0.962736
XGBoost            6893.897224  10902.530490  0.958190
LinearRegression  13099.686340  18636.258713  0.877835
Ridge             13383.413088  18671.675426  0.877370
GradientBoosting  18285.817647  23925.576779  0.798649


In [26]:
# hyperparameter_tuning_all_models

from sklearn.model_selection import RandomizedSearchCV, KFold

# Search space per model. Keys use the "model__" prefix so the parameters are
# routed to the pipeline step named "model". Models without an entry here
# (LinearRegression) are skipped by the tuning loop below.
param_grids = {
    "Ridge": {
        "model__alpha": [0.01, 0.1, 1, 10, 100]
    },
    "RandomForest": {
        "model__n_estimators": [100, 200, 400],
        "model__max_depth": [None, 10, 20, 40],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4]
    },
    "XGBoost": {
        "model__n_estimators": [100, 200, 400],
        "model__learning_rate": [0.01, 0.05, 0.1],
        "model__max_depth": [3, 6, 10],
        "model__subsample": [0.7, 1.0],
        "model__colsample_bytree": [0.7, 1.0]
    },
    "GradientBoosting": {
        "model__n_estimators": [100, 200, 300],
        "model__learning_rate": [0.01, 0.05, 0.1],
        "model__max_depth": [3, 4, 5]
    }
}

# Same shuffled, seeded 5-fold scheme for every search.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Upper bound on sampled parameter settings per model; increase for deeper search.
max_candidates = 10

tuned_models = {}    # model name -> best pipeline found by the search
tuned_results = {}   # model name -> test metrics and winning parameters

for name, model in models.items():
    if name not in param_grids:
        print(f"Skipping {name} (no params to tune).")
        continue

    print(f"\n🔍 Tuning {name} ...")
    grid = param_grids[name]

    # Every search space above is a set of finite lists, so the number of
    # distinct combinations is the product of the list lengths. Requesting
    # more iterations than that (e.g. 10 for Ridge's 5 alphas) makes
    # scikit-learn emit a warning and shrink the search, so cap it up front.
    grid_size = int(np.prod([len(values) for values in grid.values()]))
    n_iter = min(max_candidates, grid_size)

    pipe = Pipeline([("preprocessor", preprocessor), ("model", model)])
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=grid,
        n_iter=n_iter,
        cv=cv,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        verbose=1,
        random_state=42
    )
    search.fit(X_train, y_train)
    tuned_models[name] = search.best_estimator_

    print(f"✅ Best {name} params:", search.best_params_)
    # The scorer is negated MAE, so flip the sign for display.
    print(f"Best CV MAE: {-search.best_score_:.2f}")

    # Evaluate tuned model on test set
    y_pred = tuned_models[name].predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    tuned_results[name] = {
        "MAE (Test)": mae,
        "RMSE (Test)": rmse,
        "R2 (Test)": r2,
        "Best Params": search.best_params_
    }

# Final comparison: one row per tuned model, lowest test RMSE first.
tuned_results_df = pd.DataFrame(tuned_results).T.sort_values("RMSE (Test)")
print("\n=== Tuned model results on test set ===")
print(tuned_results_df)

Skipping LinearRegression (no params to tune).

🔍 Tuning Ridge ...
Fitting 5 folds for each of 5 candidates, totalling 25 fits




✅ Best Ridge params: {'model__alpha': 0.01}
Best CV MAE: 13302.50

🔍 Tuning RandomForest ...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
✅ Best RandomForest params: {'model__n_estimators': 400, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_depth': 40}
Best CV MAE: 5831.97

🔍 Tuning XGBoost ...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
✅ Best XGBoost params: {'model__subsample': 1.0, 'model__n_estimators': 200, 'model__max_depth': 6, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.7}
Best CV MAE: 9285.58

🔍 Tuning GradientBoosting ...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
✅ Best GradientBoosting params: {'model__n_estimators': 300, 'model__max_depth': 5, 'model__learning_rate': 0.05}
Best CV MAE: 11981.15

=== Tuned model results on test set ===
                    MAE (Test)   RMSE (Test) R2 (Test)  \
RandomForest       4964.903802  10256.787523  0.962996   
XGBoost            8496.831504  1

In [27]:
import pickle

# Take the tuned RandomForest pipeline produced by the hyperparameter search.
# It is a full pipeline (preprocessing + model).
best_rf_model = tuned_models["RandomForest"]

# Serialize the pipeline to disk. Pickle files should only ever be loaded
# from trusted sources, since unpickling can execute arbitrary code.
with open("rf_salary_model.pkl", "wb") as model_file:
    pickle.dump(best_rf_model, model_file)

print("✅ Re-saved the correct RandomForest pipeline model!")


✅ Re-saved the correct RandomForest pipeline model!
