<a href="https://colab.research.google.com/github/21amY26/employee-salary-prediction/blob/main/02_model_eval_select.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Colab-only step: files.upload() prompts for a local file and saves it into
# the runtime's working directory (the recorded output shows
# prep_emp_sal_data.csv being saved). `uploaded` holds the call's return value.
from google.colab import files
uploaded = files.upload()

Saving prep_emp_sal_data.csv to prep_emp_sal_data.csv


In [5]:
import pandas as pd
# Read the uploaded CSV (same file name the upload cell reported) into a
# DataFrame; it is expected to contain the 'Salary (USD)' target column.
df = pd.read_csv('prep_emp_sal_data.csv')

In [6]:
# Separate the frame into the feature matrix (every column except the salary)
# and the regression target (the salary column itself).
x = df.drop(columns='Salary (USD)')
y = df['Salary (USD)']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed random_state makes
# the same split come out on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,y,test_size=0.2, random_state=42
)

In [21]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import numpy as np

# Candidate regressors keyed by the display name used when results are
# reported. Every model that accepts a seed is given random_state=42.
models ={
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100,random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100,random_state=42)
}

In [23]:
# Fit every candidate on the training split, score it on the held-out split,
# and store the metrics per model name for the selection step.
results = {}
print("Model Evaluation:")
for name, model in models.items():
  # fit() returns the fitted estimator, so training and prediction chain.
  y_pred = model.fit(x_train, y_train).predict(x_test)

  mae = mean_absolute_error(y_test, y_pred)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
  r2 = r2_score(y_test, y_pred)

  results[name] = dict(MAE=mae, RMSE=rmse, R2=r2)
  print(f"{name}:\n\t\tMAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.4f}")

Model Evaluation:
Linear Regression:
		MAE=9323.63, RMSE=11814.07, R2=0.7740
Decision Tree:
		MAE=13352.73, RMSE=18588.68, R2=0.4404
Random Forest:
		MAE=9031.21, RMSE=12461.22, R2=0.7485
XGBoost:
		MAE=9185.51, RMSE=13265.52, R2=0.7150


In [24]:
# Pick the model name with the highest test-split R2 from `results`.
# The lambda argument is named model_name so it does not reuse the name of
# the feature frame `x`.
best_model = max(results,
                 key=lambda model_name: results[model_name]["R2"])
print(f"Best Model: {best_model}")
# Index with the *value* of best_model. The quoted literal 'best_model' is not
# a key of `models`, so the previous lookup raised KeyError.
base_best_model = models[best_model]

Best Model: Linear Regression


In [43]:
#feature importance
def feature_importance(base_best_model, feature_names=None, model_name=None, top_n=20):
  '''
  Plot the highest feature importances of a fitted model as a bar chart.

  parameters: 1. base_best_model: fitted estimator; a plot is drawn only if
                 it exposes a `feature_importances_` attribute
              2. feature_names: labels matching the importances, in the same
                 order; defaults to the notebook-level `x.columns`
              3. model_name: label used in the title / message; defaults to
                 the notebook-level `best_model`
              4. top_n: how many of the highest-ranked features to plot

  returns: None (shows a plot, or prints a message when the model has no
           `feature_importances_` attribute)
  '''
  # Fall back to the notebook global so one-argument calls keep working.
  if model_name is None:
    model_name = best_model

  # The attribute is called `feature_importances_`; the former check for
  # `feat_imp_` never matched, so the plot branch was unreachable.
  if not hasattr(base_best_model, "feature_importances_"):
    print(f"Feature importance not applicable with {model_name}")
    return

  if feature_names is None:
    feature_names = x.columns

  # Sort by the column that actually exists ("Importance", singular).
  feat_df = pd.DataFrame({
      "Feature": list(feature_names),
      "Importance": base_best_model.feature_importances_
  }).sort_values(by="Importance", ascending=False)

  plt.figure(figsize=(10,8))
  sns.barplot(x="Feature", y="Importance", data=feat_df.head(top_n))
  plt.title(f"Top {top_n} features - {model_name}")
  plt.xticks(rotation=90)  # long feature names overlap when drawn horizontally
  plt.tight_layout()       # must be called; the bare attribute did nothing
  plt.show()

In [39]:
# Hyperparameter tuning for the random forest: exhaustive grid search with
# 5-fold cross-validation on the training split, ranked by R2.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
).fit(x_train, y_train)  # fit() returns the fitted search object

best_rf_model = grid.best_estimator_
print(f"Best parameters found: {grid.best_params_}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [40]:
# Score the tuned random forest on the held-out test split.
y_pred = best_rf_model.predict(x_test)
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mean_squared_error(y_test, y_pred)),  # RMSE = sqrt(MSE)
    r2_score(y_test, y_pred),
)

print(f"Tuned Random Forest Model:\n\t\t\tMAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.4f}")

Tuned Random Forest Model:
			MAE=9031.21, RMSE=12461.22, R2=0.7485


In [42]:
#tuning xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: 20 sampled configurations, 5-fold cross-validation on
# the training split, ranked by negated mean absolute error.
# NOTE(review): the random-forest search is ranked by R2 instead - confirm the
# different selection metric is intentional.
xgb = XGBRegressor()

param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2],  # minimum loss reduction
)

rs_xgb = RandomizedSearchCV(
    xgb,
    param_grid,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=42,
).fit(x_train, y_train)  # fit() returns the fitted search object

# Score the best configuration found on the held-out test split.
best_xgb = rs_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(x_test)

mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred_xgb),
    np.sqrt(mean_squared_error(y_test, y_pred_xgb)),  # RMSE = sqrt(MSE)
    r2_score(y_test, y_pred_xgb),
)

print(f"Tuned XGBoost Model:\n\t\t\tMAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Tuned XGBoost Model:
			MAE=8492.42, RMSE=11761.68, R2=0.7760


In [51]:
# Feature importance for the tuned XGBoost model (best estimator of rs_xgb).
# get_score() hands back the scores as a dict keyed by feature;
# importance_type='gain' selects the gain-based score.
importance = best_xgb.get_booster().get_score(importance_type='gain')

# One row per scored feature, highest score first.
importance_df = (
    pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance'])
    .sort_values(by='Importance', ascending=False)
)

importance_df.head(10)

Unnamed: 0,Feature,Importance
65,Writing,774032448.0
60,Safety,707745792.0
36,Data Analysis,685231424.0
10,Job Title_Cloud Engineer,670401664.0
18,Job Title_Machine Learning Engineer,570128512.0
46,JIRA,487703040.0
14,Job Title_Electrician,464861280.0
30,Azure,447048384.0
23,Job Title_Research Scientist,435878624.0
56,R,411763968.0


In [53]:
# Persist the tuned XGBoost estimator to disk, then read the file back into
# `model`.
# NOTE(review): joblib.load unpickles the file - only load artifacts you trust.
# NOTE(review): the sample prediction further down passes best_xgb, not this
# reloaded `model`, so the saved artifact itself is never exercised.
joblib.dump(best_xgb,'xgboost_salary_model.pkl')
model = joblib.load('xgboost_salary_model.pkl')

**Prediction function**

In [54]:
def predict_sal(inp, model, feat_cols):
  '''
  predicting sal using trained model
  parameters: 1. inp: dict with inp features; training features missing from
                 it are filled with 0, keys that are not training features
                 are ignored with a warning
              2. model: selected best model (any object with a predict()
                 method that accepts a one-row DataFrame)
              3. feat_cols: list of features used for training, in training
                 column order

  returns: salary (USD) as a built-in float rounded to 2 decimals
  '''
  import warnings

  feat_cols = list(feat_cols)

  # A key the model was never trained on cannot influence the prediction.
  # Report it instead of dropping it silently - it is usually a misspelled
  # or differently encoded column name.
  known = set(feat_cols)
  unknown = [key for key in inp.keys() if key not in known]
  if unknown:
    warnings.warn(
        f"Ignoring features not seen in training: {unknown}", stacklevel=2
    )

  # Build the single row in one step, already in training column order,
  # rather than inserting each missing column into the frame one by one.
  row = {col: inp.get(col, 0) for col in feat_cols}
  inp_df = pd.DataFrame([row], columns=feat_cols)

  # predict() may return a NumPy scalar (e.g. float32); convert it so the
  # result really is a plain float as documented.
  prediction = float(model.predict(inp_df)[0])
  return round(prediction, 2)

In [60]:
# Example input for a single employee. Only some features are listed;
# predict_sal fills every training column that is absent here with 0.
# NOTE(review): a key that does not exactly match a column of `x` never
# reaches the model - confirm these names against x.columns.
sample = {
    'Age': 28,
    'Experience (Years)': 4,
    'Hours/Week': 40,
    'Education Level Encoded': 2,
    'Work Mode_Remote': 1,
    'Work Mode_Hybrid': 0,
    'Work Mode_Onsite': 0,
    'Job Title_Software Engineer': 1,
    'Python': 1,
    'SQL': 1,
    'Industry_IT': 1,
    # ... all other required one-hot/skill columns as 0 or 1
}

# Predict with the tuned XGBoost estimator, using the training column order.
pred_sal = predict_sal(sample,best_xgb,x.columns.tolist())
print(f"Predicted Salary: ${pred_sal:.2f}")

Predicted Salary: $108333.92
