## Model Training

### Importing necessary libraries

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

### Importing the Data

In [3]:
df = pd.read_csv('data/processed_data.csv')

### Show Top 5 records

In [4]:
df.head(5)

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary
0,CFO,MASTERS,MATH,HEALTH,10,83,130
1,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101
2,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137
3,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142
4,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163


### Verify column data types

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   jobType              1000000 non-null  object
 1   degree               1000000 non-null  object
 2   major                1000000 non-null  object
 3   industry             1000000 non-null  object
 4   yearsExperience      1000000 non-null  int64 
 5   milesFromMetropolis  1000000 non-null  int64 
 6   salary               1000000 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 53.4+ MB


### Preparing X and y variables

In [6]:
X = df.drop('salary', axis=1)
y = df['salary']

In [7]:
X.head()

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
0,CFO,MASTERS,MATH,HEALTH,10,83
1,CEO,HIGH_SCHOOL,NONE,WEB,3,73
2,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38
3,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17
4,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16


In [8]:
# Creaate a Column Transformer with 2 types of transformers
num_features = X.select_dtypes(exclude = ['object']).columns
cat_features = X.select_dtypes(include = ['object']).columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [9]:
X = preprocessor.fit_transform(X)

In [10]:
X.shape

(1000000, 31)

In [11]:
# Prepare the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800000, 31), (200000, 31), (800000,), (200000,))

In [12]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return mae, mse, rmse, r2

#### Note:- For Training purpose, we are going to use only 10,000 records since the dataset is very large and it will take a long time to train the model

In [13]:
models = {
    "Linear Regression" : LinearRegression(),
    "KNeighbors" : KNeighborsRegressor(),
    "RandomForest" : RandomForestRegressor(),
    "AdaBoost" : AdaBoostRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "XGBoost" : XGBRegressor(),
    "SVR" : SVR(),
    "DecisionTree" : DecisionTreeRegressor()
}

# Create a DataFrame to store the results
train_results = pd.DataFrame(columns=['Model','MAE', 'MSE', 'RMSE', 'R2'])
test_results = pd.DataFrame(columns=['Model','MAE', 'MSE', 'RMSE', 'R2'])

subset_X_train = X_train[:10000]
subset_y_train = y_train[:10000]

for model_name, model in models.items():

    model.fit(subset_X_train, subset_y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_mae, train_mse, train_rmse, train_r2 = evaluate_model(y_train, train_pred)
    test_mae, test_mse, test_rmse, test_r2 = evaluate_model(y_test, test_pred)

    train_results.loc[len(train_results)] = [model_name, train_mae, train_mse, train_rmse, train_r2]
    test_results.loc[len(test_results)] = [model_name, test_mae, test_mse, test_rmse, test_r2]

### Analyze the Train and Test results

In [14]:
train_results = train_results.sort_values(by='R2', ascending=False)
train_results

Unnamed: 0,Model,MAE,MSE,RMSE,R2
4,GradientBoosting,15.711896,378.591917,19.457439,0.747265
0,Linear Regression,15.872854,385.783795,19.64138,0.742463
5,XGBoost,16.317398,418.603668,20.459806,0.720554
6,SVR,16.304444,419.542759,20.482743,0.719927
2,RandomForest,16.472643,427.452035,20.674913,0.714647
1,KNeighbors,17.313381,478.804366,21.881599,0.680366
3,AdaBoost,20.892377,633.246387,25.164387,0.577266
7,DecisionTree,22.212238,825.593157,28.733137,0.448861


In [15]:
test_results = test_results.sort_values('R2', ascending=False)
test_results

Unnamed: 0,Model,MAE,MSE,RMSE,R2
4,GradientBoosting,15.714417,378.295213,19.449813,0.748385
0,Linear Regression,15.856204,384.918043,19.619328,0.74398
6,SVR,16.305777,419.565427,20.483296,0.720935
5,XGBoost,16.378218,420.756927,20.51236,0.720143
2,RandomForest,16.57402,430.778902,20.755214,0.713477
1,KNeighbors,17.349455,480.651448,21.923764,0.680305
3,AdaBoost,20.842157,631.147992,25.122659,0.580205
7,DecisionTree,22.452867,831.793129,28.840824,0.446751


### Hyperparameter Tuning

The approach I am choosing to find the best model is:

1. Starting with RandomizedSearchCV to quickly find promissing hyperparameters
2. Using GridSearchCV only on the best models (e.g. Top 2/3) for fine tuning

In [16]:
param_grid = {
    "Linear Regression": {
        "fit_intercept": [True, False]
    },
    "KNeighbors": {
        "n_neighbors": [3, 5, 7, 10],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    },
    "RandomForest": {
        "n_estimators": [100, 200, 500],
        # "max_depth": [10, 20, 30, None],
        "min_samples_split": [2, 5, 10],
        # "min_samples_leaf": [1, 2, 4]
    },
    "AdaBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1]
    },
    "GradientBoosting": {
        "n_estimators": [100, 200, 500],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 10]
    },
    "XGBoost": {
        "n_estimators": [100, 200, 500],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 10],
        "subsample": [0.7, 0.8, 1.0],
        "colsample_bytree": [0.7, 0.8, 1.0]
    },
    "SVR": {
        "kernel": ["linear", "rbf", "poly"],
        "C": [0.1, 1, 10, 100],
        "gamma": ["scale", "auto"]
    },
    "DecisionTree": {
        "max_depth": [5, 10, 20, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
}

#### GridSearchCV for finding best hyperparameters

In [None]:
best_models = {}

for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid = GridSearchCV(model, param_grid[model_name], cv=3, n_jobs=-1, verbose=2)
    grid.fit(subset_X_train, subset_y_train)
    
    # Store the best model
    best_models[model_name] = grid.best_estimator_
print(f"Best params for {model_name}: {grid.best_params_}")

Running GridSearchCV for Linear Regression...
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END ................................fit_intercept=False; total time=   0.0s
[CV] END .................................fit_intercept=True; total time=   0.0s
[CV] END ................................fit_intercept=False; total time=   0.0s
[CV] END .................................fit_intercept=True; total time=   0.0s
[CV] END .................................fit_intercept=True; total time=   0.0s
[CV] END ................................fit_intercept=False; total time=   0.0s
Running GridSearchCV for KNeighbors...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.3s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=   2.3s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=   2.3s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; tota

In [None]:
best_models

## Best Model

In [None]:
model =
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = r2_score(y_test, y_pred)
print("Accuracy of the model: %.2f" %(score*100), "%")

### Plot Scatter Plot for y_pred and y_test

In [None]:
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values")
plt.show()

### Plot regplot for y_pred and y_test

In [None]:
sns.regplot(x = y_test, y = y_pred, ci = None, color = 'red')
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Best Fit Line")
plt.show()

### Difference between Actual and Predicted Values

In [None]:
pred_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Difference': y_test - y_pred})
pred_df.head(10)