In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm

pd.set_option('display.max_columns', None)



In [2]:
df = pd.read_csv(r'C:\Users\abhir\Projects\DS_Salary_Project\eda_data.csv')

# choose relevant columns 
df_model = df[['Rating', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 
               'hourly', 'employer_provided', 'job_state', 'same_state', 'age', 'python_yn',
               'R_yn', 'spark', 'aws','excel', 'job_simple', 'seniority', 'desc_len', 
               'num_competitors', 'avg_salary']]

# get dummy data 
df_dum = pd.get_dummies(df_model)

In [3]:
X = df_dum.drop('avg_salary', axis =1)
y = df_dum.avg_salary.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.preprocessing import StandardScaler

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use X_train_scaled and X_test_scaled in your models instead of X_train and X_test


In [None]:
# multiple linear regression using OLS
# Prepare data for statsmodels
X_train_sm = sm.add_constant(X_train_scaled)  # Add constant for intercept
X_train_sm = X_train_sm.astype(float)   # Ensure float type
y_train = y_train.astype(float)          # Ensure float type

# Fit the model
model = sm.OLS(y_train, X_train_sm).fit()

# Print summary of the model
print(model.summary())


In [5]:
# Initialize a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'MAE', 'MSE', 'RMSE', 'R-squared'])

# Multiple Linear Regression
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
y_pred = lm.predict(X_test_scaled)

# Evaluate
results = pd.concat([results, pd.DataFrame([{
    'Model': 'Linear Regression',
    'MAE': mean_absolute_error(y_test, y_pred),
    'MSE': mean_squared_error(y_test, y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
    'R-squared': r2_score(y_test, y_pred)
}])], ignore_index=True)


  results = pd.concat([results, pd.DataFrame([{


In [7]:
# Lasso Regression
lasso = Lasso()
param_grid = {'alpha': [0.09, 0.1, 0.11, 0.12, 0.13]}
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

best_lasso = grid_search.best_estimator_
y_pred_lasso = best_lasso.predict(X_test_scaled)

results = pd.concat([results, pd.DataFrame([{
    'Model': 'Lasso',
    'MAE': mean_absolute_error(y_test, y_pred_lasso),
    'MSE': mean_squared_error(y_test, y_pred_lasso),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_lasso)),
    'R-squared': r2_score(y_test, y_pred_lasso)
}])], ignore_index=True)


In [8]:
# Ridge Regression
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)

results = pd.concat([results, pd.DataFrame([{
    'Model': 'Ridge',
    'MAE': mean_absolute_error(y_test, y_pred_ridge),
    'MSE': mean_squared_error(y_test, y_pred_ridge),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_ridge)),
    'R-squared': r2_score(y_test, y_pred_ridge)
}])], ignore_index=True)


In [9]:
# Random Forest Regression
rf = RandomForestRegressor(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train_scaled, y_train)

y_pred_rf = grid_search_rf.best_estimator_.predict(X_test_scaled)

results = pd.concat([results, pd.DataFrame([{
    'Model': 'Random Forest',
    'MAE': mean_absolute_error(y_test, y_pred_rf),
    'MSE': mean_squared_error(y_test, y_pred_rf),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    'R-squared': r2_score(y_test, y_pred_rf)
}])], ignore_index=True)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [10]:
# XGBoost Regression
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train_scaled, y_train)
y_pred_xgb = xgb.predict(X_test_scaled)

results = pd.concat([results, pd.DataFrame([{
    'Model': 'XGBoost',
    'MAE': mean_absolute_error(y_test, y_pred_xgb),
    'MSE': mean_squared_error(y_test, y_pred_xgb),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_xgb)),
    'R-squared': r2_score(y_test, y_pred_xgb)
}])], ignore_index=True)


In [11]:
# Print results
print(results)


               Model           MAE           MSE          RMSE     R-squared
0  Linear Regression  1.344454e+13  1.952222e+28  1.397219e+14 -1.194377e+25
1              Lasso  1.828166e+01  6.462555e+02  2.542156e+01  6.046185e-01
2              Ridge  1.914069e+01  7.120206e+02  2.668371e+01  5.643831e-01
3      Random Forest  1.102117e+01  3.420071e+02  1.849343e+01  7.907588e-01
4            XGBoost  9.008133e+00  3.449503e+02  1.857284e+01  7.889581e-01
