In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the employee salary records from the CSV file, then preview the
# first five rows to sanity-check the columns.

data = pd.read_csv(filepath_or_buffer='employee_salary.csv')
data.head(n=5)

Unnamed: 0,Experience,Education,Age,Location,Role,Salary
0,7,Bachelor's,56,New York,Software Engineer,78856.82
1,20,High School,28,Austin,Software Engineer,71947.20346
2,29,PhD,42,Chicago,Software Engineer,111659.840176
3,15,High School,39,Chicago,Software Engineer,70258.290207
4,11,High School,24,New York,Software Engineer,77946.277878


In [3]:
# One-hot encode the categorical variables. drop_first=True drops the first
# level of each column so that level acts as the reference category in the
# regression. dtype=int keeps the dummy columns numeric on every pandas
# version: pandas >= 2.0 emits bool dummies by default, which describe()
# leaves out of its numeric summary and which can turn the design matrix into
# an object array that statsmodels refuses to fit.
data_encoded = pd.get_dummies(
    data,
    columns=["Education", "Location", "Role"],
    drop_first=True,
    dtype=int,
)

In [4]:
# List every column name produced by the one-hot encoding step, one per line.
print("📋 All columns in the encoded dataset:\n")
for column_name in list(data_encoded):
    print(f"- {column_name}")

📋 All columns in the encoded dataset:

- Experience
- Age
- Salary
- Education_High School
- Education_Master's
- Education_PhD
- Location_Chicago
- Location_New York
- Location_San Francisco
- Role_HR
- Role_Manager
- Role_Software Engineer


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the encoded dataset.
print('Descriptive statistics:')
summary_stats = data_encoded.describe()
summary_stats

Descriptive statistics:


Unnamed: 0,Experience,Age,Salary,Education_High School,Education_Master's,Education_PhD,Location_Chicago,Location_New York,Location_San Francisco,Role_HR,Role_Manager,Role_Software Engineer
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,15.036,40.872,92787.729972,0.262667,0.243333,0.236667,0.247333,0.250667,0.241333,0.256,0.252667,0.238
std,8.629273,10.949622,22744.13108,0.44023,0.429238,0.425178,0.431606,0.433541,0.428035,0.436567,0.434686,0.426001
min,1.0,22.0,23521.509071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,32.0,75828.432563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15.0,41.0,92792.24105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,23.0,50.0,108742.245265,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
max,29.0,59.0,152270.490382,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Pairwise Pearson correlations between all columns of the encoded dataset.
print('Correlation matrix:')
data_encoded_corr = data_encoded.corr(method='pearson')
data_encoded_corr

Correlation matrix:


Unnamed: 0,Experience,Age,Salary,Education_High School,Education_Master's,Education_PhD,Location_Chicago,Location_New York,Location_San Francisco,Role_HR,Role_Manager,Role_Software Engineer
Experience,1.0,-0.007915,0.763145,0.020163,-0.013893,0.027496,-0.038216,-0.043783,0.058874,-0.000146,0.002197,0.011278
Age,-0.007915,1.0,0.040568,0.038395,-0.025589,-0.014696,-0.005295,-0.004901,-0.018883,-0.019656,-0.008198,-0.001188
Salary,0.763145,0.040568,1.0,-0.156626,0.035554,0.212041,-0.091979,0.003344,0.190151,-0.20876,0.208625,-0.06863
Education_High School,0.020163,0.038395,-0.156626,1.0,-0.338469,-0.33234,0.008955,-0.009656,-0.014463,-0.020355,-0.012378,0.01504
Education_Master's,-0.013893,-0.025589,0.035554,-0.338469,1.0,-0.315762,0.006206,-0.034032,0.025102,0.026914,-0.022251,0.007771
Education_PhD,0.027496,-0.014696,0.212041,-0.33234,-0.315762,1.0,0.007986,0.036239,-0.050121,0.014807,0.015533,-0.016537
Location_Chicago,-0.038216,-0.005295,-0.091979,0.008955,0.006206,0.007986,1.0,-0.331551,-0.323313,-0.014077,-0.00974,0.042458
Location_New York,-0.043783,-0.004901,0.003344,-0.009656,-0.034032,0.036239,-0.331551,1.0,-0.326207,0.027295,-0.003549,-0.012599
Location_San Francisco,0.058874,-0.018883,0.190151,-0.014463,0.025102,-0.050121,-0.323313,-0.326207,1.0,-0.027389,0.02343,-0.022522
Role_HR,-0.000146,-0.019656,-0.20876,-0.020355,0.026914,0.014807,-0.014077,0.027295,-0.027389,1.0,-0.341075,-0.327827


In [12]:
# Separate the target (Salary) from the predictor columns.
y = data_encoded["Salary"]
X = data_encoded.drop(columns=["Salary"])

In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

In [19]:
# Add an intercept column to both design matrices; sm.OLS does not include an
# intercept unless the caller supplies one.
# has_constant="add" is passed for the test matrix because the default
# ("skip") returns the data unchanged when it already contains a constant
# column, which would leave the test matrix without the "const" column the
# fitted model expects at prediction time.
X_train_constant = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test, has_constant="add")

In [15]:
# Fit ordinary least squares on the training data (intercept column included)
# and print the full regression summary.
model = sm.OLS(endog=y_train, exog=X_train_constant).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.744
Model:                            OLS   Adj. R-squared:                  0.742
Method:                 Least Squares   F-statistic:                     314.1
Date:                Sat, 07 Jun 2025   Prob (F-statistic):               0.00
Time:                        19:13:37   Log-Likelihood:                -12926.
No. Observations:                1200   AIC:                         2.588e+04
Df Residuals:                    1188   BIC:                         2.594e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   5.44

In [21]:
# Generate salary predictions for the held-out test rows.
y_pred = model.predict(exog=X_test_const)

In [22]:
# Score the test-set predictions: R-squared and mean squared error.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [23]:
# Root mean squared error of the test-set predictions.
# `residuals` is defined here because no earlier cell creates it; without this
# line the cell raises NameError after Restart Kernel -> Run All.
# NOTE(review): the saved output may have been produced from a differently
# defined `residuals` (e.g. training residuals) -- confirm the value after
# re-running.
residuals = y_test - y_pred
rmse = np.sqrt(np.mean(residuals**2))
print(f"RMSE: {rmse}")

RMSE: 11534.216732753952


In [None]:
#1 A positive coefficient indicates a higher predicted salary and a negative coefficient a lower one, in each case relative to the baseline category that was dropped during one-hot encoding and holding the other variables constant. Comparing the variables, holding a PhD and being located in San Francisco are associated with the largest salary increases. Conversely, the role with the strongest negative effect on salary is Human Resources.

In [None]:
#2 For experience, the coefficient of +2005 means that each additional year of experience raises the predicted salary by about $2,000, holding the other variables constant. Location also has a large effect on salary, likely because places with a higher cost of living usually offer higher pay to match. Education is clearly important too: its p-value is 0.00, which is below the alpha level of 0.05, showing that a higher level of education is associated with a significantly higher salary.