In [1]:
import pandas as pd
import numpy as np

# Load the already-cleaned regression dataset from the working directory
# and preview its first five rows.
df = pd.read_csv("cleaned_regression_data.csv")
df.head(n=5)


Unnamed: 0,Age,Experience_Years,Gender_Male,City_Delhi,City_New York,Education_High School,Education_Masters,Salary
0,22.0,5.0,1,0,1,0,1,50000.0
1,30.0,4.0,0,0,1,1,0,60000.0
2,25.0,7.0,0,1,0,1,0,50000.0
3,28.0,4.0,0,0,1,0,0,60000.0
4,40.0,3.0,1,0,1,0,1,50000.0


In [2]:
# Quick structural overview of the dataset.
# Only the final expression of a cell is echoed automatically, so the shape
# has to be printed explicitly or it is silently discarded.
print(df.shape)
# Column dtypes and non-null counts (info() prints directly to stdout).
df.info()
# Summary statistics, transposed so each feature is one row.
df.describe().T


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    100 non-null    float64
 1   Experience_Years       100 non-null    float64
 2   Gender_Male            100 non-null    int64  
 3   City_Delhi             100 non-null    int64  
 4   City_New York          100 non-null    int64  
 5   Education_High School  100 non-null    int64  
 6   Education_Masters      100 non-null    int64  
 7   Salary                 100 non-null    float64
dtypes: float64(3), int64(5)
memory usage: 6.4 KB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,100.0,28.98,5.493761,22.0,25.0,28.0,35.0,40.0
Experience_Years,100.0,3.85,1.799972,1.0,2.0,4.0,5.0,7.0
Gender_Male,100.0,0.55,0.5,0.0,0.0,1.0,1.0,1.0
City_Delhi,100.0,0.18,0.386123,0.0,0.0,0.0,0.0,1.0
City_New York,100.0,0.6,0.492366,0.0,0.0,1.0,1.0,1.0
Education_High School,100.0,0.2,0.402015,0.0,0.0,0.0,0.0,1.0
Education_Masters,100.0,0.24,0.429235,0.0,0.0,0.0,0.0,1.0
Salary,100.0,49700.0,14595.903673,30000.0,37500.0,50000.0,60000.0,70000.0


In [3]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()


Age                      0
Experience_Years         0
Gender_Male              0
City_Delhi               0
City_New York            0
Education_High School    0
Education_Masters        0
Salary                   0
dtype: int64

In [4]:
# Show the column names as a plain Python list.
list(df.columns)


['Age',
 'Experience_Years',
 'Gender_Male',
 'City_Delhi',
 'City_New York',
 'Education_High School',
 'Education_Masters',
 'Salary']

In [5]:
# Separate the regression target from the predictor columns.
y = df['Salary']
X = df.drop(columns='Salary')


In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
from sklearn.linear_model import LinearRegression
# Fit the linear baseline on the training split (fit() returns the estimator
# itself, so construction and fitting can be chained), then predict the test rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


In [10]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 100-tree random forest with a fixed seed on the training split
# (fit() returns the estimator itself), then predict the test rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def regression_metrics(y_true, y_pred):
    """Compute standard regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``, each mapped to a
        built-in ``float``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # np.sqrt returns an np.float64 scalar, which reprs as "np.float64(...)"
    # under NumPy 2 and made the printed dict inconsistent. Casting every value
    # to a built-in float keeps the result uniform for printing and serialising.
    return {
        'MAE': float(mae),
        'MSE': float(mse),
        'RMSE': float(np.sqrt(mse)),
        'R2': float(r2),
    }

# Score both models on the same hold-out split so the numbers are comparable.
metrics_lr = regression_metrics(y_true=y_test, y_pred=y_pred_lr)
metrics_rf = regression_metrics(y_true=y_test, y_pred=y_pred_rf)

# Report each model's metrics dictionary on its own line.
print("Linear Regression:", metrics_lr)
print("Random Forest:", metrics_rf)


Linear Regression: {'MAE': 14976.064951361017, 'MSE': 274525069.54418176, 'RMSE': np.float64(16568.79807180297), 'R2': -0.07762539565920235}
Random Forest: {'MAE': 15500.708333333334, 'MSE': 299604778.7847222, 'RMSE': np.float64(17309.095261876693), 'R2': -0.1760737145622071}


In [12]:
# Build a side-by-side view of actual vs. predicted salaries for the test rows.
# assign() returns a new frame, so X_test itself is left untouched.
comp = X_test.assign(
    Actual=y_test.values,
    Pred_LR=y_pred_lr,
    Pred_RF=y_pred_rf,
)
comp.loc[:, ['Actual', 'Pred_LR', 'Pred_RF']].head(10)


Unnamed: 0,Actual,Pred_LR,Pred_RF
83,50000.0,40601.406625,47697.5
53,60000.0,42035.252354,45100.0
70,30000.0,51125.54871,54260.0
45,70000.0,53371.65831,45600.0
44,40000.0,51816.262029,50791.666667
39,70000.0,47901.85211,60450.0
22,60000.0,43475.296315,51840.0
80,30000.0,52354.235981,54468.333333
10,30000.0,53929.138648,46325.0
0,50000.0,52330.522324,57235.0


In [13]:
# Pair each of the forest's importance scores with its feature name and
# rank them from most to least important.
importances = rf.feature_importances_
feat_imp = (
    pd.Series(data=importances, index=X.columns)
    .sort_values(ascending=False)
)
feat_imp


Age                      0.297256
Experience_Years         0.284334
City_New York            0.105196
Gender_Male              0.095955
Education_Masters        0.089010
Education_High School    0.067773
City_Delhi               0.060475
dtype: float64

In [14]:
import joblib
# Persist the fitted random forest to the working directory; joblib.dump
# returns the list of file names it wrote.
# NOTE(review): on the hold-out split this forest scored R2 of about -0.18,
# i.e. worse than the linear model (about -0.08) and below zero - confirm that
# this is really the artifact that should be saved before relying on it.
joblib.dump(rf, "rf_salary_model.joblib")


['rf_salary_model.joblib']