### This model predicts the salary of the job based on different criterias like age, gender, education level, job title, and years of experience

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('job_salary/Salary_Data.csv')

In [3]:
df.head(5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [4]:
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,6702.0,6701.0,6699.0
mean,33.620859,8.094687,115326.964771
std,7.614633,6.059003,52786.183911
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


## Preprocessing the data and handling Missing Values

In [5]:
# check for empty values
df.isnull().any()

Age                    True
Gender                 True
Education Level        True
Job Title              True
Years of Experience    True
Salary                 True
dtype: bool

In [6]:
# quickly get rid of empty rows
df = df.dropna()

In [7]:
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,6698.0,6698.0,6698.0
mean,33.623022,8.095178,115329.253061
std,7.615784,6.060291,52789.792507
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


## Feature Selection

### Correlation analysis

In [8]:
df.corr()['Salary']

Age                    0.728061
Years of Experience    0.808968
Salary                 1.000000
Name: Salary, dtype: float64

In [9]:
X = df[['Age', 'Years of Experience']]
y = df['Salary']

In [10]:
X

Unnamed: 0,Age,Years of Experience
0,32.0,5.0
1,28.0,3.0
2,45.0,15.0
3,36.0,7.0
4,52.0,20.0
...,...,...
6699,49.0,20.0
6700,32.0,3.0
6701,30.0,4.0
6702,46.0,14.0


## Splitting the data into Training Set and Testing Set

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [12]:
# verifying if the split was correct
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (5358, 2)
y_train shape: (5358,)
X_test shape: (1340, 2)
y_test shape: (1340,)


## Feature Scaling

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Model Building using Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [15]:
# training the model
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [16]:
y_pred = model.predict(X_test)
print(y_pred)

[119544.80211762  90793.77366524  67766.11157298 ... 106711.92535625
 123256.05074191  83684.26326398]


### Evaluating the model

In [17]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 957847950.5048134
Mean Absolute Error (MAE): 24924.559436448526
R-squared Score: 0.664084994028991


### Interpreting the results

In [18]:
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients)

               Feature   Coefficient
0                  Age -13081.380740
1  Years of Experience  54823.533073


### now let's see how much salary prediction do I get for myself

In [19]:
# replace 21 with your age and 3 with your Years of Experience to make prediction for your salary
my_X = np.array([21,3]).reshape(-1, 2) 
my_salary = model.predict(my_X)

In [20]:
print(f"The salary of a person who's {my_X[0,0]} years old and has {my_X[0,1]} years of work experience should be: ${round(my_salary[0])}")

The salary of a person who's 21 years old and has 3 years of work experience should be: $5247
