# Linear Regression (Baseline Model)
In this notebook we use linear regression as a baseline model to predict salaries.

### Importing the libraries and Dataset preparation

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

path = 'Salary_Data.csv'

# Load the data and drop every row that has a missing value in ANY column.
# This already removes rows without a 'Salary', so a separate
# dropna(subset=['Salary']) pass would be a no-op.
df = pd.read_csv(path).dropna()

# Normalise inconsistent spellings of the same education level.
df['Education Level'] = df['Education Level'].replace({
    "Bachelor's Degree": "Bachelor's",
    "phD": "PhD",
    "Master's Degree": "Master's",
})

# Keep only job titles with MORE than 100 rows
# (titles with 100 or fewer data points are removed).
df = df.groupby('Job Title').filter(lambda x: len(x) > 100)

# Split into feature matrix and target vector.
X = df.drop('Salary', axis=1)
y = df['Salary']

# Row count per remaining job title (displayed as the cell output).
df['Job Title'].value_counts()

Job Title
Software Engineer            518
Data Scientist               453
Software Engineer Manager    376
Data Analyst                 363
Senior Project Engineer      318
Product Manager              313
Full Stack Engineer          308
Marketing Manager            255
Senior Software Engineer     244
Back end Developer           244
Front end Developer          241
Marketing Coordinator        158
Junior Sales Associate       142
Financial Manager            134
Marketing Analyst            132
Software Developer           125
Operations Manager           114
Human Resources Manager      104
Name: count, dtype: int64

### Preprocessing and model definition

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns grouped by the kind of preprocessing they receive.
numeric_features = ['Age', 'Years of Experience']
categorical_features = ['Gender', 'Education Level', 'Job Title']

# Numeric columns: fill missing values with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode. Categories not seen during fit are ignored at predict time
# instead of raising an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full model: preprocessing followed by ordinary least-squares regression.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [19]:
# Display the training feature frame as the cell output.
X_train

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience
6208,30.0,Female,Bachelor's,Marketing Coordinator,5.0
3310,24.0,Female,Bachelor's,Software Developer,2.0
2232,43.0,Female,PhD,Senior Project Engineer,14.0
1833,30.0,Male,Bachelor's,Back end Developer,5.0
2735,34.0,Male,Master's,Data Analyst,8.0
...,...,...,...,...,...
6221,25.0,Male,Bachelor's,Product Manager,1.0
819,31.0,Male,Bachelor's,Data Analyst,9.0
3845,30.0,Male,Bachelor's,Operations Manager,5.0
4952,30.0,Male,Master's,Financial Manager,7.0


### Model training and evaluation

In [21]:
# Fit the whole pipeline (preprocessing + linear regression) on the training split.
model.fit(X_train, y_train)

# Collect the names of the transformed columns: the numeric features first,
# then the one-hot encoded categorical columns produced by the fitted encoder.
# Not used further in this cell.
feature_names = (
    model.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_features)
    .tolist()
)
feature_names = numeric_features + feature_names

# Make predictions on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # the square root of the MSE is the RMSE, not the MAE
r2 = r2_score(y_test, y_pred)

print(f"Mean squared error: {mse:.2f}")
print(f"Root mean squared error: {rmse:.2f}")
print(f"R-squared score: {r2:.2f}")

# Example: predict salaries for three otherwise identical profiles that
# differ only in education level.
new_data = pd.DataFrame({
    'Age': [27, 27, 27],
    'Gender': ['Male', 'Male', 'Male'],
    'Education Level': ["High School", "Bachelor's", "Master's"],
    'Job Title': ['Software Engineer', 'Software Engineer', 'Software Engineer'],
    'Years of Experience': [7, 7, 7]
})

predicted_salary = model.predict(new_data)
for i, salary in enumerate(predicted_salary):
    print(f"Person {i+1} predicted salary: ${salary:.2f}")

Mean squared error: 646118680.42
Root mean squared error: 25418.86
R-squared score: 0.75
Person 1 predicted salary: $89777.25
Person 2 predicted salary: $121727.44
Person 3 predicted salary: $125961.04
