### Import required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as SklearnLR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults
sns.set_style('whitegrid')  # white background with grid lines on every plot
plt.rcParams.update({'figure.figsize': (10, 6)})  # new figures default to 10 x 6 inches

# Confirm the environment is ready and record the library versions in use
print(
    "Libraries successfully loaded!",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

Libraries successfully loaded!
NumPy version: 2.4.2
Pandas version: 3.0.0


### Load dataset

In [None]:
# Read the raw salary records from disk and summarise what was loaded
print("=" * 60, "LOADING DATA...", "=" * 60, sep="\n")

# NOTE(review): the file yields an 'Unnamed: 0' column, which looks like a
# saved row index -- confirm whether it should be read with index_col=0.
df_raw = pd.read_csv('../data/ds_salaries.csv')

print(
    f"Loaded {len(df_raw)} records",
    f"Columns: {list(df_raw.columns)}",
    "\nFirst 5 rows:",
    sep="\n",
)

# Last expression of the cell: rendered as a rich table preview
df_raw.head()


LOADING DATA...
Loaded 607 records
Columns: ['Unnamed: 0', 'work_year', 'experience_level', 'employment_type', 'job_title', 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence', 'remote_ratio', 'company_location', 'company_size']

First 5 rows:


Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


### Clean raw data

In [9]:
# Build the modelling table from the raw records: full-time employees only,
# seniority converted to a numeric experience estimate, salary in USD.

# Seniority code -> approximate years of experience
# (EN = entry-level, MI = mid-level, SE = senior, EX = executive)
experience_mapping = dict(EN=1, MI=4, SE=8, EX=12)

# Working frame: the relevant columns plus the numeric experience estimate,
# restricted to full-time ('FT') contracts.
df = (
    df_raw[['work_year', 'experience_level', 'employment_type',
            'salary_in_usd', 'remote_ratio', 'company_size']]
    .assign(experience_years=lambda frame: frame['experience_level'].map(experience_mapping))
    .loc[lambda frame: frame['employment_type'] == 'FT']
    .copy()
)

# Univariate regression table: one feature (experience) and one target (salary).
# Level codes absent from the mapping turn into NaN and are removed by dropna(),
# together with any rows that lack a salary value.
df_model = (
    df[['experience_years', 'salary_in_usd']]
    .rename(columns={'experience_years': 'years_experience',
                     'salary_in_usd': 'salary'})
    .dropna()
    # Keep salaries within 20,000 - 500,000 USD (both bounds inclusive);
    # values outside that window are treated as unrealistic outliers.
    .loc[lambda frame: frame['salary'].between(20000, 500000)]
    .reset_index(drop=True)
)

# Summary of the cleaned table
print(
    f"\nCleaned dataset: {len(df_model)} records",
    f"Features: {list(df_model.columns)}",
    f"Salary range: ${df_model['salary'].min():,.0f} - ${df_model['salary'].max():,.0f}",
    f"Experience range: {df_model['years_experience'].min():.0f} - {df_model['years_experience'].max():.0f} years",
    "\nFirst 10 rows:",
    sep="\n",
)

# Last expression of the cell: rendered as a rich table preview
df_model.head(10)


Cleaned dataset: 563 records
Features: ['years_experience', 'salary']
Salary range: $20,000 - $450,000
Experience range: 1 - 12 years

First 10 rows:


Unnamed: 0,years_experience,salary
0,4,79833
1,8,260000
2,8,109024
3,4,20000
4,8,150000
5,1,72000
6,8,190000
7,4,35735
8,4,135000
9,8,125000
