# Standardize or Normalize Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw salary data; the CSV is expected in the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Dataset salary 2024.csv')

In [3]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024,SE,FT,AI Engineer,202730,USD,202730,US,0,US,M
1,2024,SE,FT,AI Engineer,92118,USD,92118,US,0,US,M
2,2024,SE,FT,Data Engineer,130500,USD,130500,US,0,US,M
3,2024,SE,FT,Data Engineer,96000,USD,96000,US,0,US,M
4,2024,SE,FT,Machine Learning Engineer,190000,USD,190000,US,0,US,M


In [4]:
# Collect the labels of every numeric column; only these columns are rescaled
# below, while the remaining columns are carried over unchanged.
numerical_columns = dataset.select_dtypes(include="number").columns

# Standardize Data (Z-score normalization)

In [7]:
# Create the z-score scaler with its default behaviour spelled out:
# centre each column on its mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [8]:
# Work on a copy so the raw `dataset` is left untouched, then overwrite only
# the numeric columns with their standardized values.
df_standardized = dataset.copy()
df_standardized[numerical_columns] = (
    scaler
    .fit(dataset[numerical_columns])        # learn the per-column statistics
    .transform(dataset[numerical_columns])  # apply them to the same columns
)


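### `fit` method

`fit` only learns the scaling parameters from the data (for `StandardScaler`, the mean and standard deviation of each column); it does not change the data.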

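### `transform` method

`transform` applies previously learned scaling parameters to the data and returns the rescaled values; `fit_transform` performs both steps in a single call.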

In [9]:
# Print a caption, then let the notebook render the first five standardized
# rows as the cell's output.
print("\nStandardized dataset:")
df_standardized.head(n=5)


Standardized dataset:


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,1.083524,SE,FT,AI Engineer,0.114649,USD,0.774317,US,-0.692066,US,M
1,1.083524,SE,FT,AI Engineer,-0.210494,USD,-0.840381,US,-0.692066,US,M
2,1.083524,SE,FT,Data Engineer,-0.09767,USD,-0.280086,US,-0.692066,US,M
3,1.083524,SE,FT,Data Engineer,-0.199083,USD,-0.783712,US,-0.692066,US,M
4,1.083524,SE,FT,Machine Learning Engineer,0.077229,USD,0.588487,US,-0.692066,US,M


# Normalize Data (Min-Max scaling)

In [12]:
# Create the min-max scaler with its default target range written out explicitly.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [13]:
# Copy the raw frame first, then replace only the numeric columns with their
# min-max scaled values; the remaining columns are carried over unchanged.
df_normalized = dataset.copy()
df_normalized[numerical_columns] = min_max_scaler.fit_transform(
    X=dataset.loc[:, numerical_columns]
)

In [14]:
# Print a caption, then let the notebook render the first five normalized
# rows as the cell's output.
print("\nNormalized dataset:")
df_normalized.head(n=5)


Normalized dataset:


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,1.0,SE,FT,AI Engineer,0.006211,USD,0.239146,US,0.0,US,M
1,1.0,SE,FT,AI Engineer,0.002571,USD,0.098239,US,0.0,US,M
2,1.0,SE,FT,Data Engineer,0.003834,USD,0.147134,US,0.0,US,M
3,1.0,SE,FT,Data Engineer,0.002699,USD,0.103185,US,0.0,US,M
4,1.0,SE,FT,Machine Learning Engineer,0.005792,USD,0.22293,US,0.0,US,M


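# Z-Score
Z-score standardization centers each numerical column around 0 and rescales it to a standard deviation of 1: $z = \frac{x - \mu}{\sigma}$, where $\mu$ and $\sigma$ are the column's mean and standard deviation.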

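# Min-Max
Min-max normalization rescales each numerical column to a specified range, usually [0, 1]: $x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$, where $x_{\min}$ and $x_{\max}$ are the column's minimum and maximum.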