In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the iris feature matrix (X) and class labels (y) as plain arrays.
X, y = load_iris(return_X_y=True, as_frame=False)

In [3]:
# Hold out 20% of the samples for testing. A fixed random_state makes the
# shuffle -- and every scaled array derived from it -- repeatable across
# kernel restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply those same statistics to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Manual z-score of the training features, for comparison with StandardScaler.
# numpy is already imported in the first cell, so it is not re-imported here.
# np.std defaults to the population standard deviation (ddof=0).
train_mean = np.mean(X_train, axis=0)  # per-feature mean
train_std = np.std(X_train, axis=0)    # per-feature standard deviation
(X_train - train_mean) / train_std

array([[ 0.51801684,  0.60974056,  1.24855808,  1.66896979],
       [-0.58982115,  2.09289329, -1.2170216 , -1.09982802],
       [-1.45147292,  0.36254844, -1.44637785, -1.36352305],
       [-1.08219359,  0.85693269, -1.33169972, -1.36352305],
       [-1.57456603,  1.35131693, -1.61839503, -1.36352305],
       [ 1.2565755 ,  0.36254844,  1.07654089,  1.40527476],
       [ 0.39492373,  0.85693269,  0.90452371,  1.40527476],
       [-1.57456603,  0.11535632, -1.33169972, -1.36352305],
       [ 1.37966861,  0.36254844,  0.50315027,  0.21864713],
       [ 0.14873751,  0.85693269,  0.38847215,  0.48234216],
       [ 0.88729617, -0.1318358 ,  0.33113308,  0.21864713],
       [-1.08219359,  0.85693269, -1.27436066, -1.09982802],
       [ 2.24132038, -0.62622004,  1.64993152,  1.00973222],
       [ 0.51801684, -0.62622004,  0.73250652,  0.35049464],
       [ 0.27183062, -0.62622004,  0.50315027, -0.0450479 ],
       [ 1.2565755 ,  0.11535632,  0.90452371,  1.14157973],
       [-0.22054182, -0.

In [6]:
X_train_scaled
# Notes on StandardScaler:
# 1. It standardizes each numeric feature by removing its mean and scaling it
#    to unit variance (mean = 0, std = 1).
# 2. Formula: z = (x - mu) / sigma, where mu is the mean and sigma is the
#    standard deviation of that feature.
# 3. Putting features on a comparable scale keeps large-valued features from
#    dominating, which improves model stability and training performance.
# 4. Always fit the scaler on the training data, then apply the same fitted
#    transformation to the test data.
# 5. Not required for tree-based models (e.g. Random Forest, Decision Tree),
#    since they are scale-invariant.


array([[ 0.51801684,  0.60974056,  1.24855808,  1.66896979],
       [-0.58982115,  2.09289329, -1.2170216 , -1.09982802],
       [-1.45147292,  0.36254844, -1.44637785, -1.36352305],
       [-1.08219359,  0.85693269, -1.33169972, -1.36352305],
       [-1.57456603,  1.35131693, -1.61839503, -1.36352305],
       [ 1.2565755 ,  0.36254844,  1.07654089,  1.40527476],
       [ 0.39492373,  0.85693269,  0.90452371,  1.40527476],
       [-1.57456603,  0.11535632, -1.33169972, -1.36352305],
       [ 1.37966861,  0.36254844,  0.50315027,  0.21864713],
       [ 0.14873751,  0.85693269,  0.38847215,  0.48234216],
       [ 0.88729617, -0.1318358 ,  0.33113308,  0.21864713],
       [-1.08219359,  0.85693269, -1.27436066, -1.09982802],
       [ 2.24132038, -0.62622004,  1.64993152,  1.00973222],
       [ 0.51801684, -0.62622004,  0.73250652,  0.35049464],
       [ 0.27183062, -0.62622004,  0.50315027, -0.0450479 ],
       [ 1.2565755 ,  0.11535632,  0.90452371,  1.14157973],
       [-0.22054182, -0.

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Notes on MinMaxScaler:
# 1. It rescales each feature to a fixed range, [0, 1] by default.
# 2. Formula: X_scaled = (X - X_min) / (X_max - X_min), computed per feature.
# 3. It is a linear rescaling, so the shape of each feature's distribution
#    is preserved.
# 4. Useful for algorithms sensitive to feature magnitude (e.g. KNN,
#    neural networks).
# 5. Fit on the training data only, then transform both train and test sets.
#
# NOTE: this rebinds `scaler`, `X_train_scaled` and `X_test_scaled`, replacing
# the StandardScaler results computed earlier in the notebook.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Display the training features as last assigned by the MinMaxScaler cell.
X_train_scaled

array([[0.58823529, 0.54166667, 0.84745763, 1.        ],
       [0.32352941, 0.79166667, 0.11864407, 0.125     ],
       [0.11764706, 0.5       , 0.05084746, 0.04166667],
       [0.20588235, 0.58333333, 0.08474576, 0.04166667],
       [0.08823529, 0.66666667, 0.        , 0.04166667],
       [0.76470588, 0.5       , 0.79661017, 0.91666667],
       [0.55882353, 0.58333333, 0.74576271, 0.91666667],
       [0.08823529, 0.45833333, 0.08474576, 0.04166667],
       [0.79411765, 0.5       , 0.62711864, 0.54166667],
       [0.5       , 0.58333333, 0.59322034, 0.625     ],
       [0.67647059, 0.41666667, 0.57627119, 0.54166667],
       [0.20588235, 0.58333333, 0.10169492, 0.125     ],
       [1.        , 0.33333333, 0.96610169, 0.79166667],
       [0.58823529, 0.33333333, 0.69491525, 0.58333333],
       [0.52941176, 0.33333333, 0.62711864, 0.45833333],
       [0.76470588, 0.45833333, 0.74576271, 0.83333333],
       [0.41176471, 0.41666667, 0.54237288, 0.45833333],
       [0.20588235, 0.66666667,

In [9]:
# Reproduce min-max scaling by hand with numpy: subtract each feature's
# training minimum and divide by its training range (max - min).
# The result matches the MinMaxScaler output stored in X_train_scaled.
X_min = X_train.min(axis=0)
X_max = X_train.max(axis=0)

np.divide(X_train - X_min, X_max - X_min)

array([[0.58823529, 0.54166667, 0.84745763, 1.        ],
       [0.32352941, 0.79166667, 0.11864407, 0.125     ],
       [0.11764706, 0.5       , 0.05084746, 0.04166667],
       [0.20588235, 0.58333333, 0.08474576, 0.04166667],
       [0.08823529, 0.66666667, 0.        , 0.04166667],
       [0.76470588, 0.5       , 0.79661017, 0.91666667],
       [0.55882353, 0.58333333, 0.74576271, 0.91666667],
       [0.08823529, 0.45833333, 0.08474576, 0.04166667],
       [0.79411765, 0.5       , 0.62711864, 0.54166667],
       [0.5       , 0.58333333, 0.59322034, 0.625     ],
       [0.67647059, 0.41666667, 0.57627119, 0.54166667],
       [0.20588235, 0.58333333, 0.10169492, 0.125     ],
       [1.        , 0.33333333, 0.96610169, 0.79166667],
       [0.58823529, 0.33333333, 0.69491525, 0.58333333],
       [0.52941176, 0.33333333, 0.62711864, 0.45833333],
       [0.76470588, 0.45833333, 0.74576271, 0.83333333],
       [0.41176471, 0.41666667, 0.54237288, 0.45833333],
       [0.20588235, 0.66666667,

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import fetch_openml

# OpenML hosts more than one active version of 'adult'. Without an explicit
# version, fetch_openml warns and falls back to the oldest active one
# (version 1). Pinning it keeps the same data, silences the warning, and
# protects the notebook from a different version being picked up later.
data = fetch_openml('adult', version=1, as_frame=True).frame


- version 1, status: active
  url: https://www.openml.org/search?type=data&id=179
- version 2, status: active
  url: https://www.openml.org/search?type=data&id=1590



In [13]:
# Display the adult dataset frame loaded by fetch_openml.
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,2,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,2,United-States,<=50K
48838,4,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,2,United-States,<=50K
48839,2,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,3,United-States,<=50K
48840,2,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,2,0,2,United-States,<=50K


In [14]:
# Row count per occupation category, most frequent first.
# value_counts() skips missing values by default, so rows without an
# occupation are not listed here.
data["occupation"].value_counts()


occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [18]:
import pandas as pd

# Categorical columns to expand into one indicator column per category.
categorical_cols = ['occupation', 'race']

# sparse_output=False returns a dense array that can be wrapped in a DataFrame
# directly; handle_unknown='ignore' encodes categories not seen during fit as
# all zeros instead of raising at transform time.
encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

encoded_values = encoder.fit_transform(data[categorical_cols])
# Output column names in the form "<column>_<category>".
new_cols = encoder.get_feature_names_out(categorical_cols)


In [19]:
# Display the generated one-hot column names returned by get_feature_names_out.
new_cols

array(['occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Priv-house-serv', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech-support', 'occupation_Transport-moving',
       'occupation_nan', 'race_Amer-Indian-Eskimo',
       'race_Asian-Pac-Islander', 'race_Black', 'race_Other',
       'race_White'], dtype=object)

In [20]:
# Wrap the encoded array in a DataFrame that shares data's index, so the
# indicator columns line up row-for-row with the original frame.
df_encoded = pd.DataFrame(
    data=encoded_values,
    index=data.index,
    columns=new_cols,
)

In [22]:
# Replace the two raw categorical columns with their one-hot indicator
# columns: drop the originals, then append df_encoded side by side.
data_final = pd.concat(
    [
        data.drop(['occupation', 'race'], axis=1),
        df_encoded,
    ],
    axis='columns',
)


In [23]:
# Display the frame with 'occupation' and 'race' replaced by one-hot columns.
data_final

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,relationship,sex,capitalgain,capitalloss,...,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,2,State-gov,77516,Bachelors,13,Never-married,Not-in-family,Male,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Husband,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,Private,215646,HS-grad,9,Divorced,Not-in-family,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,Private,234721,11th,7,Married-civ-spouse,Husband,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Wife,Female,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,2,Private,215419,Bachelors,13,Divorced,Not-in-family,Female,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48838,4,,321403,HS-grad,9,Widowed,Other-relative,Male,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
48839,2,Private,374983,Bachelors,13,Married-civ-spouse,Husband,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48840,2,Private,83891,Bachelors,13,Divorced,Own-child,Male,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
