In [1]:
# StandardScaler

In [3]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Toy dataset: one row per person, columns are [age, salary]
data = np.array([
    [25, 50000],
    [45, 120000],
    [65, 80000],
])

# Build the scaler and let it learn each column's mean and standard
# deviation; fit() hands back the fitted scaler, so the calls chain.
scaler = StandardScaler().fit(data)

# Rescale every column using the statistics learned above
scaled_data = scaler.transform(data)

# Show the raw and standardized values one after the other
print(f"Original Data:\n {data}")
print(f"\nScaled Data:\n {scaled_data}")

Original Data:
 [[    25  50000]
 [    45 120000]
 [    65  80000]]

Scaled Data:
 [[-1.22474487 -1.16247639]
 [ 0.          1.27872403]
 [ 1.22474487 -0.11624764]]


In [4]:
# Preprocessing: Encoding Categorical Features

In [5]:
# OneHotEncoder

In [13]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample data: a single categorical feature, [Color]
data = np.array([['Red'], ['Green'], ['Blue'], ['Green']])

# 1. Initialize the encoder
# sparse_output=False makes the encoder return a dense NumPy array, which
# prints readably, instead of a SciPy sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# 2. Fit and transform the data
encoded_data = encoder.fit_transform(data)

# Take the column order from the fitted encoder rather than hard-coding it,
# so the printed label always matches the actual output columns
# (for this data: Blue, Green, Red).
column_labels = ", ".join(encoder.categories_[0])

print("Original Data:\n", data)
print(f"\nEncoded Data (Columns: {column_labels}):\n", encoded_data)

Original Data:
 [['Red']
 ['Green']
 ['Blue']
 ['Green']]

Encoded Data (Columns: Blue, Green, Red):
 [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [8]:
# Assembling a Preprocessing Pipeline

In [9]:
# Project: Building a Full Preprocessing Pipeline

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # To handle missing values
from sklearn.linear_model import LogisticRegression

# In a real project the Titanic data would be read from 'titanic.csv' into a
# pandas DataFrame `df`; we would keep a few columns and drop rows that lack
# the target:
# X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
# y = df['Survived']

# --- Stand-in for the real data: a small dummy DataFrame ---
data = {
    'Pclass': [3, 1, 3],
    'Sex': ['male', 'female', 'female'],
    'Age': [22.0, 38.0, 26.0],
    'Fare': [7.25, 71.28, 7.92],
    'Embarked': ['S', 'C', 'S'],
}
X = pd.DataFrame(data)

# 1. Group the columns by the kind of preprocessing they need
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# 2. One small pipeline per column type
# Numeric columns: fill missing values with the median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent one, then
# one-hot encode
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# 3. Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 4. Chain preprocessing and the estimator into a single pipeline:
# the 'preprocessor' step runs first, then the 'classifier' is trained
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

print(model_pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Pclass', 'Sex'