### Typical iris dataset using a simple imputer and a standard scaler before splitting and training

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

data = load_iris(as_frame=True)
X, y = data.data, data.target

# Added some missing values
import numpy as np
X.iloc[::10, 0] = np.nan

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.iloc[::10, 0] = np.nan


**Task 1:** fit the steps into a pipeline.



In [2]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


**Task 2:** Use a different model but the same pipeline.

In [3]:
from sklearn.ensemble import RandomForestClassifier

pipe.set_params(model=RandomForestClassifier())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("Accuracy with Random Forest:", accuracy_score(y_test, y_pred))

Accuracy with Random Forest: 1.0


**Bonus Task:** Create a custom transformer to add Petal Area to the dataset.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class PetalAreaAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['petal_area'] = X['petal length (cm)'] * X['petal width (cm)']
        return X
    
pipe = Pipeline([
    ('petal_area', PetalAreaAdder()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=200))
])

pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)
print("Accuracy with custom feature:", accuracy_score(y_test, preds))

Accuracy with custom feature: 1.0


In [5]:
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
4,5.0,3.6,1.4,0.2
32,5.2,4.1,1.5,0.1
142,5.8,2.7,5.1,1.9
85,6.0,3.4,4.5,1.6
86,6.7,3.1,4.7,1.5


In [None]:
X_transformed = pipe.named_steps['petal_area'].transform(X_train)

print(X_transformed.head())

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
4                  5.0               3.6                1.4               0.2   
32                 5.2               4.1                1.5               0.1   
142                5.8               2.7                5.1               1.9   
85                 6.0               3.4                4.5               1.6   
86                 6.7               3.1                4.7               1.5   

     petal_area  
4          0.28  
32         0.15  
142        9.69  
85         7.20  
86         7.05  
