Step 1:- Importing required libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

Step 2:- Loading the dataset

In [2]:
df = pd.read_csv("C:/Users/Aniket Bhoge/Downloads/Compressed/archive/Iris.csv")
print(df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


Step 3:- Separating the features and target

In [4]:
X = df.drop(columns=["Species"])  # Features
y = df["Species"]                 # Target

Step 4:- Identifying numerical and categorical columns

In [5]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

print("Numeric columns:", numeric_cols.tolist())
print("Categorical columns:", categorical_cols.tolist())

Numeric columns: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
Categorical columns: []


Step 5: Creating preprocessing pipelines

In [7]:
from sklearn.impute import SimpleImputer  

numeric_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

Step 6:- Fitting and transforming the data

In [8]:
X_processed = preprocessor.fit_transform(X)
# Converting to df
X_processed_df = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed)
X_processed_df.head()

Unnamed: 0,0,1,2,3,4
0,-1.720542,-0.900681,1.032057,-1.341272,-1.312977
1,-1.697448,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.674353,-1.385353,0.337848,-1.398138,-1.312977
3,-1.651258,-1.506521,0.106445,-1.284407,-1.312977
4,-1.628164,-1.021849,1.26346,-1.341272,-1.312977


Step 7:- Save processed data and pipeline

In [9]:
X_processed_df.to_csv("iris_processed_features.csv", index=False)
y.to_csv("iris_target.csv", index=False)

# Saving the preprocessor
joblib.dump(preprocessor, "iris_preprocessor.pkl")

print("✅ ETL pipeline completed and files saved!")

✅ ETL pipeline completed and files saved!
