In [1]:
# load the necessary libraries
from pathlib import Path

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Extract phase: pull the Iris dataset bundled with scikit-learn
iris = load_iris()
# One column per measured feature, plus a 'species' column that holds the
# numeric class label (iris.target) for now.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target
)

In [3]:
# Transform phase: Prepare the data for machine learning
# Standardize the feature columns with StandardScaler.
#
# The scaler is fitted ONLY on the rows that end up in the training set, so
# that the mean/variance of the held-out test rows never influence the
# transformation (fitting on the full dataset before splitting is data
# leakage). The row selection below must mirror the train/test split performed
# later in this notebook (test_size=0.2, random_state=42): with the same
# number of samples and the same parameters the shuffle picks the same rows.
# NOTE(review): if the split parameters change, change them here as well.
scaler = StandardScaler()
row_positions = list(range(len(iris.data)))
train_rows, _ = train_test_split(row_positions, test_size=0.2, random_state=42)
scaler.fit(iris.data[train_rows])
# Transform from the raw measurements (iris_df was built from iris.data, so
# the row order matches); reading the raw array keeps this cell idempotent
# when it is re-run.
iris_df[iris.feature_names] = scaler.transform(iris.data)

In [4]:
# Translate the numeric class labels into readable species names
species_mapping = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
species_names = iris_df['species'].map(species_mapping)
# One-hot encode the species: the 'species' column is replaced by indicator
# columns named 'type_<species name>'.
iris_df = pd.get_dummies(
    iris_df.assign(species=species_names),
    columns=['species'],
    prefix='type',
)

In [5]:
# Hold out 20% of the rows for testing and keep the remaining 80% for
# training; the fixed random_state makes the shuffle reproducible.
train_df, test_df = train_test_split(
    iris_df,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Load phase: Save the transformed data to CSV files
transformed_train_path = r'data/transformed_iris_train.csv'
transformed_test_path = r'data/transformed_iris_test.csv'
# to_csv does not create missing directories, so make sure the target
# folders exist first (otherwise a fresh checkout fails with OSError).
Path(transformed_train_path).parent.mkdir(parents=True, exist_ok=True)
Path(transformed_test_path).parent.mkdir(parents=True, exist_ok=True)
# Save training data without the index
train_df.to_csv(transformed_train_path, index=False)
# Save testing data without the index
test_df.to_csv(transformed_test_path, index=False)

In [9]:
# Display where the train and test CSV files were written
transformed_train_path, transformed_test_path

('data/transformed_iris_train.csv', 'data/transformed_iris_test.csv')