In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# 1. EXTRACT
# Read the raw Iris measurements from the CSV file.
df = pd.read_csv('Iris.csv')

# 2. TRANSFORM
# Drop the ID column (usually not needed for analysis).
df = df.drop(columns='Id')

# Clean data - report the total number of missing cells across all columns.
print(f"Missing values: {df.isnull().sum().sum()}")

# Rename column for consistency (lower-case label).
df = df.rename(columns={'Species': 'species'})

# Derived features: length-to-width ratios for sepals and petals.
df['sepal_ratio'] = df['SepalLengthCm'] / df['SepalWidthCm']
df['petal_ratio'] = df['PetalLengthCm'] / df['PetalWidthCm']

# Clean up species names (remove the 'Iris-' text).
# regex=False makes this a literal replacement regardless of the pandas
# version's default for the `regex` argument.
df['species'] = df['species'].str.replace('Iris-', '', regex=False)

# Quick data exploration
print(f"Data shape: {df.shape}")
print("\nData types:")
print(df.dtypes)
print("\nData summary:")
print(df.describe())

# Scale numeric features (done once, on a copy, so `df` stays unscaled).
# Assign by column label: fit_transform returns a bare ndarray, and
# DataFrame.update aligns on labels, so handing that array to update()
# would leave the copy's values unscaled.
numeric_columns = df.select_dtypes(include='number').columns
df_scaled = df.copy()
df_scaled[numeric_columns] = StandardScaler().fit_transform(df[numeric_columns])

# 3. LOAD
# Save transformed (unscaled) data to CSV.
df.to_csv('iris_transformed.csv', index=False)

# Save scaled data to CSV.
df_scaled.to_csv('iris_scaled.csv', index=False)

print("ETL process completed successfully!")

Missing values: 0
Data shape: (150, 7)

Data types:
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
species           object
sepal_ratio      float64
petal_ratio      float64
dtype: object

Data summary:
       SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  sepal_ratio  \
count     150.000000    150.000000     150.000000    150.000000   150.000000   
mean        5.843333      3.054000       3.758667      1.198667     1.955144   
std         0.828066      0.433594       1.764420      0.763161     0.398697   
min         4.300000      2.000000       1.000000      0.100000     1.268293   
25%         5.100000      2.800000       1.600000      0.300000     1.551915   
50%         5.800000      3.000000       4.350000      1.300000     2.032292   
75%         6.400000      3.300000       5.100000      1.800000     2.224910   
max         7.900000      4.400000       6.900000      2.500000     2.961538   

       petal_ratio  
co

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,sepal_ratio,petal_ratio
count,150.0,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.955144,4.367166
std,0.828066,0.433594,1.76442,0.763161,0.398697,2.651508
min,4.3,2.0,1.0,0.1,1.268293,2.125
25%,5.1,2.8,1.6,0.3,1.551915,2.802381
50%,5.8,3.0,4.35,1.3,2.032292,3.3
75%,6.4,3.3,5.1,1.8,2.22491,4.666667
max,7.9,4.4,6.9,2.5,2.961538,15.0


In [None]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(150, 7)