In [1]:
# task1_data_pipeline.py

import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
from scipy.stats import zscore

# Step 1: Load Dataset (Iris Dataset)
iris = datasets.load_iris()
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
data['target'] = iris.target
data['species'] = iris.target_names[iris.target]

print("Original Data Head:")
print(data.head())

# Step 2: Handle Missing Values (Simulated)
data.loc[5:10, 'sepal length (cm)'] = np.nan
print("\nMissing values before filling:")
print(data.isnull().sum())

data['sepal length (cm)'].fillna(data['sepal length (cm)'].mean(), inplace=True)

# Step 3: Outlier Detection (Z-score method)
z_scores = np.abs(zscore(data.select_dtypes(include=[np.number])))
outliers = (z_scores > 3)
print("\nNumber of outliers (Z-score > 3):")
print(outliers.sum())

# Step 4: Encode Categorical Data
label_encoder = LabelEncoder()
data['species_encoded'] = label_encoder.fit_transform(data['species'])

# Step 5: Feature Scaling
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[features])

scaled_df = pd.DataFrame(data_scaled, columns=[f"{col}_scaled" for col in features])
data = pd.concat([data.reset_index(drop=True), scaled_df], axis=1)

# Step 6: Save Cleaned Data
data.to_csv("cleaned_iris_data.csv", index=False)
print("\nCleaned data saved to 'cleaned_iris_data.csv'")

Original Data Head:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target species  
0       0  setosa  
1       0  setosa  
2       0  setosa  
3       0  setosa  
4       0  setosa  

Missing values before filling:
sepal length (cm)    6
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
species              0
dtype: int64

Number of outliers (Z-score > 3):
1

Cleaned data saved to 'cleaned_iris_data.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sepal length (cm)'].fillna(data['sepal length (cm)'].mean(), inplace=True)
