In [7]:
import pandas as pd
import numpy as np

# Load the Iris dataset again
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_df = pd.read_csv(url, header=None, names=column_names)

# --- Introduce some missing values for demonstration ---
# Randomly select some indices to make values NaN
np.random.seed(42) # for reproducibility
missing_indices = np.random.choice(iris_df.index, size=15, replace=False)
for index in missing_indices:
    random_column = np.random.choice(iris_df.columns[:-1]) # Exclude the 'class' column
    iris_df.loc[index, random_column] = np.nan

# --- Data Cleaning using Pandas ---

# 1. Check for missing values
print("Number of missing values before imputation:")
print(iris_df.isnull().sum())
print("\n")

# 2. Impute missing values with the median of each column
for column in iris_df.columns[:-1]: # Iterate through numerical columns
    median_value = iris_df[column].median()
    iris_df[column].fillna(median_value, inplace=True)

# 3. Verify that missing values have been handled
print("Number of missing values after imputation:")
print(iris_df.isnull().sum())
print("\n")

# Display the first few rows of the DataFrame after imputation
print("First 10 rows of the Iris dataset after imputation:")
print(iris_df.head(10))

Number of missing values before imputation:
sepal_length    3
sepal_width     2
petal_length    7
petal_width     3
class           0
dtype: int64


Number of missing values after imputation:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64


First 10 rows of the Iris dataset after imputation:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
5           5.4          3.9           1.7          0.4  Iris-setosa
6           4.6          3.4           1.4          0.3  Iris-setosa
7           5.0          3.4           1.5          0.2  Iris-setosa
8           4.4          2.9  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris_df[column].fillna(median_value, inplace=True)
