In [8]:
import pandas as pd
import numpy as np
from io import StringIO

# Complete dataset as a string
csv_data = """feature1,feature2,feature3,numerical_with_nan,categorical_feature
1.0,2.5,10.2,5.5,A
1.5,3.0,11.5,NaN,B
2.0,3.5,9.8,5.0,A
2.5,4.0,12.1,6.5,C
3.0,4.5,10.7,NaN,B
3.5,5.0,11.9,6.2,A
4.0,5.5,9.5,4.8,C
4.5,6.0,12.5,7.0,B
5.0,6.5,10.0,NaN,A
5.5,7.0,11.0,6.0,C
"""

# Load the dataset from the string into a Pandas DataFrame
df = pd.read_csv(StringIO(csv_data))

# --- Check for Missing Values ---
print("Number of missing values before imputation:")
print(df.isnull().sum())
print("\n")

# Identify numerical columns for median imputation
numerical_cols = df.select_dtypes(include=np.number).columns

# --- Handle Missing Values by Imputing the Median ---
for col in numerical_cols:
    median_value = df[col].median()
    df[col].fillna(median_value, inplace=True)
    print(f"Missing values in column '{col}' imputed with median: {median_value}")

print("\n")

# --- Verify Missing Values After Imputation ---
print("Number of missing values after imputation:")
print(df.isnull().sum())
print("\n")

# Display the first few rows of the cleaned DataFrame
print("First 5 rows of the DataFrame after imputation:")
print(df.head())

# --- Optional: Save the cleaned DataFrame to a new CSV file ---
# df.to_csv('cleaned_data.csv', index=False)
# print("\nCleaned DataFrame saved to 'cleaned_data.csv'")

Number of missing values before imputation:
feature1               0
feature2               0
feature3               0
numerical_with_nan     3
categorical_feature    0
dtype: int64


Missing values in column 'feature1' imputed with median: 3.25
Missing values in column 'feature2' imputed with median: 4.75
Missing values in column 'feature3' imputed with median: 10.85
Missing values in column 'numerical_with_nan' imputed with median: 6.0


Number of missing values after imputation:
feature1               0
feature2               0
feature3               0
numerical_with_nan     0
categorical_feature    0
dtype: int64


First 5 rows of the DataFrame after imputation:
   feature1  feature2  feature3  numerical_with_nan categorical_feature
0       1.0       2.5      10.2                 5.5                   A
1       1.5       3.0      11.5                 6.0                   B
2       2.0       3.5       9.8                 5.0                   A
3       2.5       4.0      12.1      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_value, inplace=True)
