In [10]:
# Assignment 2: Data Cleaning
# Bisma - Data Science Assignment

# =============================
# 1. Import Libraries
# =============================
import pandas as pd
import numpy as np

# =============================
# 2. Load Dataset
# =============================
# Set DATA_PATH to the CSV file of your dataset
# (for example Titanic.csv, Iris.csv, or your own project dataset).
DATA_PATH = "your_dataset.csv"

# The read_csv call used to be commented out because the placeholder path
# raised an error, which left `df` undefined on a fresh kernel (NameError on
# the first report line). Load from DATA_PATH unless a DataFrame named `df`
# is already present in the session.
# NOTE(review): for a fully reproducible notebook, point DATA_PATH at the real
# file so the data always comes from disk rather than from leftover state.
if not isinstance(globals().get("df"), pd.DataFrame):
    df = pd.read_csv(DATA_PATH)

# Snapshot of the data before any cleaning step runs: dimensions, per-column
# missing-value counts, number of duplicated rows, and a short preview.
null_counts_before = df.isnull().sum()
duplicate_total_before = df.duplicated().sum()
preview_before = df.head()

print("🔹 BEFORE CLEANING REPORT")
print("Shape of dataset:", df.shape)
print("\nMissing values:\n", null_counts_before)
print("\nDuplicate rows:", duplicate_total_before)
print("\nDataset Preview (first 5 rows):\n", preview_before)

# =============================
# 3. Remove Duplicates
# =============================
# Flag every row that repeats an earlier, fully identical row, then keep only
# the rows that are not flagged (i.e. the first occurrence of each row).
is_repeated_row = df.duplicated()
df = df[~is_repeated_row]

# =============================
# 4. Handle Missing Values
# =============================
# Collect one replacement value per column, then fill everything in a single
# fillna call instead of assigning column by column.
fill_values = {}

# Numeric columns: replace missing entries with the column mean.
for col in df.select_dtypes(include=np.number).columns:
    col_mean = df[col].mean()
    # A column with no observed values has a NaN mean; nothing useful to fill with.
    if pd.notna(col_mean):
        fill_values[col] = col_mean

# Categorical (object) columns: replace missing entries with the most frequent value.
for col in df.select_dtypes(include="object").columns:
    col_modes = df[col].mode()
    # mode() returns an empty Series when every value is missing; indexing it
    # with [0] would raise, so such columns are left untouched.
    if not col_modes.empty:
        fill_values[col] = col_modes.iloc[0]

if fill_values:
    df = df.fillna(value=fill_values)

# =============================
# 5. Treat Outliers (IQR Method)
# =============================
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of the column
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` with the outlier rows removed. Rows whose
        ``column`` value is missing are kept.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    within_fences = (values >= lower) & (values <= upper)
    # Comparisons against NaN are always False, so without this term every row
    # with a missing value would be discarded as if it were an outlier (and a
    # column with no observed values would empty the whole frame).
    keep = within_fences | values.isna()
    return data[keep]

# Caveat: on the iris dataset the IQR filter may discard points that are
# natural variation rather than true outliers, so keep that in mind when
# interpreting the results.
# The numeric columns are processed one after another; each pass filters the
# rows that survived the previous passes.
numeric_columns = df.select_dtypes(include=np.number).columns
for numeric_col in numeric_columns:
    df = remove_outliers_iqr(df, numeric_col)

# =============================
# 6. AFTER CLEANING REPORT
# =============================
# Same summary as the "before" report, computed on the cleaned frame so the
# two printouts can be compared side by side.
null_counts_after = df.isnull().sum()
duplicate_total_after = df.duplicated().sum()
preview_after = df.head()

print("\n🔹 AFTER CLEANING REPORT")
print("Shape of dataset:", df.shape)
print("\nMissing values:\n", null_counts_after)
print("\nDuplicate rows:", duplicate_total_after)
print("\nDataset Preview (first 5 rows):\n", preview_after)

# =============================
# 7. Save Cleaned Dataset
# =============================
# Write the cleaned frame to a CSV file in the working directory; the row
# index is not written to the file.
cleaned_csv_path = "cleaned_dataset.csv"
df.to_csv(cleaned_csv_path, index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_dataset.csv'")

🔹 BEFORE CLEANING REPORT
Shape of dataset: (145, 5)

Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Duplicate rows: 0

Dataset Preview (first 5 rows):
    sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

🔹 AFTER CLEANING REPORT
Shape of dataset: (145, 5)

Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Duplicate rows: 0

Dataset Preview (first 5 rows):
    sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0        

In [11]:
# Quick check of the current frame: its dimensions first, then a short preview.
frame_shape = df.shape
first_rows = df.head()

print("Shape of dataframe:", frame_shape)
print("\nFirst 5 rows of dataset:")
print(first_rows)

Shape of dataframe: (145, 5)

First 5 rows of dataset:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
