# 🧼 Day 2: Data Cleaning & Understanding

In [None]:

import pandas as pd
import numpy as np

# Read the raw CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

# Preview the first five rows
df.head(n=5)


## 🔍 Basic Info and Description

In [None]:

# Summarize column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Summary statistics for the numeric columns
df.describe()


## 🚨 Check for Missing or Invalid (0) Values

In [None]:

# Columns in which a 0 is likely an invalid entry rather than a real value
cols_with_zero_invalid = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Report, per column, how many entries are exactly 0
for col in cols_with_zero_invalid:
    n_zeros = df[col].eq(0).sum()
    print(f"{col}: {n_zeros} zeros")


## 🔧 Replace 0s with NaN and Impute with Median

In [None]:

# Treat 0 in these columns as a missing reading by converting it to NaN
df[cols_with_zero_invalid] = df[cols_with_zero_invalid].replace(0, np.nan)

# Fill each column's NaNs with that column's median (NaNs are skipped when
# the median is computed). The result is assigned back rather than calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary object, raises a FutureWarning in pandas 2.x, and leaves `df`
# unchanged under Copy-on-Write.
column_medians = df[cols_with_zero_invalid].median()
df[cols_with_zero_invalid] = df[cols_with_zero_invalid].fillna(column_medians)

# Confirm there are no NaNs left (per-column NaN counts)
df.isnull().sum()


## ⚖️ Check Class Balance

In [None]:

# Tally how many rows fall into each Outcome class
# (0 = No Diabetes, 1 = Diabetes)
outcome_counts = df['Outcome'].value_counts()
outcome_counts


## 💾 Save Cleaned Dataset

In [None]:

# Write the cleaned frame to disk, leaving the row index out of the file
df.to_csv(path_or_buf="diabetes_cleaned.csv", index=False)


## 📊 Optional: Histogram and Heatmap

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Histograms of the DataFrame's columns on a single 12x10 figure
df.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Heatmap of the pairwise column correlations, with each cell annotated
fig, ax = plt.subplots(figsize=(10, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
