In [None]:
# Cell 1: Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Display settings: remove the cap on how many columns pandas renders for a frame
pd.options.display.max_columns = None

In [None]:
# Cell 2: Load the dataset
# The CSV is resolved relative to the notebook's working directory; adjust the path if needed.
df = pd.read_csv(filepath_or_buffer="Diabetes.csv")
# Preview the first five rows; as the final expression this becomes the cell output.
df.head(n=5)

In [None]:
# Cell 3: Basic dataset overview
# info() prints a structural summary of the frame to stdout (it has no return value to display).
df.info()
# describe() returns summary statistics; as the last expression of the cell it is what gets rendered.
# NOTE(review): this runs before the data-cleaning cell, so zeros that are later treated as
# missing values are still included in these statistics.
df.describe()

In [None]:
# Cell 4: Data Cleaning
# Report nulls present in the file as loaded (zeros have not been reinterpreted yet).
print("Missing values:\n", df.isnull().sum())

# In these measurement columns a value of 0 is treated as "not recorded",
# so it is converted to NaN to take part in the imputation below.
cols_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

# Impute missing values with each column's mean.
# numeric_only=True keeps this cell re-runnable after 'Outcome' has been converted
# to a categorical dtype (a mean over a categorical column raises on recent pandas),
# and rebinding `df` instead of inplace=True makes the data lineage explicit.
df = df.fillna(df.mean(numeric_only=True))

# Sanity check: remaining nulls per column after imputation (rendered as the cell output).
df.isnull().sum()

In [None]:
# Cell 5: Handling categorical data (if present)
# 'Outcome' is the class label (0 = No Diabetes, 1 = Diabetes); store it with a
# categorical dtype via a column -> dtype mapping on the whole frame.
df = df.astype({'Outcome': 'category'})

# Number of rows per class (rendered as the cell output).
df['Outcome'].value_counts()

In [None]:
# Cell 6: Univariate Analysis
# Class balance of the target, drawn on an explicitly created figure/axes pair.
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="Outcome", ax=count_ax)
count_ax.set_title("Distribution of Diabetes Outcome (0 = No, 1 = Yes)")
plt.show()

# Histogram for numerical columns: one panel per column, 20 bins each.
df.hist(figsize=(15, 10), bins=20)
plt.show()

In [None]:
# Cell 7: Bi-variate Analysis
# Correlation heatmap
# 'Outcome' is stored with a categorical dtype by the earlier cell. How corr() treats
# non-numeric columns differs between pandas versions (they may be dropped, coerced,
# or rejected), so build the correlation input explicitly: cast the label back to a
# numeric dtype and keep only numeric columns. This guarantees the target appears in
# the heatmap and that stray non-numeric columns cannot break the cell.
corr_matrix = (
    df.assign(Outcome=df["Outcome"].astype(float))
    .select_dtypes(include="number")
    .corr()
)
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Pairplot for visualization: pairwise scatter plots of the features, colored by class.
sns.pairplot(df, hue="Outcome")
plt.show()

# Boxplot of Glucose by Outcome: compares the glucose distribution between the two classes.
plt.figure(figsize=(6,4))
sns.boxplot(x="Outcome", y="Glucose", data=df)
plt.title("Glucose levels vs Outcome")
plt.show()