In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Read the dataset from "diabetes.csv" (path relative to the working directory)
# and preview its first rows as the cell output.
df = pd.read_csv("diabetes.csv")
df.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics per column, displayed as the cell output.
df.describe()

# Data Exploration
* Duplication
* Missing Values
* Columns data type
* Summary statistics 
* Outliers 

* Duplication

In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicate_count = df.duplicated().sum()
print('duplicate values : ', duplicate_count)

* Missing Values

In [None]:
# Count of missing entries in each column.
null_counts = df.isnull().sum()
print('null values : \n', null_counts)

* Columns data type

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

* Outliers 

In [None]:
def remove_outliers_iqr(df, columns=None, factor=1.5):
    """Return a copy of ``df`` without rows that are IQR outliers in any checked column.

    For every checked column the interval
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` is computed from the *input*
    frame, and a row is kept only if it lies inside the interval for all
    checked columns. Because every bound comes from the same unfiltered data,
    the result does not depend on the order of the columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of column labels, optional
        Columns to check. Defaults to all numeric columns, so text or other
        non-numeric columns are carried through untouched instead of raising.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the bounds.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the surviving rows with their original index.
        Rows holding NaN in a checked column are dropped, since NaN never
        falls inside the interval.
    """
    if columns is None:
        columns = df.select_dtypes(include="number").columns

    # Positional boolean mask, so duplicate index labels cannot cause misalignment.
    keep = np.ones(len(df), dtype=bool)
    for column in columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = factor * (q3 - q1)
        # Series.between is inclusive on both ends, matching >= lower and <= upper.
        keep &= values.between(q1 - spread, q3 + spread).to_numpy()

    return df[keep].copy()


# Filter the raw frame, then report the row counts before and after.
df_cleaned = remove_outliers_iqr(df=df)

for message in (
    f"Original number of records : {df.shape[0]}",
    f"Number of records after removing outliers : {df_cleaned.shape[0]}",
):
    print(message)


In [None]:
# NOTE(review): this cell re-defines remove_outliers_iqr, which an earlier cell
# of this notebook already defines; this later definition shadows the earlier
# one, so one of the two cells can be deleted.
def remove_outliers_iqr(df, columns=None, factor=1.5):
    """Return a copy of ``df`` without rows that are IQR outliers in any checked column.

    For every checked column the interval
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` is computed from the *input*
    frame, and a row is kept only if it lies inside the interval for all
    checked columns. Because every bound comes from the same unfiltered data,
    the result does not depend on the order of the columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of column labels, optional
        Columns to check. Defaults to all numeric columns, so text or other
        non-numeric columns are carried through untouched instead of raising.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the bounds.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the surviving rows with their original index.
        Rows holding NaN in a checked column are dropped, since NaN never
        falls inside the interval.
    """
    if columns is None:
        columns = df.select_dtypes(include="number").columns

    # Positional boolean mask, so duplicate index labels cannot cause misalignment.
    keep = np.ones(len(df), dtype=bool)
    for column in columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = factor * (q3 - q1)
        # Series.between is inclusive on both ends, matching >= lower and <= upper.
        keep &= values.between(q1 - spread, q3 + spread).to_numpy()

    return df[keep].copy()


# Run the IQR filter on the raw frame and print both row counts, one per line.
df_cleaned = remove_outliers_iqr(df=df)

print(
    f"Original number of records : {df.shape[0]}",
    f"Number of records after removing outliers : {df_cleaned.shape[0]}",
    sep="\n",
)

In [None]:
# Preview the first rows of the outlier-filtered frame.
df_cleaned.head()

In [None]:
# Persist the filtered frame; index=False keeps the row index out of the file.
df_cleaned.to_csv(path_or_buf='cleaned_data.csv', index=False)

# Data Visualizations

In [None]:
# Number of rows per Outcome value, drawn on an explicitly created axes.
outcome_fig, outcome_ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='Outcome', ax=outcome_ax)
outcome_ax.set_title('Distribution of Outcome Variable')
plt.show()


In [None]:
# Interactive histogram of the Glucose column.
fig = px.histogram(
    data_frame=df,
    x='Glucose',
    title='Distribution of Glucose Levels',
)
fig.show()

In [None]:
# Pairwise scatter plots (with per-column distributions) for a subset of columns.
pair_columns = ['Glucose', 'BMI', 'Age', 'Insulin']
sns.pairplot(df[pair_columns])
plt.show()


In [None]:
# Pairwise correlation matrix of the columns, rendered as an annotated heatmap.
corr = df.corr()
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.show()


In [None]:
# Mean BMI for each Outcome group, shown as a Plotly bar chart.
average_bmi = df.groupby('Outcome')['BMI'].mean().reset_index()

# No plt.figure() here: the chart is rendered by Plotly, so a matplotlib figure
# would only add an empty figure to the cell output.
fig = px.bar(average_bmi, x='Outcome', y='BMI',
             title='Average BMI by Outcome',
             labels={'Outcome': 'Diabetes Outcome', 'BMI': 'Average BMI'})

# Label each bar with its value, rounded so the text stays readable.
fig.update_traces(text=average_bmi['BMI'].round(2), textposition='outside')

# Title and axis titles are already set through px.bar; only the size is set here.
fig.update_layout(height=600, width=600)

fig.show()


In [None]:
# Mean Age for each Outcome group, shown as a Plotly bar chart.
# Stored under its own name so the BMI averages in `average_bmi` are not overwritten.
average_age = df.groupby('Outcome')['Age'].mean().reset_index()

# No plt.figure() here: the chart is rendered by Plotly, so a matplotlib figure
# would only add an empty figure to the cell output.
fig = px.bar(average_age, x='Outcome', y='Age',
             title='Average Age by Outcome',
             labels={'Outcome': 'Diabetes Outcome', 'Age': 'Average Age'})

# Label each bar with its value, rounded so the text stays readable.
fig.update_traces(text=average_age['Age'].round(2), textposition='outside')

# Title and axis titles are already set through px.bar; only the size is set here.
fig.update_layout(height=600, width=600)

fig.show()


In [None]:
# px.bar draws one bar segment per input row and stacks segments that share an
# x value, so plotting the raw frame would show the *sum* of BMI at each age
# (driven by how many rows share that age). Aggregate first so each bar is the
# mean BMI for that age.
bmi_by_age = df.groupby('Age', as_index=False)['BMI'].mean()

fig = px.bar(bmi_by_age, x='Age', y='BMI', title='BMI By Age',
             labels={'BMI': 'Average BMI'},
             color_discrete_sequence=px.colors.qualitative.Dark2)

fig.update_layout(
    template='seaborn')

fig.show()