# In [None]:
# Loading of Datasets and Initial Inspection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Display settings: print every column and allow wide (1000-character) output
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Load the dataset
import pandas as pd

# Read the CSV into the DataFrame used by every later step.
# NOTE(review): the path is absolute and machine-specific.
dataset_path = (
    r'C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\diabetes_prediction_dataset.csv'
)
thankgod_israel = pd.read_csv(dataset_path)

# First look: dimensions, column info, then a sample and summary statistics
print("Dataset Shape:", thankgod_israel.shape)
print("\nDataset Info:")
thankgod_israel.info()  # info() prints directly, so it is not wrapped in print()
for heading, table in (
    ("\nFirst 5 Rows:", thankgod_israel.head()),
    ("\nSummary Statistics:", thankgod_israel.describe()),
):
    print(heading)
    print(table)


# Data-quality report: missing values, duplicate rows, dtypes, category levels

# Null count for each column
print("Missing Values per Column:", thankgod_israel.isnull().sum(), sep="\n")

# Fully duplicated rows
print(f"\nNumber of Duplicate Rows: {thankgod_israel.duplicated().sum()}")

# Column dtypes as loaded from the CSV
print("\nData Types:", thankgod_israel.dtypes, sep="\n")

# Distinct values and their frequencies for each categorical / binary column
categorical_cols = [
    'gender',
    'smoking_history',
    'hypertension',
    'heart_disease',
    'diabetes',
]
for col in categorical_cols:
    print(f"\n{col} unique values: {thankgod_israel[col].unique()}")
    print(f"Counts:\n{thankgod_israel[col].value_counts()}")


# Exploratory Data Analysis
# Draw a 3x3 overview grid: histograms of the four continuous features,
# followed by bar charts of the five categorical / binary features.
plt.style.use('seaborn-v0_8-darkgrid')
fig, axes = plt.subplots(3, 3, figsize=(15, 12))

# (grid position, column, title, x-label, colour). A colour of None keeps
# matplotlib's default, which is what the age histogram uses.
histogram_specs = [
    ((0, 0), 'age', 'Age Distribution', 'Age', None),
    ((0, 1), 'bmi', 'BMI Distribution', 'BMI', 'green'),
    ((0, 2), 'HbA1c_level', 'HbA1c Distribution', 'HbA1c Level', 'red'),
    ((1, 0), 'blood_glucose_level', 'Blood Glucose Distribution',
     'Blood Glucose Level', 'purple'),
]
for position, column, title, xlabel, color in histogram_specs:
    ax = axes[position]
    ax.hist(thankgod_israel[column], bins=50, edgecolor='black', alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

# Gender distribution (bars and labels both come from value_counts, so they stay aligned)
gender_counts = thankgod_israel['gender'].value_counts()
axes[1, 1].bar(gender_counts.index, gender_counts.values, color=['pink', 'lightblue', 'gray'])
axes[1, 1].set_title('Gender Distribution')
axes[1, 1].set_xlabel('Gender')
axes[1, 1].set_ylabel('Count')

# Smoking history distribution
smoking_counts = thankgod_israel['smoking_history'].value_counts()
axes[1, 2].bar(range(len(smoking_counts)), smoking_counts.values,
               tick_label=smoking_counts.index, color='orange')
axes[1, 2].set_title('Smoking History Distribution')
axes[1, 2].set_xlabel('Smoking History')
axes[1, 2].set_ylabel('Count')
# Rotate the labels through this Axes object: plt.xticks() acts on pyplot's
# current axes, which is not this subplot, so it rotated the wrong chart.
axes[1, 2].tick_params(axis='x', rotation=45)

# The remaining three charts use fixed labels, so the bar order must be fixed too.
# value_counts() orders by frequency; reindexing to [0, 1] keeps the negative
# class first regardless of which class is more common, and still yields two
# bars (one of height 0) if a class is absent.
# Assumes these columns are coded 0/1 — TODO confirm for hypertension and heart_disease.
binary_order = [0, 1]

# Hypertension distribution
hypertension_counts = (
    thankgod_israel['hypertension'].value_counts().reindex(binary_order, fill_value=0)
)
axes[2, 0].bar(['No', 'Yes'], hypertension_counts.values, color=['lightgreen', 'salmon'])
axes[2, 0].set_title('Hypertension Distribution')
axes[2, 0].set_xlabel('Hypertension')
axes[2, 0].set_ylabel('Count')

# Heart disease distribution
heart_disease_counts = (
    thankgod_israel['heart_disease'].value_counts().reindex(binary_order, fill_value=0)
)
axes[2, 1].bar(['No', 'Yes'], heart_disease_counts.values, color=['lightblue', 'coral'])
axes[2, 1].set_title('Heart Disease Distribution')
axes[2, 1].set_xlabel('Heart Disease')
axes[2, 1].set_ylabel('Count')

# Diabetes distribution
diabetes_counts = (
    thankgod_israel['diabetes'].value_counts().reindex(binary_order, fill_value=0)
)
axes[2, 2].bar(['No Diabetes', 'Diabetes'], diabetes_counts.values, color=['lightgreen', 'red'])
axes[2, 2].set_title('Diabetes Distribution')
axes[2, 2].set_xlabel('Diabetes Status')
axes[2, 2].set_ylabel('Count')

plt.tight_layout()
plt.show()


# Pairwise correlations between the numeric / binary columns, shown as a heatmap
numerical_cols = [
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
    'diabetes',
]
correlation_matrix = thankgod_israel.loc[:, numerical_cols].corr()

# Diverging palette centred on zero; each cell is annotated to two decimals
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'fmt': '.2f',
    'square': True,
}
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, **heatmap_options)
heatmap_ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()

# One box plot per feature, each split by diabetes status and drawn in its own figure.
# Each entry is (column, figure title, y-axis label).
boxplot_specs = [
    ('age', 'Age Distribution by Diabetes Status', 'Age'),
    ('HbA1c_level', 'HbA1c Level Distribution by Diabetes Status', 'HbA1c Level'),
    ('blood_glucose_level', 'Blood Glucose Distribution by Diabetes Status',
     'Blood Glucose Level'),
    ('bmi', 'BMI Distribution by Diabetes Status', 'BMI'),
]
for feature, plot_title, y_label in boxplot_specs:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='diabetes', y=feature, data=thankgod_israel)
    plt.title(plot_title)
    plt.xlabel('Diabetes Status (0=No, 1=Yes)')
    plt.ylabel(y_label)
    plt.show()


# Multivariate views of the key variables
from pandas.plotting import scatter_matrix

key_vars = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']

# pandas scatter-plot matrix with kernel-density estimates on the diagonal
scatter_input = thankgod_israel[key_vars]
scatter_matrix(scatter_input, figsize=(12, 10), diagonal='kde')
plt.suptitle('Scatter Plot Matrix of Key Variables', y=1.02)
plt.show()

# seaborn pair plot over the same columns, coloured by outcome (0 -> blue, 1 -> red)
status_palette = {0: 'blue', 1: 'red'}
pairplot_input = thankgod_israel[key_vars]
sns.pairplot(pairplot_input, hue='diabetes', palette=status_palette)
plt.suptitle('Pair Plot Colored by Diabetes Status', y=1.02)
plt.show()



