# 🧠 Stroke Prediction – Exploratory Data Analysis (EDA)
This notebook performs a complete EDA on the Stroke Prediction dataset from Kaggle.
It includes data cleaning, visualization, and statistical summaries.


## 1. Import Libraries

In [None]:
import os, sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling for every figure in this notebook: seaborn's
# white-grid theme, then a default figure size of 9 x 5 inches. The figure
# size is applied after the theme call, as in the original statement order.
sns.set(style='whitegrid')
plt.rcParams.update({'figure.figsize': (9, 5)})


## 2. Load Dataset

In [None]:
def autodetect_csv():
    """Return the name of a CSV file in the current working directory.

    A file whose name contains ``stroke`` (case-insensitive) is preferred;
    otherwise the first CSV file is used. Candidates are examined in sorted
    name order so the choice is reproducible when several CSV files exist
    (``os.listdir`` itself returns entries in arbitrary order).

    Returns:
        The bare file name (not a full path) of the selected CSV, or
        ``None`` when the directory contains no CSV file.
    """
    # List the directory once; skip directories that merely end in ".csv".
    csv_files = sorted(
        name for name in os.listdir()
        if name.lower().endswith('.csv') and os.path.isfile(name)
    )
    for name in csv_files:
        if 'stroke' in name.lower():
            return name
    return csv_files[0] if csv_files else None

# Locate the dataset in the working directory and fail fast, with a clear
# message, when no CSV file is available.
csv_path = autodetect_csv()
if not csv_path:
    raise FileNotFoundError('❌ CSV file not found in directory.')
print(f'✅ Using CSV: {csv_path}')

df = pd.read_csv(csv_path)
# Last expression of the cell: in a notebook this displays the first rows.
df.head()

## 3. Data Cleaning and Preprocessing

In [None]:
# Normalise column names to lower snake_case so the lookups below are
# predictable; str() keeps this working if a header is not a string.
df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]

# Coerce the continuous measurements to numbers; unparseable entries
# become NaN instead of raising.
for col in ('age', 'avg_glucose_level', 'bmi'):
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Impute missing BMI values with the column median.
if 'bmi' in df.columns and df['bmi'].isna().any():
    df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Drop identifier columns before analysis.
id_columns = [c for c in ('id', 'patient_id') if c in df.columns]
if id_columns:
    df.drop(columns=id_columns, inplace=True)

# Store the target as integers.
# NOTE(review): missing or unparseable targets are counted as 0 here;
# confirm that is the intended treatment.
if 'stroke' in df.columns:
    df['stroke'] = pd.to_numeric(df['stroke'], errors='coerce').fillna(0).astype(int)

if 'age' in df.columns:
    # right=False makes the bins left-closed: [0, 30), [30, 45), [45, 60),
    # [60, 200). This matches the labels: age 30 falls in '30–45' rather
    # than '<30', and age 60 falls in '60+' rather than '45–60'.
    df['age_group'] = pd.cut(
        df['age'], bins=[0, 30, 45, 60, 200],
        labels=['<30', '30–45', '45–60', '60+'],
        right=False,
    )
df.info()

## 4. Visualizations

In [None]:
# Each plot is guarded by column checks so the cell still runs when the
# loaded CSV lacks some of the expected columns.

# Gender Distribution (bars ordered from most to least frequent category)
if 'gender' in df.columns:
    sns.countplot(x='gender', data=df, order=df['gender'].value_counts().index)
    plt.title('Number of Patients by Gender')
    plt.show()

# Age Distribution
if 'age' in df.columns:
    sns.histplot(df['age'].dropna(), bins=25, kde=True)
    plt.title('Age Distribution')
    plt.show()

# Stroke Rate by Gender (mean of the stroke column per gender, highest first)
if 'stroke' in df.columns and 'gender' in df.columns:
    rate_by_gender = df.groupby('gender')['stroke'].mean().sort_values(ascending=False)
    sns.barplot(x=rate_by_gender.index, y=rate_by_gender.values)
    plt.title('Stroke Rate by Gender')
    plt.show()

# Stroke Rate by Age Group
if 'stroke' in df.columns and 'age_group' in df.columns:
    # observed=False is stated explicitly: it keeps every age-group category
    # on the axis (even empty ones) and avoids the pandas FutureWarning about
    # the changing default when grouping by a categorical column.
    rate_by_age = df.groupby('age_group', observed=False)['stroke'].mean()
    sns.barplot(x=rate_by_age.index, y=rate_by_age.values)
    plt.title('Stroke Rate by Age Group')
    plt.show()

# Glucose Level by Stroke Outcome
# common_norm=False normalises each outcome's density separately.
if 'avg_glucose_level' in df.columns and 'stroke' in df.columns:
    sns.kdeplot(data=df, x='avg_glucose_level', hue='stroke', common_norm=False)
    plt.title('Glucose Level by Stroke Outcome')
    plt.show()

# BMI by Stroke Outcome
if 'bmi' in df.columns and 'stroke' in df.columns:
    sns.kdeplot(data=df, x='bmi', hue='stroke', common_norm=False)
    plt.title('BMI by Stroke Outcome')
    plt.show()

# Stroke Rate by Smoking Status (categories ordered by frequency)
if 'smoking_status' in df.columns and 'stroke' in df.columns:
    order = df['smoking_status'].value_counts().index
    sns.barplot(x='smoking_status', y='stroke', data=df, order=order, estimator=np.mean)
    plt.title('Stroke Rate by Smoking Status')
    plt.xticks(rotation=20)
    plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns.
num = df.select_dtypes(include=[np.number])
if not num.empty:
    corr = num.corr()

    # Keep each unordered column pair exactly once by masking everything
    # except the strict lower triangle. This removes the self-correlations
    # and the mirrored (b, a) entries by position rather than by value, so
    # distinct pairs with equal r, or a genuine r of 1, are not lost.
    below_diag = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
    # dropna() discards the masked cells and any undefined correlations.
    corr_pairs = corr.where(below_diag).unstack().dropna()
    top_corr = corr_pairs.sort_values(key=np.abs, ascending=False).head(5)

    print('Top 5 strongest numeric relationships:')
    for (a, b), v in top_corr.items():
        print(f' - {a} ↔ {b}: r = {v:.2f}')

    strong = corr[(corr > 0.5) | (corr < -0.5)]
    # The diagonal always passes the filter above, so only off-diagonal
    # cells decide whether a heatmap is worth drawing.
    off_diag = ~np.eye(len(corr), dtype=bool)
    if (strong.notna().to_numpy() & off_diag).any():
        sns.heatmap(strong, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Simplified Correlation (|r| > 0.5)')
        plt.show()
    else:
        print('No numeric pairs with |r| > 0.5; heatmap skipped.')

## 6. Summary Notes

In [None]:
# Final status line marking the end of the notebook run.
print('✅ EDA Complete! All plots displayed inline.')