# 01 - Exploratory Data Analysis (EDA)

This notebook is a placeholder (sample) for what you are expected to do in the EDA; edit it as necessary. As written, it:
- Loads `heart.xls` from `/mnt/data`
- Cleans and summarizes the dataset
- Produces visualizations (displayed inline) and saves them to `images/`
- Performs simple statistical tests (t-test for numeric vs target, chi-square for categorical vs target)
- Prints short, automatic insights after each analysis step


In [None]:
# Standard imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from IPython.display import display

# Make plots look nicer
%matplotlib inline
sns.set(style='whitegrid')

# Paths
DATA_PATH = '/mnt/data/heart.xls'
IMAGES_DIR = os.path.join('..', 'images') if os.getcwd().endswith('notebooks') else os.path.join('/mnt/data/heart-disease-project', 'images')
os.makedirs(IMAGES_DIR, exist_ok=True)
print('Images will be saved to:', IMAGES_DIR)


In [None]:
# 1. Read the spreadsheet at DATA_PATH into a DataFrame, then report its size and preview it
df = pd.read_excel(DATA_PATH)
initial_shape = df.shape
print('Initial shape:', initial_shape)
display(df.head(5))


In [None]:
# 2. Basic cleaning function (normalize column names, blank strings -> NaN, drop duplicate rows)
def basic_clean(df):
    """Return a cleaned copy of `df`; the input frame is not modified.

    Steps, in order:
      1. Column names are converted to str, stripped, lower-cased, and inner
         spaces are replaced with underscores.
      2. Cells that are empty or whitespace-only strings become NaN.
      3. Exact duplicate rows are dropped.

    Blank normalization runs *before* de-duplication so that rows which differ
    only in how an empty cell was typed ('' vs '   ') are recognized as duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.

    Returns
    -------
    pandas.DataFrame
        New frame with normalized column names, NaN for blank strings, and
        duplicate rows removed (original index labels of kept rows are preserved).
    """
    df = df.copy()
    df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]
    # Normalize blanks first; otherwise '' and '  ' would count as distinct values below
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.drop_duplicates()
    return df

# Run the cleaning step, then report the resulting dimensions and preview the first rows
df = basic_clean(df)
cleaned_shape = df.shape
print('After basic cleaning shape:', cleaned_shape)
display(df.head(5))


In [None]:
# Ensure target exists, then report whether it really is a binary 0/1 column
if 'target' not in df.columns:
    raise KeyError("The dataset must contain a 'target' column with 0/1 values indicating heart disease.")
print('Target unique values:', df['target'].unique())

# Non-fatal sanity checks: warn instead of raising so the rest of the notebook can still run
observed_labels = set(df['target'].dropna().unique())
if not observed_labels <= {0, 1}:
    print("Warning: 'target' contains values other than 0/1:", sorted(observed_labels, key=str))
n_missing_target = int(df['target'].isna().sum())
if n_missing_target:
    print(f"Warning: 'target' has {n_missing_target} missing value(s).")


## 3. Quick summary & missingness


In [None]:
# df.info() prints its report itself and returns None, so wrapping it in display()
# would only add a stray "None" to the cell output.
df.info()
display(df.describe(include='all').T)

# Missing values summary: null count per column, largest first, only columns with gaps
missing = df.isnull().sum().sort_values(ascending=False)
missing = missing[missing > 0]
if missing.empty:
    print('\nNo columns with missing values.')
else:
    print('\nColumns with missing values:\n')
    print(missing)


## 4. Class distribution (target)


In [None]:
# value_counts() orders by frequency (largest first), so the bars are re-ordered by class
# label here; otherwise the fixed tick labels would be swapped whenever class 1 is the majority.
counts = df['target'].value_counts().sort_index()
class_names = {0: 'No Disease (0)', 1: 'Disease (1)'}

ax = counts.plot(kind='bar')
# Label each bar from its actual class value; unexpected values fall back to their string form
ax.set_xticklabels([class_names.get(value, str(value)) for value in counts.index])
ax.set_ylabel('Count')
ax.set_title('Target class distribution')
plt.tight_layout()
fn = os.path.join(IMAGES_DIR, 'target_distribution.png')
plt.savefig(fn, dpi=150)
plt.show()

print('\nAuto-insight:')
if set(counts.index) == {0, 1}:
    # Look the positive class up by label, not by position
    pct = counts.loc[1] / counts.sum()
    print(f"Proportion with heart disease: {pct:.2%} (class=1). This indicates whether class imbalance handling may be needed.")
else:
    print('Target is not binary or unexpected unique values.')


## 5. Identify numeric and categorical columns


In [None]:
# Split columns by dtype: int64/float64 are treated as numeric, object/category as categorical.
# NOTE(review): if `target` is stored as int64/float64 it is listed in num_cols as well.
numeric_part = df.select_dtypes(include=['int64', 'float64'])
categorical_part = df.select_dtypes(include=['object', 'category'])
num_cols = list(numeric_part.columns)
cat_cols = list(categorical_part.columns)
print('Numeric columns:', num_cols)
print('Categorical columns:', cat_cols)


## 6. Univariate analysis — numeric features
For each numeric feature we will plot a histogram and boxplot, save them, and compute skewness.

In [None]:
# One figure per numeric column: histogram with KDE on the left, boxplot on the right.
# Each figure is saved to IMAGES_DIR and the column's skewness is printed underneath.
for feature in num_cols:
    values = df[feature]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(values.dropna(), kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {feature}')

    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f'Boxplot of {feature}')

    plt.tight_layout()
    figure_path = os.path.join(IMAGES_DIR, f'{feature}_hist_box.png')
    plt.savefig(figure_path, dpi=150)
    plt.show()

    print(f"{feature} — skewness: {values.skew():.2f}")


## 7. Bivariate analysis — numeric vs target
Compare distributions of numeric features grouped by `target` and run t-tests (or Mann-Whitney if non-normal).


In [None]:
from scipy.stats import ttest_ind, mannwhitneyu

ALPHA = 0.05          # significance level for the normality check and the group comparison
SHAPIRO_MAX_N = 5000  # at most this many observations per group are passed to Shapiro-Wilk


def _shapiro_pvalue(values, max_n=SHAPIRO_MAX_N):
    """Return the Shapiro-Wilk p-value for `values`.

    Groups longer than `max_n` are subsampled (fixed seed, so reruns pick the same rows)
    instead of being assumed normal without any check.
    """
    if len(values) > max_n:
        values = values.sample(max_n, random_state=0)
    return stats.shapiro(values)[1]


for col in num_cols:
    if col == 'target':
        # Comparing the target against itself is meaningless
        continue

    data0 = df.loc[df['target'] == 0, col].dropna()
    data1 = df.loc[df['target'] == 1, col].dropna()

    # plot
    plt.figure(figsize=(6, 4))
    sns.boxplot(x='target', y=col, data=df)
    plt.title(f'{col} by target')
    fn = os.path.join(IMAGES_DIR, f'{col}_by_target_box.png')
    plt.savefig(fn, dpi=150)
    plt.show()

    # A two-sample test needs at least two observations on each side
    if len(data0) < 2 or len(data1) < 2:
        print(f"{col}: skipped — fewer than 2 non-missing values in at least one class.")
        continue

    # choose test based on normality (Shapiro) and sample size
    use_mw = False
    if len(data0) >= 3 and len(data1) >= 3:
        try:
            use_mw = _shapiro_pvalue(data0) < ALPHA or _shapiro_pvalue(data1) < ALPHA
        except Exception:
            # Best effort: if the normality check itself fails, use the rank-based test
            use_mw = True

    if use_mw:
        stat, p = mannwhitneyu(data0, data1, alternative='two-sided')
        test_name = 'Mann-Whitney U'
    else:
        stat, p = ttest_ind(data0, data1, nan_policy='omit')
        test_name = 'T-test'

    if pd.isna(p):
        # e.g. both groups constant: do not report this as "no difference"
        print(f"{col}: {test_name} could not produce a p-value (result is NaN).")
        continue

    print(f"{col}: {test_name} p-value = {p:.4f}")
    if p < ALPHA:
        print(f"  -> Auto-insight: Significant difference in {col} between classes (p<{ALPHA}).")
    else:
        print(f"  -> Auto-insight: No significant difference detected for {col} (p={p:.3f}).")


## 8. Categorical features — counts and chi-square tests


In [None]:
from scipy.stats import chi2_contingency

if not cat_cols:
    print('No categorical columns detected (object/category). If some categorical variables are numeric-coded, consider converting them.')

# For every categorical column: show its contingency table against target, then run a chi-square test
for cat_feature in cat_cols:
    print('\nColumn:', cat_feature)
    contingency = pd.crosstab(df[cat_feature], df['target'])
    display(contingency)
    try:
        if contingency.size == 0:
            continue
        p_value = chi2_contingency(contingency.fillna(0))[1]
        print(f'Chi-square p-value = {p_value:.4f}')
        is_significant = p_value < 0.05
        if is_significant:
            print('  -> Auto-insight: There is a significant association between', cat_feature, 'and target (p<0.05).')
        else:
            print('  -> Auto-insight: No significant association detected (p>=0.05).')
    except Exception as err:
        # Report the failure and move on to the next column
        print('Could not run chi-square test:', err)


## 9. Correlation matrix (numeric features)


In [None]:
# Pairwise correlations between all columns listed in num_cols, drawn as an annotated heatmap
corr = df[num_cols].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', square=True, ax=ax)
ax.set_title('Correlation matrix (numeric features)')
heatmap_path = os.path.join(IMAGES_DIR, 'correlation_matrix.png')
fig.savefig(heatmap_path, dpi=150, bbox_inches='tight')
plt.show()


## 10. Correlation of features with target (point-biserial for numeric)


In [None]:
from scipy.stats import pointbiserialr

# Point-biserial correlation of every numeric feature with the target.
corrs = []
for col in num_cols:
    if col == 'target':
        # The target correlates perfectly with itself and would dominate the ranking
        continue
    # Use only rows where both values are present; filling missing labels with 0
    # would invent "no disease" cases.
    pair = df[['target', col]].dropna()
    try:
        r, p = pointbiserialr(pair['target'], pair[col])
    except Exception as e:
        # Report instead of silently dropping the feature
        print(f"Skipping {col}: point-biserial correlation failed ({e})")
        continue
    corrs.append({'feature': col, 'r': r, 'p': p})

# Fixed columns keep later cells working even when no correlation could be computed
corr_df = pd.DataFrame(corrs, columns=['feature', 'r', 'p'])
if not corr_df.empty:
    corr_df = corr_df.sort_values('r', key=abs, ascending=False)
display(corr_df)

if corr_df.empty:
    print('No feature/target correlations could be computed.')
else:
    plt.figure(figsize=(6, 4))
    sns.barplot(x='r', y='feature', data=corr_df)
    plt.title('Point-biserial correlation with target')
    fn = os.path.join(IMAGES_DIR, 'feature_target_correlation.png')
    plt.savefig(fn, dpi=150, bbox_inches='tight')
    plt.show()
    print('\nAuto-insight: Features with largest absolute correlation (top 5):')
    print(corr_df.head(5).to_string(index=False))


## 11. Simple pairwise plots for top features


In [None]:
# Take the four strongest features, excluding the target itself: it is appended below as the
# hue column, and listing it twice would produce a duplicated 'target' column.
top_feats = [feat for feat in corr_df['feature'].tolist() if feat != 'target'][:4]
if len(top_feats) >= 2:
    sns.pairplot(df[top_feats + ['target']].dropna(), hue='target', corner=True)
    fn = os.path.join(IMAGES_DIR, 'pairplot_top_features.png')
    plt.savefig(fn, dpi=150, bbox_inches='tight')
    plt.show()
else:
    print('Not enough top numeric features for pairplot.')


## 12. Outlier detection (simple IQR method) and counts


In [None]:
# Count, per numeric column, the values lying more than 1.5 * IQR outside the quartiles
outlier_counts = {}
for feature in num_cols:
    values = df[feature]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    beyond_fences = (values < low_fence) | (values > high_fence)
    outlier_counts[feature] = int(beyond_fences.sum())

outlier_df = (
    pd.DataFrame.from_dict(outlier_counts, orient='index', columns=['outlier_count'])
    .sort_values('outlier_count', ascending=False)
)
display(outlier_df)
print('\nAuto-insight: Columns with many outliers may need robust scaling or capping before modeling.')


## 13. Missingness visualization (simple)


In [None]:
# Horizontal bar chart of null counts, drawn only when at least one value is missing
ms = df.isnull().sum()
if ms.sum() != 0:
    ms = ms[ms > 0].sort_values(ascending=False)
    # Figure height grows with the number of affected columns, with a minimum of 3
    chart_height = max(3, len(ms) * 0.4)
    ax = ms.plot.barh(figsize=(6, chart_height))
    ax.set_title('Missing values per column')
    chart_path = os.path.join(IMAGES_DIR, 'missing_values.png')
    plt.savefig(chart_path, dpi=150, bbox_inches='tight')
    plt.show()
else:
    print('No missing values detected in the dataset.')


## 14. Save a cleaned version of the dataset for modeling


In [None]:
# Write the cleaned frame to the processed-data folder of the project
clean_path = os.path.join('/mnt/data/heart-disease-project', 'data', 'processed', 'heart_clean.csv')
# to_csv does not create missing parent folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(clean_path), exist_ok=True)
df.to_csv(clean_path, index=False)
print('Saved cleaned data to:', clean_path)


## 15. Final auto-summary of EDA


In [None]:
# Recap: dataset size, strongest target correlations, columns with gaps, outlier-heavy columns
n_rows, n_columns = df.shape
print('--- EDA SUMMARY ---')
print(f'Total rows: {n_rows}, Total columns: {n_columns}')

print('\nTop numeric features correlated with target:')
display(corr_df.head(10))

print('\nColumns with missing values:')
null_counts = df.isnull().sum()
display(null_counts[null_counts > 0])

print('\nColumns with most outliers:')
display(outlier_df.head(10))

print('\nRecommendations:')
recommendations = (
    ' - Handle class imbalance if disease prevalence is low (resampling or class weights).',
    ' - Consider log-transform or robust scaling for skewed features.',
    ' - Impute missing values (median for numeric, mode for categorical) or use model-based imputers.',
    ' - Use SHAP/feature importance after modeling to get clinical explanations.',
)
for recommendation in recommendations:
    print(recommendation)
