# Dataset (Population) Differences

We're now going to investigate the population differences between datasets. The comparison covers six dimensions:

1. **Dataset Size & Completeness**: How many observations each dataset has and the percent of rows that do not contain any missing values.
2. **Feature Level Missingness**: Compare which features are missing in each dataset and in what quantities.
3. **CVD Class Distribution**: Test whether disease severity differs across populations using a chi-square test of independence (*super important and useful to know*).
4. **Numeric Features**: Compare age, blood pressure, cholesterol, etc. across datasets using Kruskal-Wallis tests (a non-parametric alternative to ANOVA).
5. **Categorical Features**: Compare sex, chest pain type, etc. across datasets using chi-square tests.
6. **Correlation Structure**: Check whether the relationships between features differ by population.

This analysis helps with building a multi-class CVD prediction model for a few reasons, mainly: 

- Population differences may require different feature weights.
- Different CVD prevalence means you'll need stratified sampling.
- Common risk factors for the development of cardiovascular disease can be validated by checking which features are consistently associated with CVD Class across all datasets.

In [None]:
from utils.data_quality import calculate_missingness_summary, create_data_quality_report
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
from scipy.stats import chi2_contingency

ImportError: cannot import name 'missingness_report' from 'utils.data_quality' (/home/darkii/Heart-Disease-Prediction/utils/data_quality.py)

In [None]:
from load_datasets import df_combined as df, DATASET_NAMES

In [4]:
# Preview the first rows of the combined dataframe to sanity-check the load.
df.head()

Unnamed: 0,Age,Sex,Chest Pain,Rest BP,Chol,FBS,Rest ECG,Max HR,Ex Angina,Oldpeak,Slope,Ca,Thal,CVD Class,Dataset
0,44,1,4,130,209,0,1,127,0,0.0,?,?,?,0,VA Long Beach
1,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2,VA Long Beach
2,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1,VA Long Beach
3,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0,VA Long Beach
4,66,1,3,120,0,0,1,120,0,-0.5,1,?,?,0,VA Long Beach


In [None]:
# Summarise missing values for the combined dataframe; the result is plotted
# further down as a bar chart and an annotated heatmap.
# NOTE(review): the plotting cell labels it as features (rows) x datasets
# (columns) holding percentages — confirm against utils.data_quality.
missingness_comparison = calculate_missingness_summary(df)

In [None]:

# Build the data-quality report for the combined dataframe.
# NOTE(review): this depends on the import cell at the top having run
# successfully; `overview_stats` is not used by any cell visible here.
overview_stats = create_data_quality_report(df)

NameError: name 'create_data_quality_report' is not defined

In [None]:
from utils.data_visualizations import plot_missingness_heatmap 

In [None]:
# Visualize missingness patterns side by side: grouped horizontal bars on the
# left and an annotated heatmap on the right, both driven by
# `missingness_comparison`.
# NOTE(review): the axis labels assume rows are features and columns are
# datasets — confirm against calculate_missingness_summary.
fig, ax = plt.subplots(ncols=2, figsize=(14, 5))
bar_ax, heat_ax = ax

# Transposed so that each dataset becomes one group of bars, one bar per
# feature (the features end up in the legend).
missingness_comparison.T.plot(kind='barh', ax=bar_ax)
bar_ax.set_title('Feature Missingness Across Datasets')
bar_ax.set_ylabel('Dataset')
bar_ax.set_xlabel('Missing (%)')
bar_ax.legend(title='Features', bbox_to_anchor=(1.05, 1), loc='upper left')
# The bars are horizontal, so the values are read off the x-axis; put the
# grid lines there rather than on the categorical y-axis.
bar_ax.grid(axis='x', alpha=0.3)

# Heatmap version
sns.heatmap(missingness_comparison, annot=True, fmt='.1f', cmap='YlOrRd',
            cbar_kws={'label': 'Missing %'}, ax=heat_ax)
heat_ax.set_title('Missingness Heatmap: Features vs Datasets')
heat_ax.set_xlabel('Dataset')
heat_ax.set_ylabel('Feature')

plt.tight_layout()
plt.show()

In [None]:
# CVD class frequencies per dataset, followed by a chi-square test of
# independence between dataset membership and CVD class.
dataset_col = df['Dataset']
cvd_col = df['CVD Class']

# Raw counts, with a 'Total' row and column appended.
cvd_crosstab = pd.crosstab(
    dataset_col,
    cvd_col,
    margins=True,
    margins_name='Total',
)
print("\nAbsolute counts:", cvd_crosstab, sep="\n")

# Row-normalised shares (each dataset sums to 100), without the margins.
cvd_pct = pd.crosstab(dataset_col, cvd_col, normalize='index').mul(100)
print("\nPercentage distribution:", cvd_pct.round(2), sep="\n")

# The test itself runs on the plain count table (no margins).
contingency = pd.crosstab(dataset_col, cvd_col)
chi2, p_value, dof, expected = chi2_contingency(contingency)

print(
    "\nChi-square test for independence:",
    f"Chi-square statistic: {chi2:.4f}",
    f"P-value: {p_value:.4e}",
    f"Degrees of freedom: {dof}",
    sep="\n",
)

In [None]:
# CVD class distribution per dataset, drawn twice from `cvd_pct`:
# stacked bars on the left, grouped bars on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One colour per CVD class column, in column order.
colors = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c', '#9b59b6']

# (target axes, stacked?, title) for each panel; everything else is shared.
panels = [
    (axes[0], True, 'CVD Class Distribution by Dataset (Stacked %)'),
    (axes[1], False, 'CVD Class Distribution by Dataset (Grouped %)'),
]

for panel_ax, is_stacked, panel_title in panels:
    cvd_pct.plot(kind='bar', stacked=is_stacked, ax=panel_ax, color=colors)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Dataset')
    panel_ax.set_ylabel('Percentage')
    # Legend sits outside the plotting area so it never covers the bars.
    panel_ax.legend(title='CVD Class', bbox_to_anchor=(1.05, 1), loc='upper left')
    panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation=45, ha='right')

# Only the grouped chart gets horizontal guide lines.
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Visualize categorical features: one grouped bar chart per feature showing,
# for every dataset, the percentage of rows in each category.
# `categorical_features` is expected to be defined in an earlier cell.
features = list(categorical_features)

# Three panels per row; the number of rows follows the number of features
# (ceiling division), so five features still give the 2x3, 18x10 layout.
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
axes = axes.flatten()

for idx, feature in enumerate(features):
    # Row-normalised so datasets of different sizes are comparable.
    pct_tab = pd.crosstab(df['Dataset'], df[feature],
                          normalize='index') * 100
    pct_tab.plot(kind='bar', ax=axes[idx], stacked=False)
    axes[idx].set_title(f'{feature} Distribution', fontweight='bold')
    axes[idx].set_xlabel('Dataset')
    axes[idx].set_ylabel('Percentage')
    axes[idx].legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right')
    axes[idx].grid(axis='y', alpha=0.3)

# Hide only the panels that received no feature, instead of always hiding
# the last one (which would blank a real plot when the grid is full).
for unused_ax in axes[len(features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmaps, one panel per dataset, on a 2x2 grid (so at most four
# datasets fit; a fifth would index past the end of `axes`).
# NOTE(review): `dfs` is not defined or imported in any cell visible here
# (only `df` and `DATASET_NAMES` are imported) — confirm it is provided
# elsewhere and ordered like DATASET_NAMES.
fig, axes = plt.subplots(2, 2, figsize=(18, 16))
axes = axes.flatten()

for idx, (name, _df) in enumerate(zip(DATASET_NAMES, dfs)):
    # Select only numeric columns that exist
    numeric_cols = _df.select_dtypes(include=["number"]).columns.tolist()
    # Defensive: drop the dataset label in case it is stored as a number.
    if 'Dataset' in numeric_cols:
        numeric_cols.remove('Dataset')
    
    # Pairwise correlations over the numeric columns only.
    # NOTE(review): columns holding '?' placeholders are presumably read as
    # non-numeric and would be left out here — verify.
    corr_matrix = _df[numeric_cols].corr()
    
    # Fixed [-1, 1] colour range centred on 0 so panels share one scale.
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
                square=True, ax=axes[idx], cbar_kws={'shrink': 0.8},
                vmin=-1, vmax=1)
    axes[idx].set_title(f'{name} Correlation Matrix')