In [1]:
import pandas as pd
import numpy as np

# --- 1. Load the Dataset ---

In [2]:
# Read the data that is inside of the CSV
df = pd.read_csv("./Health_Data/cleaned_health.csv")
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
298,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
299,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
300,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [3]:
NUMERICAL_COLS = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
CATEGORICAL_COLS = [
    'sex', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal',
    'cp_0', 'cp_1', 'cp_2', 'cp_3', 'target'
]

# --- 2. Perform Descriptive Statistics for Numerical Columns ---

In [4]:
print("\n--- Descriptive Statistics for Numerical Columns ---")
print("Summary statistics (count, mean, std, min, 25%, 50% (median), 75%, max):")
# Select only numerical columns that are actually in the DataFrame
actual_numerical_cols = [col for col in NUMERICAL_COLS if col in df.columns]

if actual_numerical_cols:
    # Ensure numerical columns are numeric type before describing
    for col in actual_numerical_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    print(df[actual_numerical_cols].describe().round(2)) # Round for cleaner output
else:
    print("No numerical columns found from the defined list.")


--- Descriptive Statistics for Numerical Columns ---
Summary statistics (count, mean, std, min, 25%, 50% (median), 75%, max):
          age  trestbps    chol  thalach  oldpeak
count  302.00    302.00  302.00   302.00   302.00
mean    54.42    131.60  246.50   149.57     1.04
std      9.05     17.56   51.75    22.90     1.16
min     29.00     94.00  126.00    71.00     0.00
25%     48.00    120.00  211.00   133.25     0.00
50%     55.50    130.00  240.50   152.50     0.80
75%     61.00    140.00  274.75   166.00     1.60
max     77.00    200.00  564.00   202.00     6.20


# --- 3. Perform Frequency Counts for Categorical Columns ---

In [5]:
print("\n--- Frequency Counts for Categorical Columns ---")
# Select only categorical columns that are actually in the DataFrame
actual_categorical_cols = [col for col in CATEGORICAL_COLS if col in df.columns]

if actual_categorical_cols:
    for col in actual_categorical_cols:
        print(f"\n--- Column: '{col}' ---")
        # Handle potential NaNs in categorical columns before value_counts
        # If NaNs are present, decide whether to include them in counts or not
        # .fillna('Missing') can be used if you want to count NaNs as a category
        # .dropna() can be used if you want to exclude NaNs from counts
        value_counts = df[col].value_counts(dropna=False) # Include NaN counts
        percentages = df[col].value_counts(normalize=True, dropna=False).mul(100).round(2) # Calculate percentages

        # Combine counts and percentages into a single DataFrame for better readability
        summary_df = pd.DataFrame({'Count': value_counts, 'Percentage': percentages})
        print(summary_df)
else:
    print("No categorical columns found from the defined list.")


--- Frequency Counts for Categorical Columns ---

--- Column: 'sex' ---
   Count  Percentage
1    206       68.21
0     96       31.79

--- Column: 'fbs' ---
   Count  Percentage
0    257        85.1
1     45        14.9

--- Column: 'restecg' ---
   Count  Percentage
1    151       50.00
0    147       48.68
2      4        1.32

--- Column: 'exang' ---
   Count  Percentage
0    203       67.22
1     99       32.78

--- Column: 'slope' ---
   Count  Percentage
2    141       46.69
1    140       46.36
0     21        6.95

--- Column: 'ca' ---
   Count  Percentage
0    175       57.95
1     65       21.52
2     38       12.58
3     20        6.62
4      4        1.32

--- Column: 'thal' ---
   Count  Percentage
2    165       54.64
3    117       38.74
1     18        5.96
0      2        0.66

--- Column: 'cp_0' ---
   Count  Percentage
0    159       52.65
1    143       47.35

--- Column: 'cp_1' ---
   Count  Percentage
0    252       83.44
1     50       16.56

--- Column: 'cp_2'

# --- 4. Special Case: Relationship between 'target' and other categorical features ---

In [6]:
print("\n--- Relationship with 'target' variable (Disease Presence) ---")
print("Cross-tabulations and percentages for key categorical features vs. 'target':")

# Example: Sex vs. Target
if 'sex' in df.columns and 'target' in df.columns:
    print("\nSex vs. Target:")
    sex_target_crosstab = pd.crosstab(df['sex'], df['target'])
    print(sex_target_crosstab)
    print("\nPercentages (rows sum to 100%):")
    print(pd.crosstab(df['sex'], df['target'], normalize='index').mul(100).round(2))

# Example: Chest Pain Type (cp_0 to cp_3) vs. Target
# We can reconstruct 'cp' for better readability if needed, or use the one-hot encoded columns directly
# For simplicity, let's look at each cp_X column against target
for cp_col in ['cp_0', 'cp_1', 'cp_2', 'cp_3']:
    if cp_col in df.columns and 'target' in df.columns:
        print(f"\n{cp_col} (Chest Pain Type) vs. Target:")
        cp_target_crosstab = pd.crosstab(df[cp_col], df['target'])
        print(cp_target_crosstab)
        print("\nPercentages (rows sum to 100%):")
        print(pd.crosstab(df[cp_col], df['target'], normalize='index').mul(100).round(2))

# You can add more cross-tabulations for other categorical columns as needed
# e.g., 'fbs' vs 'target', 'exang' vs 'target'
if 'fbs' in df.columns and 'target' in df.columns:
    print("\nFasting Blood Sugar (fbs) vs. Target:")
    fbs_target_crosstab = pd.crosstab(df['fbs'], df['target'])
    print(fbs_target_crosstab)
    print("\nPercentages (rows sum to 100%):")
    print(pd.crosstab(df['fbs'], df['target'], normalize='index').mul(100).round(2))

if 'exang' in df.columns and 'target' in df.columns:
    print("\nExercise Induced Angina (exang) vs. Target:")
    exang_target_crosstab = pd.crosstab(df['exang'], df['target'])
    print(exang_target_crosstab)
    print("\nPercentages (rows sum to 100%):")
    print(pd.crosstab(df['exang'], df['target'], normalize='index').mul(100).round(2))

print("\nDescriptive statistics generation complete.")


--- Relationship with 'target' variable (Disease Presence) ---
Cross-tabulations and percentages for key categorical features vs. 'target':

Sex vs. Target:
target    0   1
sex            
0        24  72
1       114  92

Percentages (rows sum to 100%):
target      0      1
sex                 
0       25.00  75.00
1       55.34  44.66

cp_0 (Chest Pain Type) vs. Target:
target    0    1
cp_0            
0        34  125
1       104   39

Percentages (rows sum to 100%):
target      0      1
cp_0                
0       21.38  78.62
1       72.73  27.27

cp_1 (Chest Pain Type) vs. Target:
target    0    1
cp_1            
0       129  123
1         9   41

Percentages (rows sum to 100%):
target      0      1
cp_1                
0       51.19  48.81
1       18.00  82.00

cp_2 (Chest Pain Type) vs. Target:
target    0   1
cp_2           
0       120  96
1        18  68

Percentages (rows sum to 100%):
target      0      1
cp_2                
0       55.56  44.44
1       20.93  79.07

c