In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

After importing the needed libraries, we read the data from our CSV file.

In [None]:
# Path to the CSV file; resolved against the current working directory.
path = '2022/heart_2022_with_nans.csv'

# Load the whole file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

# Show the first rows as a quick sanity check that the load worked.
print(data.head(5))

Then we list all the columns in our dataset and check for missing values, so that we can drop the empty data and clean things up.

In [None]:
# Step 1: List all columns and number of records
print("Columns:", data.columns)
print("Number of records:", data.shape[0])

# Step 2: Check for missing values
# Indexing with a boolean frame (data[data.isna()]) keeps only the cells where
# the mask is True -- i.e. the NaN cells -- and blanks out everything else, so
# the result is a frame made up entirely of NaN. Counting the missing entries
# per column gives a report that is actually informative.
missing_values = data.isna().sum()
print("Missing values: ", missing_values)

The last step of our cleaning process is to drop the rows with missing values and confirm that none remain.

In [None]:
# Step 3: Drop rows with missing values
# dropna() returns a new frame, so the raw `data` frame stays available.
data_cleaned = data.dropna()
print("Number of records after dropping missing values:", data_cleaned.shape[0])

# Verify the cleanup with a total count of remaining missing cells (expected: 0).
# data_cleaned[data_cleaned.isna()] would only print a frame full of NaN and
# could not confirm anything.
print("Missing values after dropping: ", data_cleaned.isna().sum().sum())

Next, we wanted to display a few basic visualizations. We decided to create bar graphs showing the relationship between heart attacks and a person's sex and general health.

In [None]:
# Categorical columns to compare against heart attack status.
categorical_cols = ['Sex', 'GeneralHealth']

# One grouped bar chart per column: category counts split by HadHeartAttack.
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=data_cleaned, x=col, hue='HadHeartAttack', ax=ax)

    ax.set_title(f'{col} Distribution by Heart Attack Status')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.legend(title='Had Heart Attack')

    # Tilt the category labels so longer names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=30)

    fig.tight_layout()
    plt.show()

For our histograms, we decided to look at how BMI and SleepHours are distributed by heart attack status.

In [None]:
# Numeric columns whose distributions are compared by heart attack status.
numeric_cols = ['BMI', 'SleepHours']

# One histogram (30 bins, with a KDE overlay) per column, coloured by HadHeartAttack.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(
        data=data_cleaned,
        x=col,
        hue='HadHeartAttack',
        kde=True,
        bins=30,
        ax=ax,
    )

    ax.set_title(f'{col} Distribution by Heart Attack Status')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')

    fig.tight_layout()
    plt.show()

And for our pie charts, we decided to check the relationship between COVID-positive and COVID-negative status and heart attack risk, along with flu-vaccinated persons and their risk of heart attack.

In [None]:
binary_cols = ['CovidPos', 'FluVaxLast12']

# Readable label and colour for each HadHeartAttack value. Looking these up by
# the crosstab's own column names keeps every slice correctly labelled even if
# the column order differs or one outcome is absent.
slice_labels = {'No': 'No Heart Attack', 'Yes': 'Heart Attack'}
slice_colors = {'No': '#66b3ff', 'Yes': '#ff9999'}
fallback_color = '#cccccc'  # used only for an unexpected HadHeartAttack value

for col in binary_cols:
    # Count heart attack outcomes per category of `col`. Both series come from
    # data_cleaned so the table does not rely on implicit index alignment
    # between the raw and the cleaned frame.
    cross = pd.crosstab(data_cleaned[col], data_cleaned['HadHeartAttack'])

    # One pie per category value, showing the outcome split within it.
    for val in cross.index:
        counts = cross.loc[val]
        plt.figure(figsize=(5, 5))
        plt.pie(counts,
                labels=[slice_labels.get(outcome, str(outcome)) for outcome in counts.index],
                autopct='%1.1f%%', startangle=90,
                colors=[slice_colors.get(outcome, fallback_color) for outcome in counts.index])
        plt.title(f'Heart Attack Rate for {col} = {val}')
        plt.tight_layout()
        plt.show()

Now comes the deeper push! We worked together to find a way of creating a heatmap of all the numeric columns in our dataset. This allows us to look deeper into potential correlations and consider whether any of them might point to causation.

In [None]:

# Work on a copy of the cleaned frame and add a 0/1 encoding of the target.
df_hm = data_cleaned.copy()
df_hm['HadHeartAttack_flag'] = df_hm['HadHeartAttack'].map({'No': 0, 'Yes': 1})

# Numeric feature columns, with the freshly added flag left out for now.
numeric_feats = (
    df_hm.select_dtypes(include='number')
    .columns
    .drop('HadHeartAttack_flag')
)

# Correlation matrix over the numeric features plus the flag (flag goes last).
all_cols = [*numeric_feats, 'HadHeartAttack_flag']
corr = df_hm[all_cols].corr()

# Correlation of every column with the target flag.
corr_with_target = corr['HadHeartAttack_flag']

# Replace each feature's diagonal cell (its self-correlation) with its
# correlation to the target. The flag's own diagonal cell is left as is.
for feat in numeric_feats:
    corr.loc[feat, feat] = corr_with_target[feat]

# Draw the annotated heatmap, with the colour scale centred on zero.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap\n(diagonal = corr(feature, HadHeartAttack))")
fig.tight_layout()
plt.show()

Q: What are some limitations of this dataset?

A: General Health being non-numeric is definitely a limitation of this dataset.

Q: What kinds of biases or missing contexts might affect a model trained on this data?

A: This data set may not represent the entire population and may have skewed numbers in relation to age groups, genders, and geographic locations. 

Q: If you built an AI model using this dataset, what are some ethical or practical concerns you'd want to raise?

A: I would ensure that the model is trained only using clinical data filtered by medical professionals.

Q: How did you use AI in this process? Was it helpful? Frustrating? Would you do anything differently next time?

A: AI is okay at helping from time to time. I tried to use ChatGPT for the data visualization and it didn't understand what I wanted for the heatmap. It lacked a lot of context and understanding of the columns and parameters we were working with. It requires a good chunk of fine-tuning in order to get the result you are looking for.