<center><h1 style="font-size:60px;"><b>Data Visualization</b></h1></center>
<b>This notebook downloads the HeartDisease data from GitHub, preprocesses it, and saves it for further use. It then creates several visualizations of the data: a correlation-matrix heatmap, histograms, and PCA scatter plots.</b>


<h2 style="font-size:60px;"><b>Import Packages</b></h2>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
import numpy as np
from sklearn.utils import resample
from dython import nominal
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from mpl_toolkits.axes_grid1 import make_axes_locatable

<h2 style="font-size:60px;"><b>Get the Data from GitHub, Clean It, and Save It for Further Use</b></h2>

In [None]:
# Location of the raw CSV file (served from GitHub).
heart_data_url = "https://raw.githubusercontent.com/mrsonuk/LDS_Test/main/Test_data.csv"

# Load the data; the first CSV column becomes the row index.
correlation_heartfailure_df = pd.read_csv(heart_data_url, index_col=0)

In [None]:
# Give the outcome column a name without a slash in it.
correlation_heartfailure_df = correlation_heartfailure_df.rename(
    columns={"Dead/Alive": "DeadorAlive"}
)

# Normalise the misspelled GenHealth labels before encoding them.
correlation_heartfailure_df["GenHealth"] = correlation_heartfailure_df[
    "GenHealth"
].replace(["Goo_d", "goo_d", "ExCellent"], ["Good", "Good", "Excellent"])

# Ordinal encoding of GenHealth: 0 = "Poor" ... 4 = "Excellent".
correlation_heartfailure_df["GenHealth"] = correlation_heartfailure_df[
    "GenHealth"
].replace(["Poor", "Fair", "Good", "Very good", "Excellent"], [0, 1, 2, 3, 4])

# Age bands listed from youngest to oldest. The position in this list is the
# ordinal code assigned to the band, so the order must be strictly ascending
# ("30-34" has to come before "35-39").
age_categories_in_order = [
    "18-24",
    "25-29",
    "30-34",
    "35-39",
    "40-44",
    "45-49",
    "50-54",
    "55-59",
    "60-64",
    "65-69",
    "70-74",
    "75-79",
    "80 or older",
]
correlation_heartfailure_df["AgeCategory"] = correlation_heartfailure_df[
    "AgeCategory"
].replace(age_categories_in_order, list(range(len(age_categories_in_order))))

# Identifier and date columns are not used by any of the later analysis.
correlation_heartfailure_df = correlation_heartfailure_df.drop(
    ["Patient_ID", "date"], axis=1
)

# Second name for the same cleaned frame (an alias, not a copy); the
# pre-processing and plotting cells below use this name.
heartfailure_df = correlation_heartfailure_df

# Persist the cleaned data for later use.
correlation_heartfailure_df.to_csv("Heart_disease.csv")

<h2 style="font-size:60px;"><b>Correlation Heatmap</b></h2>

In [None]:
# Columns that are passed to dython as the nominal (categorical) features.
nominal_feature_names = [
    "HeartDisease",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "Sex",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
    "DeadorAlive",
]

# Compute the feature-association matrix and draw it as a heatmap.
# Nominal-nominal pairs use the "cramer" association measure.
# To also write the figure to disk, pass
# filename="correlation_of_the_features.png".
res = nominal.associations(
    correlation_heartfailure_df,
    nominal_feature_names,
    nom_nom_assoc="cramer",
    figsize=(10, 10),
)

In [None]:
# Add a title to the heatmap axes returned by the associations call.
heatmap_ax = res["ax"]
heatmap_ax.set_title("Features correlation Heatmap", fontsize=25)

# Last expression of the cell: display the figure again, now with its title.
heatmap_ax.figure

<h2 style="font-size:60px;"><b>Pre-processing</b></h2>

In [None]:
# Balance the classes: downsample the patients without HeartDisease so that
# both groups contain the same number of rows.
heartfailure_df_yes = heartfailure_df[heartfailure_df["HeartDisease"] == "Yes"]
heartfailure_df_no = heartfailure_df[heartfailure_df["HeartDisease"] == "No"]

# Downsampling draws WITHOUT replacement so that no "No" row appears twice in
# the balanced set. Sampling with replacement is used only as a fallback when
# the "No" group has fewer rows than the "Yes" group, because drawing more rows
# than exist is impossible without replacement.
n_positive = len(heartfailure_df_yes)
heartfailure_df_no_downsample = resample(
    heartfailure_df_no,
    replace=len(heartfailure_df_no) < n_positive,
    n_samples=n_positive,
    random_state=42,  # fixed seed so the sample is reproducible
)

# Balanced frame: sampled "No" rows followed by all "Yes" rows.
heartfailure_df_downsampled = pd.concat(
    [heartfailure_df_no_downsample, heartfailure_df_yes]
)

<h2 style="font-size:60px;"><b>Some figures</b></h2>

In [None]:
# Display the column names of the downsampled "No HeartDisease" frame.
heartfailure_df_no_downsample.columns

In [None]:
# Columns available for the histogram figure. Pick the ones to visualise by
# assigning their names to data2_column ... data6_column below.
show_columns = [
    "HeartDisease",
    "BMI",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "PhysicalHealth",
    "MentalHealth",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "GenHealth",
    "SleepTime",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
    "DeadorAlive",
]

# Columns drawn in panels 2-6 of the histogram figure (panel 1 is always BMI).
data2_column = "PhysicalHealth"
data3_column = "MentalHealth"
data4_column = "SleepTime"
data5_column = "Stroke"
data6_column = "SkinCancer"

# Legend entries, in the order [without HeartDisease, with HeartDisease].
labels = ["No HeartDisease", "HeartDisease present"]

In [None]:
# 3x2 grid of histograms comparing patients without HeartDisease (pink) and
# with HeartDisease (green). density=True normalises each histogram, so the
# two groups are comparable regardless of their sizes.
fig, axes = plt.subplots(3, 2, figsize=(10, 20))
ax0, ax1, ax2, ax3, ax4, ax5 = axes.flat

# Panel 1: BMI, drawn as two overlaid (semi-transparent) histograms.
ax0.hist(
    heartfailure_df_no_downsample["BMI"],
    color="pink",
    label="No HeartDisease",
    density=True,
)
ax0.hist(
    heartfailure_df_yes["BMI"],
    color="green",
    label="HeartDisease present",
    alpha=0.6,
    density=True,
)
ax0.set_title("BMI", fontweight="bold")
ax0.set_xlabel("BMI values")
ax0.set_ylabel("Frequency (percentage)")
ax0.legend()

# Panels 2-6: one configurable column per panel, the two groups drawn as
# side-by-side bars. Columns are looked up with df[column] rather than by
# evaluating a code string, so any column name works and nothing is executed.
panel_columns = (
    data2_column,
    data3_column,
    data4_column,
    data5_column,
    data6_column,
)
for ax, column in zip((ax1, ax2, ax3, ax4, ax5), panel_columns):
    ax.hist(
        [heartfailure_df_no_downsample[column], heartfailure_df_yes[column]],
        color=["pink", "green"],
        label=labels,
        density=True,
    )
    ax.set_title(column, fontweight="bold")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency (percentage)")
    ax.legend()

plt.show()

<h2 style="font-size:60px;"><b>Pre-processing for PCA</b></h2>

In [None]:
# One-hot encode the two multi-level nominal columns.
heartfailure_df_downsampled = pd.get_dummies(
    heartfailure_df_downsampled, columns=["Diabetic", "Race"]
)

# Give the generated dummy columns names without spaces or punctuation.
heartfailure_df_downsampled = heartfailure_df_downsampled.rename(
    columns={
        "Diabetic_No, borderline diabetes": "DiabeticBorderline",
        "Diabetic_Yes (during pregnancy)": "DiabeticYesPregnancy",
        "Race_American Indian/Alaskan Native": "RaceAmericanIndianAlaskanNative",
        "Race_Asian": "RaceAsian",
        "Race_Black": "RaceBlack",
        "Race_Hispanic": "RaceHispanic",
        "Race_Other": "RaceOther",
        "Race_White": "RaceWhite",
    }
)

# DeadorAlive is excluded from the PCA input.
heartfailure_df_downsampled = heartfailure_df_downsampled.drop(columns=["DeadorAlive"])

# Binary Yes/No columns are encoded as 1/0.
categorical_features = [
    "HeartDisease",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "PhysicalActivity",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
]
for feature in categorical_features:
    heartfailure_df_downsampled[feature] = heartfailure_df_downsampled[feature].replace(
        {"No": 0, "Yes": 1}
    )

# Sex is encoded as Male = 0, Female = 1.
heartfailure_df_downsampled["Sex"] = heartfailure_df_downsampled["Sex"].replace(
    {"Male": 0, "Female": 1}
)

# Continuous, count and ordinal columns are standardised to zero mean and unit
# variance. StandardScaler works column by column, so a single fit over all of
# these columns yields the same values as fitting one scaler per column.
continous_and_count_data_and_ordinal = [
    "BMI",
    "SleepTime",
    "PhysicalHealth",
    "MentalHealth",
    "AgeCategory",
    "GenHealth",
]
scaler = StandardScaler()
heartfailure_df_downsampled[continous_and_count_data_and_ordinal] = (
    scaler.fit_transform(
        heartfailure_df_downsampled[continous_and_count_data_and_ordinal]
    )
)

<h2 style="font-size:60px;"><b>PCA plots</b></h2>

In [None]:
# Project the pre-processed frame onto its first two principal components.
# NOTE(review): the frame passed here still contains the "HeartDisease" column,
# so the outcome itself contributes to the projection that is later coloured
# by that same column - confirm this is intended.
pca = PCA(n_components=2)
components = pca.fit_transform(heartfailure_df_downsampled)

In [None]:
# Display the column names of the pre-processed frame that was fed to the PCA.
heartfailure_df_downsampled.columns

In [None]:
# Columns that can be shown on the second PCA plot (next to the HeartDisease
# plot). Choose one by its index in this list.
show_columns = [
    "HeartDisease",
    "BMI",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "PhysicalHealth",
    "MentalHealth",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "PhysicalActivity",
    "GenHealth",
    "SleepTime",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
    "Diabetic_No",
    "DiabeticBorderline",
    "Diabetic_Yes",
    "DiabeticYesPregnancy",
    "RaceAmericanIndianAlaskanNative",
    "RaceAsian",
    "RaceBlack",
    "RaceHispanic",
    "RaceOther",
    "RaceWhite",
]
# Index 9 selects "AgeCategory".
show_column = show_columns[9]

In [None]:
# Two PCA scatter plots side by side: the left one is coloured by HeartDisease,
# the right one by the column selected in show_column.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 6))

# Both panels share the same coordinates: the two principal components.
first_component = components[:, 0]
second_component = components[:, 1]

# Left panel: HeartDisease.
im0 = ax0.scatter(
    first_component,
    second_component,
    c=heartfailure_df_downsampled["HeartDisease"],
    cmap="viridis",
    alpha=0.5,
)
ax0.set_title("HeartDisease", fontweight="bold")
fig.colorbar(im0, ax=ax0)

# Right panel: the user-selected column.
im1 = ax1.scatter(
    first_component,
    second_component,
    c=heartfailure_df_downsampled[show_column],
    cmap="gnuplot",
    alpha=0.5,
)
ax1.set_title(show_column, fontweight="bold")
fig.colorbar(im1, ax=ax1)

plt.show()

# Reading hint for the colour scales, printed below the figure.
print(
    "High values of all features indicate increased presence of such features except physicalhealth and mentalhealth"
)