In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [None]:
# Heart Disease Dataset Columns, in file order (passed as `names=` when the
# data file is read).
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
           'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

# Continuous measurements. `trestbps` (resting blood pressure) belongs here:
# it is a numeric reading, not a category, so it should be summarised and
# plotted with the other quantitative variables.
quant_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Discrete / coded variables used for grouping, counting and hue.
cat_columns = ['sex', 'ca', 'thal', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']

In [None]:
# Sanity check: how many column names were defined for the dataset.
len(columns)

In [None]:
# Read the data file into a DataFrame, labelling the columns with `columns`.
# NOTE(review): na_values=None is the pandas default, so no extra missing-value
# markers are recognised here. If the file marks missing entries with a token
# such as '?', those columns will load as strings - confirm against the data.
df = pd.read_csv(
    './processed.cleveland.data',
    names=columns,
    na_values=None,
    skipinitialspace=True,
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) using describe()'s defaults.
df.describe()

In [None]:
# Summary restricted to object-dtype columns (include=['O']).
df.describe(include=['O'])

In [None]:
# Kurtosis of the `age` column.
df['age'].kurtosis()

In [None]:
# Print the mode, skewness and kurtosis of every quantitative column,
# followed by a dashed separator between columns.
for column in quant_columns:
    values = df[column]
    report = [
        f'Mode for {column}', values.mode(), '\n',
        f'Skewness for {column}', values.skew(), '\n',
        f'Kurtosis for {column}', values.kurtosis(),
        '--------------------', '\n',
    ]
    for item in report:
        print(item)

In [None]:
# Interquartile range of cholesterol: IQR = Q3 - Q1.
q1, q3 = np.percentile(df['chol'], [25, 75])
IQR = q3 - q1
print('IQR for Cholesterol: ', IQR)

In [None]:
# Histogram with a KDE overlay for each quantitative column, one figure each.
for col in quant_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f'Distribution for {col} variable')
    plt.show()

In [None]:
# Kernel density estimate for each quantitative column, one figure each.
for col in quant_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.kdeplot(data=df, x=col, ax=ax)
    ax.set_title(f'Kernel Density plot of {col} Variable')
    plt.show()

In [None]:
# Bar chart of value counts for each categorical column, one figure each.
for col in cat_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set(
        title=f'Frequency of {col}',
        xlabel=f'{col}',
        ylabel='Frequency',
    )
    plt.show()

In [None]:
# Box-and-whisker plot for each quantitative column, one figure each.
for col in quant_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(f'Box and Whiskers plot of {col}')
    plt.show()

In [None]:
# Mean of every quantitative column within each level of every categorical
# column, drawn as a bar chart without error bars (ci=None).
for cat in cat_columns:
    for quant in quant_columns:
        fig, ax = plt.subplots(figsize=(5, 5))
        sns.barplot(data=df, x=cat, y=quant, ci=None, ax=ax)
        ax.set(
            title=f'Average {quant} by {cat}',
            xlabel=cat,
            ylabel=f'Average {quant}',
        )
        plt.show()

In [None]:
# Distribution of every quantitative column within each level of every
# categorical column, drawn as box plots.
for cat in cat_columns:
    for quant in quant_columns:
        fig = plt.figure(figsize=(5, 5))

        # Boxplot
        sns.boxplot(x=cat, y=quant, data=df)

        # Plot Title and Labels. A box plot shows the median, quartiles and
        # outliers rather than a mean, so the labels say "Distribution" and
        # the y-axis carries the raw variable name.
        plt.title(f'Distribution of {quant} by {cat}')
        plt.xlabel(cat)
        plt.ylabel(quant)

        plt.show()

In [None]:
# Box plots of each quantitative column split by two categorical columns:
# one on the x-axis and a different one as the hue. Every ordered pair of
# distinct categorical columns is drawn.
for quant in quant_columns:
    for cat1 in cat_columns:
        for cat2 in cat_columns:
            if cat1 == cat2:
                # Same column on x and hue adds no information; skip it.
                continue

            # catplot is figure-level and creates its own figure.
            sns.catplot(data=df, x=cat1, y=quant, hue=cat2, kind='box')

            plt.title(f'Distribution of {quant} by {cat1} and {cat2}')
            plt.xlabel(cat1)
            plt.ylabel(quant)

            plt.show()

In [None]:
# Violin plots of each quantitative column split by two categorical columns:
# one on the x-axis and a different one as the hue. Every ordered pair of
# distinct categorical columns is drawn.
for quant in quant_columns:
    for cat1 in cat_columns:
        for cat2 in cat_columns:
            if cat1 == cat2:
                # Same column on x and hue adds no information; skip it.
                continue

            # catplot is figure-level and creates its own figure.
            sns.catplot(data=df, x=cat1, y=quant, hue=cat2, kind='violin')

            plt.title(f'Distribution of {quant} by {cat1} and {cat2}')
            plt.xlabel(cat1)
            plt.ylabel(quant)

            plt.show()

In [None]:
# Scatterplot for every unordered pair of quantitative columns. Slicing from
# i + 1 onwards visits each pair once and never pairs a column with itself.
for i, quant1 in enumerate(quant_columns):
    for quant2 in quant_columns[i + 1:]:
        fig, ax = plt.subplots(figsize=(5, 5))
        sns.scatterplot(data=df, x=quant1, y=quant2, ax=ax)
        ax.set(
            title=f'Scatterplot of {quant1} vs {quant2}',
            xlabel=quant1,
            ylabel=quant2,
        )
        plt.show()

In [None]:
# Pairwise scatterplots of the quantitative columns with a KDE curve on the
# diagonal. The value must be spelled 'kde'; an unrecognised diag_kind leaves
# the diagonal panels empty.
sns.pairplot(df[quant_columns], diag_kind='kde')
# y slightly above 1 lifts the figure title clear of the top row of panels.
plt.suptitle('Pairplot of Quantitative Variables in the Heart Disease Dataset', y=1.02)
plt.show()

In [None]:
# Pairwise correlations between the quantitative columns.
correlation_matrix = df[quant_columns].corr()

# Annotated heatmap; vmin/vmax pin the colour scale to the full [-1, 1]
# correlation range so colours are comparable regardless of the data.
fig = plt.figure(figsize=(14,12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True, vmin=-1, vmax=1)

plt.title('Correlation Heatmap of Quantitative Variables in Heart Disease Dataset')

plt.show()

In [None]:
# For every (categorical, quantitative) column pair, show the descriptive
# statistics of the quantitative column within each category level.
for cat in cat_columns:
    for quant in quant_columns:
        print(f'---------- {quant} by {cat} ----------')
        summary = df.groupby(cat)[quant].describe()
        display(summary.reset_index())

In [None]:
# Correlation matrix of the numeric columns. The method has to be called: a
# bare `df.corr` only displays the bound-method repr. Selecting numeric dtypes
# first keeps the call valid when the frame also holds object-dtype columns.
df.select_dtypes(include='number').corr()

In [None]:
# Covariance matrix of the numeric columns. The method has to be called: a
# bare `df.cov` only displays the bound-method repr. Selecting numeric dtypes
# first keeps the call valid when the frame also holds object-dtype columns.
df.select_dtypes(include='number').cov()