In [None]:
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

1. Age: age of the patient [years]
2. Sex: sex of the patient [M: Male, F: Female]
3. ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
4. RestingBP: resting blood pressure [mm Hg]
5. Cholesterol: serum cholesterol [mg/dl]
6. FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
7. RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
8. MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
9. ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
10. Oldpeak: oldpeak = ST [Numeric value measured in depression]
11. ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
12. HeartDisease: output class [1: heart disease, 0: Normal]

In [None]:
# Optional: uncomment to display every column when previewing wide frames.
# pd.set_option('display.max_columns', None)
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('heart.csv')
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Names of the columns holding at least one missing value
# (an empty list means the frame has no nulls).
df.columns[df.isna().any()].tolist()

In [None]:
# Partition the column names by dtype: 'object' columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
categorical_feat = [name for name, kind in df.dtypes.items() if kind == 'object']
numerical_feat = [name for name, kind in df.dtypes.items() if kind != 'object']

numerical_feat

In [None]:
# Display the object-dtype column names collected in `categorical_feat`.
categorical_feat

In [None]:
# List the distinct labels found in each categorical column.
for col in categorical_feat:
    print(f"{col} ========> {df[col].unique()}")

In [None]:
# Categorical to numerical: replace each string label with an integer code.
#
# Every label of a column maps to its own code, so distinct categories
# (e.g. "NAP" and "TA" chest pain) stay distinguishable after encoding.
category_codes = {
    "Sex": {'M': 0, 'F': 1},
    "ChestPainType": {"ATA": 0, "NAP": 1, "ASY": 2, "TA": 3},
    "RestingECG": {"Normal": 0, "ST": 1, "LVH": 2},
    "ExerciseAngina": {'N': 0, 'Y': 1},
    "ST_Slope": {"Up": 0, "Flat": 1, "Down": 2},
}

for column, codes in category_codes.items():
    # Assign the result back to the frame instead of using
    # `df[column].replace(..., inplace=True)`: an in-place change made through
    # the temporary `df[column]` object does not reach `df` under pandas
    # Copy-on-Write. `replace` leaves values that are not keys untouched, so
    # re-running this cell on already-encoded data changes nothing.
    df[column] = df[column].replace(codes)

In [None]:
# Class balance check: number of rows per HeartDisease label.

df["HeartDisease"].value_counts()

In [None]:
# plt.figure(figsize=(20, 12))
# sns.heatmap(df.corr(), annot=True)
# plt.show()

In [None]:
def PearsonCorrelation(data=None, thersold=0.5):
  """Return the columns that correlate strongly with an earlier column.

  The pairwise correlation matrix of the numeric (and boolean) columns of
  ``data`` is computed with ``DataFrame.corr()``. Scanning the strict lower
  triangle, a column is reported when the absolute correlation with at least
  one column positioned before it is greater than ``thersold``. Only the
  later column of each such pair is reported.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to analyse. Non-numeric columns are ignored.
  thersold : float, default 0.5
      Absolute-correlation cut-off (exclusive). The parameter name keeps its
      existing spelling so that keyword callers continue to work.

  Returns
  -------
  set
      Labels of the flagged columns.

  Raises
  ------
  ValueError
      If ``data`` is None.
  """
  if data is None:
    raise ValueError("PearsonCorrelation requires a DataFrame, got None")

  # Restrict to columns corr() can handle; newer pandas versions raise on
  # string columns instead of silently dropping them.
  corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
  columns = corr_matrix.columns

  cor_col = set()
  for i in range(len(columns)):
    for j in range(i):
      if abs(corr_matrix.iloc[i, j]) > thersold:
        cor_col.add(columns[i])
        break  # one strong partner is enough to flag this column
  return cor_col

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.2.
PearsonCorrelation(data=df, thersold=0.2)

In [None]:
# The label column, and every other column as a candidate feature.
target = "HeartDisease"
predictors = [name for name in df.columns if name != target]

In [None]:
# Display the feature column names (every column except the target).
predictors

In [None]:
def OutlierRatio(df):
  """Summarise IQR-rule outliers per column.

  A value counts as an outlier when it lies more than 1.5 * IQR below the
  first quartile or above the third quartile of its column.

  Returns a DataFrame indexed by column name with two columns:
  'Outliers Total' (number of flagged values) and 'Percentage(%)'
  (flagged values as a percentage of the column's row count).
  """
  lower_quartile = df.quantile(0.25)
  upper_quartile = df.quantile(0.75)
  spread = upper_quartile - lower_quartile

  low_fence = lower_quartile - 1.5 * spread
  high_fence = upper_quartile + 1.5 * spread

  # Boolean frame marking every value that falls outside the fences.
  flagged = (df < low_fence) | (df > high_fence)

  counts = flagged.sum().sort_values(ascending=False)
  percentages = ((counts / flagged.count()) * 100).sort_values(ascending=False)

  return pd.concat(
      [counts, percentages],
      axis=1,
      keys=['Outliers Total', 'Percentage(%)'],
  )

In [None]:
# IQR-rule outlier counts and percentages for the feature columns.
OutlierRatio(df.loc[:, predictors])

In [None]:
def kde_numerical_features(column, target, data=None):
    """Plot the kernel density estimate of one feature for each target class.

    Two filled KDE curves are drawn on a shared axis: red for rows where
    ``target`` equals 1 and green for rows where it equals 0. The figure is
    shown immediately.

    Parameters
    ----------
    column : str
        Name of the feature column to plot.
    target : str
        Name of the binary (0/1) label column used to split the rows.
    data : pandas.DataFrame, optional
        Frame to plot from. Defaults to the notebook-level ``df``.
    """
    frame = df if data is None else data

    fig, ax = plt.subplots(figsize=(13, 5))
    # `fill=True` is the current seaborn spelling of the deprecated `shade=True`.
    sns.kdeplot(frame.loc[frame[target] == 1, column], alpha=0.5, fill=True,
                color="red", label="HeartDisease", ax=ax)
    sns.kdeplot(frame.loc[frame[target] == 0, column], alpha=0.5, fill=True,
                color="green", label="NoHeartDisease ", ax=ax)
    ax.set_title('KDE-Plot of {}'.format(column), fontsize=18)
    ax.set_xlabel(column)
    # A KDE curve shows an estimated probability density, not raw counts.
    ax.set_ylabel("Density")
    ax.legend()
    plt.show()

In [None]:
# One per-class KDE plot for every numerical feature; the label column
# itself is skipped.
for nfeat in numerical_feat:
    if nfeat == "HeartDisease":
        continue
    kde_numerical_features(nfeat, 'HeartDisease')


In [None]:
# Draw the same per-class KDE plots for the columns listed in `categorical_feat`.
# NOTE: a KDE needs numeric input, so this cell depends on the encoding cell
# having already replaced these columns' string labels with integer codes.
for cfeat in categorical_feat:
    kde_numerical_features(cfeat, "HeartDisease")

In [None]:
# Which gender gets heart disease more often?
# Sex codes: 0 - male, 1 - female
fig, ax = plt.subplots(figsize=(13, 5))
# Split each gender's bar by the HeartDisease class; counting `Sex` alone
# would only show how many patients of each gender are in the data.
sns.countplot(data=df, x="Sex", hue="HeartDisease", ax=ax)
ax.set_xlabel("Gender")
ax.set_ylabel("Count")

In [None]:
from matplotlib.gridspec import GridSpec

In [None]:
# Box plot of Age to eyeball its spread and any outliers.
plt.figure(figsize=(12, 3))
sns.boxplot(data=df, x="Age", color="green")
plt.show()

In [None]:
# Box plot of RestingBP to eyeball its spread and any outliers.
plt.figure(figsize=(12, 3))
sns.boxplot(data=df, x="RestingBP", color="green")
plt.show()

In [None]:
# old peak influence
plt.figure(figsize=(12,4))
oldpeak = sns.lineplot(x="Oldpeak", y="HeartDisease", data=df[['Oldpeak', 'HeartDisease']], color="red")
oldpeak.set_title("Oldpeak Influence")
plt.show()

In [None]:
# Age influence: HeartDisease plotted against Age as a line.
# NOTE: the axes handle is still called `oldpeak` even though it holds the
# Age plot; the name is kept so the notebook's global names stay unchanged.
plt.figure(figsize=(12, 4))
oldpeak = sns.lineplot(data=df, x="Age", y="HeartDisease", color="red")
oldpeak.set_title("Age Influence")
plt.show()