In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read in the download loop.
CHUNK_SIZE = 40960
# Comma-separated entries of the form "<directory>:<percent-encoded download URL>";
# the download loop splits on ',' and ':' and URL-decodes the second part.
DATA_SOURCE_MAPPING = 'heart-disease-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5560153%2F9196983%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240826%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240826T093345Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D172c2444e08efcee957bd0198947f7aaf45bfbfda2c2726645123f71769da48edd0a36a4f1c9fbd7e53da9f31dafaa3175415a28948e9156169ddcfb7d868225d50a8b288faa6970c7367b758483927a8ec32e3277074adc159ab43f4b3fb5eda1f5b17eb850e0bdcdf5ba4783d5d87097d8ff2b9e19abc71693508af3f134f0faecbd666733a9468ffa19020c5f179239ef2d0dd91d225559bd070dba02a16acdb633f28f0d69dfad28d72830b029083b02730a5db338b5a52a491a992b8cf8ac771555b47456de233b31e0a67a29291a1fa14df03a3a5327617213a37fe2ddb18514436cdfafe84c08ca00e73c97f28e3cc39052cc9b63deacf83f13eadc88'

# Directories created by this cell. KAGGLE_SYMLINK is defined here but is not
# referenced anywhere in the visible code.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it (zip or tar) under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a stray ':' in the URL part cannot
    # break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter is
            # shown instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_text = total_length if total_length else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would
                # replace the module and break the next non-zip source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


## About dataset:

The heart disease dataset consists of 918 records across 12 columns: 11 feature variables and 1 target variable. The columns are:

1. **Age:** age of the patient [years]
2. **Sex:** sex of the patient [M: Male, F: Female]
3. **ChestPainType:** chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
4. **RestingBP:** resting blood pressure [mm Hg]
5. **Cholesterol:** serum cholesterol [mg/dl]
6. **FastingBS:** fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
7. **RestingECG:** resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes criteria]
8. **MaxHR:** maximum heart rate achieved [Numeric value between 60 and 202]
9. **ExerciseAngina:** exercise-induced angina [Y: Yes, N: No]
10. **Oldpeak:** oldpeak = ST [Numeric value measured in depression]
11. **ST_Slope:** the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
12. **HeartDisease:** output class [1: heart disease, 0: Normal]

This dataset contains a mix of textual (categorical) and numerical values, which makes it difficult to manage, analyze, and use for predicting outcomes for new patients by hand.


## Problem Statement and Approach

To predict whether a patient will have a heart attack, I will use Python and its data science libraries, along with Microsoft Excel.

**Data Preparation**

Python libraries like NumPy and Pandas will be used for data cleaning and manipulation, while Excel will assist with preliminary data inspection.

**Exploratory Data Analysis (EDA)**

Visualization tools such as Matplotlib and Seaborn will help uncover patterns and insights.

**Model Building**

Scikit-learn will be employed to develop and evaluate classification models, aiming to predict heart attack risk based on the provided features.

This combined approach will facilitate thorough data analysis, feature engineering, and model development to accurately predict heart attack likelihood.


In [None]:
# Importing necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
import os
# List the entries of the dataset directory and print them.
file = os.listdir("/kaggle/input/heart-disease-dataset")
print(file)

In [None]:
# Load the dataset
# Read heart.csv from the dataset directory into the DataFrame `df`.
df = pd.read_csv("/kaggle/input/heart-disease-dataset/heart.csv")
# Display 5 randomly sampled rows.
df.sample(5)

In [None]:
# Print a concise summary of df (columns, non-null counts, dtypes).
df.info()

In [None]:
# describe() with default arguments summarises numeric columns; .T puts one feature per row.
df.describe().T # Statistical summary of the numerical columns only

In [None]:
# include=object restricts the summary to object-dtype columns; .T puts one feature per row.
df.describe(include=object).T # Statistical summary of the object columns

In [None]:
# Data Preprocessing
# Compute the three data-quality summaries first, then print each under its heading.
null_counts = df.isna().sum()                               # nulls per column
duplicate_rows = df.duplicated().sum()                      # number of duplicated rows
unique_counts = df.nunique().sort_values(ascending=False)   # distinct values per feature

print("Null values in each column:", null_counts, sep="\n")

print("\nNumber of duplicate rows:", duplicate_rows, sep="\n")

print("Number of unique values in each feature (sorted):", unique_counts, sep="\n")


In [None]:
# Collect the object-dtype columns with fewer than 5 distinct values and
# print the frequency of each of their categories.
object_columns = df.select_dtypes(include='object').columns
cat_col = [name for name in object_columns if df[name].nunique() < 5]
for name in cat_col:
    print(df[name].value_counts())
    print('-' * 100)

### Transforming categorical variable to numerical variable for ease of calculation
    Sex : M = 0 , F = 1
    ChestPainType : ATA = 0 , NAP = 1, ASY = 2, TA = 3
    RestingECG : Normal = 0 , ST = 1, LVH = 2
    ExerciseAngina : N = 0 , Y = 1
    ST_Slope : Up = 0, Flat = 1, Down = 2


**Note:**

Running this code will transform the categorical variables into numerical labels in the `df_new` DataFrame. The encoding is applied to a copy, so the original `df` keeps its categorical values for reporting or future use. Note that re-running the encoding cell rebuilds `df_new` from `df`, which discards any later changes made to `df_new` (such as imputation), so the subsequent cells must be re-run as well.

In [None]:
# Transposed summary statistics of the numeric columns of df.
df.describe().T

In [None]:
# Number of rows whose RestingBP equals 0.
(df['RestingBP']==0).sum()

In [None]:
# Number of rows whose Cholesterol equals 0.
(df['Cholesterol']==0).sum()

In [None]:
# Work on a copy so that `df` keeps its original categorical values.
df_new = df.copy()

# Give every category of each column in cat_col an integer code, numbered in
# order of first appearance in the column.
for column in cat_col:
    print(column)
    categories = df_new[column].unique()
    codes = list(range(len(categories)))
    print(categories, codes)
    df_new[column] = df_new[column].map(dict(zip(categories, codes)))
    print('-' * 100)

# Note: The transformations are applied to df_new, leaving df unchanged.

In [None]:
# Transposed summary statistics of the numeric columns of df_new.
df_new.describe().T

The Cholesterol column contains zero values, which are invalid entries since serum cholesterol cannot be zero. The same applies to RestingBP, even though only a single value of 0 is present there.
To impute these missing values, I will use KNN imputation for Cholesterol and mean imputation for RestingBP, chosen from the available imputation methods:
* Mean Imputation (**Best For:** When data is normally distributed and there are no significant outliers)
* Mode Imputation (**Best For:** Categorical data or when the numerical column has a few distinct values with one being very frequent)
* Median Imputation (**Best For**: When the data contains outliers or is skewed)
* K-Nearest Neighbors Imputation aka KNN Imputation (**Best For:** When there is a complex relationship between features or when mean/median imputation is not suitable)
* Interpolation (**Best For:** Time series data or when the data is ordered and trends can be exploited)

In [None]:
# Display 5 randomly sampled rows of df_new.
df_new.sample(5)

In [None]:
# Impute the 0 with KNN
from sklearn.impute import KNNImputer

# Series.replace returns a new Series, so the result must be assigned back.
# Without the assignment no NaN is created and the imputer changes nothing.
df_new['Cholesterol'] = df_new['Cholesterol'].replace(0, np.nan)  # mark 0 as missing

# Each missing value is filled from the 3 nearest rows.
# NOTE(review): distances use every column of df_new, unscaled and including
# the HeartDisease target. Confirm that this is intended.
# NOTE(review): the single RestingBP == 0 entry is not touched here.
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(df_new)
# fit_transform returns a plain array; restore the column names and the index.
df_new = pd.DataFrame(after_impute, columns=df_new.columns, index=df_new.index)

In [None]:
# Display 5 randomly sampled rows of df_new.
df_new.sample(5)

In [None]:
# Cast every column except Oldpeak to int32; Oldpeak is left with its current dtype.
selected_Col = df_new.columns[df_new.columns != 'Oldpeak']
df_new[selected_Col] = df_new[selected_Col].astype('int32')

## **Exploratory Data Analysis**
In the exploratory data analysis (EDA), I will:

**Examine Data Distribution:**

Assess the distribution of each feature to understand their individual characteristics.

**Analyze Feature Correlations:**

Explore the correlation between features and the target variable, HeartDisease.

**Visualize Relationships:**

Use correlation matrices and heatmaps to visualize the relationships between features and the target variable.

In [None]:
# Correlation of every feature with HeartDisease, drawn as a sorted bar chart.
target_corr = df_new.corr()['HeartDisease']
# Drop the last entry (the last column of df_new) before sorting.
target_corr.iloc[:-1].sort_values().plot.bar()
plt.xticks(rotation=45)
plt.show()

In [None]:
# Heart disease percentage in dataset
heart_counts = df_new['HeartDisease'].value_counts()
# value_counts() orders by frequency, so take each label from the class value
# itself instead of relying on which class happens to be more frequent.
heart_labels = heart_counts.index.map({1: 'Yes', 0: 'No'})
plt.pie(heart_counts, labels=heart_labels, autopct='%1.1f%%', startangle=90)
plt.title('Heart Disease Percentage')
plt.show()

In [None]:
# Heart disease cases per gender; labels are set on the axes returned by seaborn.
gender_ax = sns.countplot(data=df_new, x='Sex', hue='HeartDisease')
gender_ax.set_title('Gender Distribution')
gender_ax.set_xlabel('Gender - [M : 0 | F : 1]')
gender_ax.set_ylabel('People Count')
plt.show()

The count plot reveals the relationship between gender and heart disease, indicating that men have a higher risk of developing heart disease compared to women.

In [None]:
import warnings
# Suppress warnings of category FutureWarning from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Swarm plot of Age for each HeartDisease class.
sns.catplot(
    data=df_new,
    kind="swarm",
    x="HeartDisease",
    y="Age",
)
plt.show()

In [None]:
# Stacked Age histogram with KDE curves, coloured by HeartDisease class.
sns.histplot(
    data=df_new,
    x="Age",
    hue="HeartDisease",
    kde=True,
    multiple="stack",
)
plt.show()

The catplot and histplot show that the majority of patients fall within the age group of *40-70* years. Specifically, individuals aged *50-65* years are more prone to heart disease compared to other age groups. This increased prevalence in the *50-65* year range can be attributed to the higher number of adults within this age bracket, leading to a greater number of heart disease cases compared to other age groups.

In [None]:
# ChestPainType counts split by HeartDisease; labels are set on the returned axes.
chest_ax = sns.countplot(data=df_new, x='ChestPainType', hue='HeartDisease')
chest_ax.set_title('ChestPainType Distribution')
chest_ax.set_xlabel('ChestPainType - [ATA:0 | NAP:1 | ASY:2 | TA:3]')
chest_ax.set_ylabel('Patients Count')
plt.show()

The countplot indicates that individuals with asymptomatic chest pain (ASY) have a higher risk of heart disease compared to those with other types of chest pain.

In [None]:
# FastingBS counts split by HeartDisease; labels are set on the returned axes.
fasting_ax = sns.countplot(data=df_new, x='FastingBS', hue='HeartDisease')
fasting_ax.set_title('FastingBS Distribution')
fasting_ax.set_xlabel('FastingBS - [No : 0 | Yes : 1]')
fasting_ax.set_ylabel('Patients Count')
plt.show()

The countplot reveals that higher fasting blood sugar levels are associated with an increased risk of heart disease.

In [None]:
# MaxHR per HeartDisease class: boxplot on the left axes, violin plot on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, axis in zip((sns.boxplot, sns.violinplot), ax):
    draw(x='HeartDisease', y='MaxHR', data=df_new, ax=axis).set_title('MaxHR vs HeartDisease')
plt.show()

Both the boxplot and violin plot show an inverse relationship between maximum heart rate (`MaxHR`) and heart disease risk, suggesting that lower `MaxHR` is associated with higher risk. However, the heart disease group exhibits increased `MaxHR` around 100 and a wider range of values between 120-130 compared to the non-heart disease group. This indicates a more complex relationship between `MaxHR` and heart disease risk.

In [None]:
# Distribution of Oldpeak with heart disease: boxplot on the left, violin plot on the right
fig,ax = plt.subplots(1,2,figsize=(15,5))
sns.boxplot(x='HeartDisease', y='Oldpeak', data=df_new, ax=ax[0]).set_title('Oldpeak vs HeartDisease')
sns.violinplot(x='HeartDisease', y='Oldpeak', data=df_new, ax=ax[1]).set_title('Oldpeak vs HeartDisease')
# Render explicitly, like the other plotting cells. Without this the cell's
# last expression (a matplotlib Text object) is echoed as output.
plt.show()

Both the boxplot and violin plot reveal a relationship between ST depression (Oldpeak) and heart disease. The graphs suggest that higher levels of ST depression, as indicated by increased Oldpeak values, are associated with a greater risk of heart disease. Notably, Oldpeak values range from 0 to 6.

In [None]:
# Share of ExerciseAngina values in the dataset
plt.figure(figsize=(10,6))
angina_counts = df_new['ExerciseAngina'].value_counts()
# value_counts() orders by frequency; map each encoded value (N = 0, Y = 1) to
# its label so that the slices cannot be mislabelled.
angina_labels = angina_counts.index.map({0: 'No', 1: 'Yes'})
plt.pie(angina_counts, labels=angina_labels, autopct='%1.2f%%')
plt.title('ExerciseAngina Percentage')
plt.show()

In [None]:
# ExerciseAngina counts split by HeartDisease; labels are set on the returned axes.
angina_ax = sns.countplot(data=df_new, x='ExerciseAngina', hue='HeartDisease')
angina_ax.set_title('ExerciseAngina Distribution')
angina_ax.set_xlabel('ExerciseAngina - [No : 0 | Yes : 1]')
angina_ax.set_ylabel('Count')
plt.show()

Countplot shows ExerciseAngina has high risk of HeartDisease

In [None]:
# ST_Slope and Heart Disease
plt.figure(figsize=(10,6))
slope_counts = df_new['ST_Slope'].value_counts()
# value_counts() orders by frequency; map each encoded value (Up = 0, Flat = 1,
# Down = 2) to its label so that the slices cannot be mislabelled.
slope_labels = slope_counts.index.map({0: 'Upsloping', 1: 'Flat', 2: 'Downsloping'})
plt.pie(slope_counts, labels=slope_labels, autopct='%1.2f%%')
plt.title('ST_Slope Percentage')
plt.show()

In [None]:
# ST_Slope counts split by HeartDisease; labels are set on the returned axes.
slope_ax = sns.countplot(data=df_new, x='ST_Slope', hue='HeartDisease')
slope_ax.set_title('ST_Slope Distribution')
slope_ax.set_xlabel('ST_Slope - [Up : 0 | Flat : 1 | Down : 2]')
slope_ax.set_ylabel('Count')
plt.show()

Countplot shows Flat Slope has high risk of HeartDisease

In [None]:
# Annotated heatmap of the pairwise correlation matrix of df_new.
plt.figure(figsize=(10, 10))
corr_matrix = df_new.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

### Data Modeling and training

In [None]:
# Split the dataset into training and testing sets (15 % held out for testing).
from sklearn.model_selection import train_test_split

# Features are all columns except 'HeartDisease'; 'HeartDisease' is the
# target variable the models are trained to predict.
X_train, X_test, y_train, y_test = train_test_split(
    df_new.drop(columns='HeartDisease'),
    df_new['HeartDisease'],
    test_size=0.15,
    random_state=101,
)

In [None]:
# Import the names used below in this cell: when the notebook runs top to
# bottom, ConvergenceWarning is otherwise only imported in a later cell and
# this line raises NameError.
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn convergence warnings from this point on.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate solvers; their suitability varies with sample size (small, medium, large).
# Small Dataset: Up to a few thousand samples (e.g., 100 to 10,000 samples).
# Medium Dataset: From a few thousand to tens of thousands of samples (e.g., 10,000 to 100,000 samples).
# Large Dataset: Beyond tens of thousands of samples, potentially reaching millions or more.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

best_solver = None      # was misspelled `best_slover`, leaving best_solver uninitialised
best_score = -np.inf
# NOTE(review): despite the name, these are accuracies on the TEST split, so
# the solver is selected on the same data the final score is reported on.
train_scores = np.zeros(len(solvers))
for i, solver in enumerate(solvers):
    try:
        lr = LogisticRegression(solver=solver, max_iter=1000)  # max_iter raised to 1000
        lr.fit(X_train_scaled, y_train)
    except ValueError as e:
        # Warnings are not raised as exceptions, so the former
        # `except ConvergenceWarning` could never trigger. What can fail here
        # is a solver name this scikit-learn version rejects.
        print(f'Skipping solver {solver}: {e}')
        continue
    train_scores[i] = lr.score(X_test_scaled, y_test)
    # `>=` keeps the original tie-break: the last solver reaching the top score wins.
    if train_scores[i] >= best_score:
        best_score = train_scores[i]
        best_solver = solver

if best_solver is None:
    raise RuntimeError('No logistic regression solver could be fitted')

# Fit the model with the best solver
lr_best = LogisticRegression(solver=best_solver, max_iter=1000)
lr_best.fit(X_train_scaled, y_train)
lr_pred = lr_best.predict(X_test_scaled)
print(f'LogisticRegression Score: {accuracy_score(y_test, lr_pred)}')

In [None]:
# SVM
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Weighted F1 score on the test split for each kernel, initialised to 0.
kernels = dict.fromkeys(('linear', 'poly', 'rbf', 'sigmoid'), 0)
best = ''
for kernel_name in kernels:
    svm = SVC(kernel=kernel_name)  # Support Vector Classifier
    svm.fit(X_train, y_train)
    yhat = svm.predict(X_test)
    kernels[kernel_name] = f1_score(y_test, yhat, average="weighted")
    # The current score is itself in the dict, so it can never exceed the
    # maximum; this branch is taken exactly when it equals the best so far.
    if kernels[kernel_name] >= max(kernels.values()):
        best = kernel_name

# Refit with the winning kernel and report its weighted F1 score.
svm = SVC(kernel=best).fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(f'SVM f1_score kernel({best}): {f1_score(y_test, svm_pred, average="weighted")}')

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates evaluated with 5-fold cross-validation.
param_grid = dict(
    max_depth=[3, 4, 5, 6, 7, 8],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3, 4],
    random_state=[0, 42],
)
dtree = DecisionTreeClassifier(class_weight='balanced')
grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Train a fresh tree with the best parameter combination and score it on the test split.
Ctree = DecisionTreeClassifier(class_weight='balanced', **grid_search.best_params_)
Ctree.fit(X_train, y_train)
dtc_pred = Ctree.predict(X_test)
print("DecisionTrees's Accuracy: ", accuracy_score(y_test, dtc_pred))

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Try every k in 1..Ks-1 and record the test-split accuracy and its standard
# error. Index n-1 holds the result for k = n.
Ks = 80
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = accuracy_score(y_test, yhat)

    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

# Refit with the best k and predict with THAT model. The previous code
# predicted with `neigh`, the last model of the loop (k = Ks-1), so the
# reported "best accuracy" did not belong to the printed k.
best_K = int(mean_acc.argmax()) + 1
knn = KNeighborsClassifier(n_neighbors = best_K).fit(X_train,y_train)
knn_pred = knn.predict(X_test)
print( "The best accuracy was ", accuracy_score(y_test, knn_pred), "with k=", best_K)

In [None]:
# random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter candidates, searched exhaustively by GridSearchCV with its
# default cross-validation settings.
param_grid = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)
rfc = RandomForestClassifier()
grid_search = GridSearchCV(rfc, param_grid)
grid_search.fit(X_train, y_train)

# Train a fresh forest with the best parameter combination and score it on the test split.
rfctree = RandomForestClassifier(**grid_search.best_params_)
rfctree.fit(X_train, y_train)
rfc_pred = rfctree.predict(X_test)
print("RandomForestClassifier's Accuracy: ", accuracy_score(y_test, rfc_pred))

In [None]:
# Model Evaluation
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,confusion_matrix

def print_score(test, pred, model):
    """Plot diagnostics for one classifier and return its metrics as a table.

    The left panel shows the confusion matrix of `test` against `pred`. The
    right panel overlays the kernel density estimates of the actual and the
    predicted labels.

    Args:
        test: true labels (array-like).
        pred: predicted labels, same length as `test`.
        model: model name, used in the title of the right panel.

    Returns:
        pandas.DataFrame with the columns 'Metrics' and 'Score', one row each
        for accuracy, weighted F1, MAE, MSE and R2.
    """
    fig, (cm_ax, dist_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Label the confusion matrix through its own axes. plt.xlabel/plt.ylabel
    # act on the current axes, which is the right-hand panel here, so these
    # labels used to be lost. fmt='d' prints the counts as plain integers.
    sns.heatmap(confusion_matrix(test, pred), annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_xlabel('Predicted Values')
    cm_ax.set_ylabel('Actual Values')

    # kdeplot replaces the deprecated distplot(..., hist=False).
    sns.kdeplot(x=test, color='r', label='Actual Value', ax=dist_ax)
    sns.kdeplot(x=pred, color='b', label='Predicted Value', ax=dist_ax)
    dist_ax.set_title(f'Actual vs Predicted Value {model}')
    dist_ax.set_xlabel('Outcome')
    dist_ax.set_ylabel('Density')  # a KDE curve shows density, not a count
    dist_ax.legend()               # the labels above are only visible with a legend
    plt.show()

    # Local name `metrics` avoids shadowing the notebook-level DataFrame `df`.
    metrics = {'Metrics':['Accuracy Score', 'f1 Score', 'Mean Absolute Error ','Mean Squared Error', 'R2 Score'],
               'Score' : [accuracy_score(test, pred), f1_score(test, pred, average="weighted"),
                          mean_absolute_error(test, pred),mean_squared_error(test, pred),r2_score(test, pred)]}
    return pd.DataFrame(metrics)

In [None]:
# Compare the test-split accuracy of all fitted models in a single bar chart.
models = ['Logistic Regression','SVM','DecisionTree','KNN', 'RandomForestClassifier']
preds = [lr_pred,svm_pred,dtc_pred,knn_pred,rfc_pred]
accuracys = [accuracy_score(y_test, model_pred) for model_pred in preds]

comparison_ax = sns.barplot(x=models, y=accuracys)
comparison_ax.set_xlabel('Classifier Models')
comparison_ax.set_ylabel('Accuracy')
comparison_ax.set_title('Comparison of different models')
plt.show()

After conducting an extensive exploratory data analysis, several key factors were identified as significantly influencing the risk of heart disease:

* MaxHR (Maximum Heart Rate)
* Oldpeak (ST Depression)
* Chest Pain Type
* Exercise-Induced Angina
* ST Segment Slope

Among the various models tested, the Support Vector Machine (SVM) emerged as the most effective, achieving an accuracy of 84%. This model outperformed others in the analysis, demonstrating its strong predictive capabilities.

However, it's important to note that the dataset used for this analysis was relatively small, consisting of only 918 rows. This limited sample size may have constrained the model's performance. Expanding the dataset is crucial for improving the robustness and accuracy of the model. A larger dataset would provide a more comprehensive view of heart disease and its influencing factors, leading to more accurate predictions and better overall model performance.