In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Step 1: load the heart-disease data. The local Kaggle copy is preferred; if
# that file is absent, the Cleveland file is fetched from the UCI archive.
from pathlib import Path

print("Step 1: Loading the dataset...")

# Defined up front rather than inside the except branch so both names exist in
# the notebook namespace regardless of which load path actually runs.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

try:
    df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')
    print("Dataset loaded successfully from local path.")
except FileNotFoundError:
    print("Local file not found. Loading from a remote URL...")
    # The remote file is read with header=None, so column names are supplied here.
    df = pd.read_csv(url, header=None, names=column_names)
    print("Dataset loaded successfully from remote URL.")

# to_csv does not create missing parent directories; without this the write
# fails with OSError on a fresh checkout where data/ does not exist yet.
Path('data').mkdir(parents=True, exist_ok=True)
df.to_csv('data/heart_disease.csv', index=False)

print("\nInitial Dataset Info:")
df.info()
print("\nFirst 5 rows of the dataset:")
print(df.head())

In [None]:
# Step 2: turn the '?' placeholder into NaN, coerce 'ca' and 'thal' to numeric,
# and fill their gaps with the column median.
print("\nStep 2: Handling missing values...")
df = df.replace('?', np.nan)

print("\nMissing values before handling:")
print(df.isnull().sum())

for col in ['ca', 'thal']:
    df[col] = pd.to_numeric(df[col])
    median_val = df[col].median()
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary Series, warns on
    # pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
    # NOTE(review): both columns are one-hot encoded later, so the mode may be
    # a better fill value than the median -- confirm before changing.
    df[col] = df[col].fillna(median_val)

print("\nMissing values after handling:")
print(df.isnull().sum())

In [None]:
# Step 3: expand the categorical columns into indicator columns. With
# drop_first=True the first level of each column is dropped.
print("\nStep 3: Performing one-hot encoding for categorical variables...")

categorical_cols = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'ca', 'thal',
]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

encoded_shape = df.shape
print("\nDataset shape after one-hot encoding:", encoded_shape)
print("\nFirst 5 rows of the dataset after encoding:")
encoded_preview = df.head()
print(encoded_preview)

In [None]:
# Step 4: standardise the continuous features with scikit-learn's StandardScaler.
print("\nStep 4: Standardizing numerical features...")

from sklearn.preprocessing import StandardScaler

numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Fit on the numerical columns, then write the transformed values back over them.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
scaled_values = scaler.transform(df[numerical_cols])
df[numerical_cols] = scaled_values

print("\nFirst 5 rows of the dataset after scaling:")
scaled_preview = df.head()
print(scaled_preview)

In [None]:

# Step 5: exploratory plots, starting with the pairwise correlation matrix of
# the processed frame.
print("\nStep 5: Conducting Exploratory Data Analysis (EDA)...")

correlation_matrix = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(18, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='viridis', fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap of Features', fontsize=20)
plt.show()

print("\nDisplaying histograms for original numerical features (before scaling)...")

# `df` has been encoded and scaled by this point, so an unscaled copy is needed
# for plotting. It is rebuilt from the raw snapshot the loading cell wrote to
# data/heart_disease.csv. This avoids a second download and removes the
# dependency on `url` / `column_names`, which are only bound when the loading
# cell took its remote-fallback branch.
df_eda = pd.read_csv('data/heart_disease.csv').replace('?', np.nan)
for col in ['ca', 'thal']:
    df_eda[col] = pd.to_numeric(df_eda[col])
# Rows with any missing value are dropped here (not imputed); this frame is
# used for plots only.
df_eda = df_eda.dropna()

df_eda[numerical_cols].hist(bins=20, figsize=(15, 10), layout=(2, 3))
plt.suptitle('Histograms of Numerical Features')
plt.show()

# One boxplot per numerical feature, grouped by the target column of the
# unscaled frame, laid out on a 2x3 grid.
boxplot_fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(numerical_cols, start=1):
    panel = boxplot_fig.add_subplot(2, 3, position)
    sns.boxplot(x='target', y=feature, data=df_eda, ax=panel)
    panel.set_title(f'{feature} vs. Target')
boxplot_fig.tight_layout()
plt.show()

print("\nEDA visualizations have been generated.")

# Collapse the target to a binary label (any value above 0 becomes 1), report
# the class balance, and write the processed frame to disk.
print("\nConverting target to binary classification (0: No Disease, 1: Disease)...")
has_disease = df['target'].gt(0)
df['target'] = has_disease.astype(int)

print("\nValue counts of the new binary target:")
target_counts = df['target'].value_counts()
print(target_counts)

# Same counts expressed as percentages of all rows.
distribution = df['target'].value_counts(normalize=True).mul(100)
print("Target variable distribution:")
print(distribution)


print("\nData preprocessing and cleaning complete.")
df.to_csv('cleaned_heart_disease.csv', index=False)
print("Cleaned dataset saved to 'cleaned_heart_disease.csv'.")