In [None]:
# Install required packages 
!pip install kaggle
!pip install pandas numpy scikit-learn matplotlib seaborn

# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import warnings

# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so stochastic steps repeat across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("All libraries imported successfully!")

In [None]:
# Load the processed Cleveland heart-disease data directly from the UCI repository.
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# The file has no header row, so the column names are supplied manually,
# following the dataset description.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
]

# '?' marks a missing entry in the raw file; read it as NaN.
df = pd.read_csv(url, names=column_names, na_values="?")

# In this file the diagnosis column is graded 0-4 (0 = no disease, 1-4 = disease
# present), while the rest of the notebook treats `target` as a 0/1 flag
# (labels, percentages, plots). Collapse the grades into that binary label once,
# at load time, so every later cell sees the encoding it expects.
df['target'] = (df['target'] > 0).astype(int)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")

In [None]:
# First look at the data. `df` is the frame loaded earlier in the notebook
# (alternatively, a local copy can be read with: df = pd.read_csv('heart.csv')).

# 1. View first few rows
print("=== First 5 rows of the dataset ===")
print(df.head())
print("\n")

# 2. Check dataset info (data types, non-null counts)
# df.info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line.
print("=== Dataset Information ===")
df.info()
print("\n")

# 3. Summary statistics
print("=== Summary Statistics ===")
print(df.describe())
print("\n")

# 4. Check for missing values
print("=== Missing Values Check ===")
print(df.isnull().sum())
print("\n")

# 5. Check target variable distribution
print("=== Target Variable Distribution ===")
print(df['target'].value_counts())
# Any target value above 0 denotes disease. The mean of that boolean mask is the
# share of diseased patients whether the column is a 0/1 flag or a graded code;
# summing the raw values would over-count graded codes.
disease_share = (df['target'] > 0).mean() * 100
print(f"Percentage with heart disease: {disease_share:.1f}%")

In [None]:
# Check for missing values again after loading with '?' as NA
print("=== Missing values after loading with na_values='?' ===")
print(df.isnull().sum())
print("\n")

# Handle missing values: fill the gaps in 'ca' and 'thal' with each column's mode
for col in ['ca', 'thal']:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        # Assign the filled column back to the frame. Calling
        # fillna(..., inplace=True) on df[col] operates on an intermediate
        # object: it is deprecated and leaves df untouched under pandas
        # Copy-on-Write.
        df[col] = df[col].fillna(mode_value)
        print(f"Filled missing values in '{col}' with the mode: {mode_value}")

print("\n=== Missing values after handling ===")
print(df.isnull().sum())

In [None]:
# Convert columns to appropriate data types.
# 'ca' and 'thal' were read as float because they contained missing values;
# cast them (and 'target') to int. The cast raises if any NaN is still present,
# so this must run after the missing values have been filled.
for col in ['ca', 'thal', 'target']:
    df[col] = df[col].astype(int)

print("=== Data types after conversion ===")
# df.info() prints its own report and returns None; call it directly so no
# stray "None" line is printed.
df.info()

In [None]:
# Bar charts of the value counts of every categorical column, laid out on a 3x3 grid
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

fig, axes = plt.subplots(3, 3, figsize=(15, 10))
bar_colors = sns.color_palette('viridis')
for ax, col in zip(axes.flat, categorical_features):
    counts = df[col].value_counts()
    # rot=0 keeps the category labels horizontal
    counts.plot(kind='bar', color=bar_colors, ax=ax, rot=0)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for every continuous column, laid out on a 2x3 grid
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

fig = plt.figure(figsize=(15, 10))
palette = sns.color_palette('viridis')
for position, col in enumerate(numerical_features, start=1):
    ax = fig.add_subplot(2, 3, position)
    # one palette colour per feature, in list order
    sns.histplot(df[col], kde=True, color=palette[position - 1], ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Feature dictionary for reference.
# The value codes follow the UCI documentation for the processed Cleveland
# file that this notebook loads (cp 1-4, slope 1-3, thal 3/6/7), not the 0-based
# re-encoding used by some redistributed copies of the dataset.
feature_dict = {
    'age': 'Age in years',
    'sex': 'Sex (1 = male, 0 = female)',
    'cp': 'Chest pain type (1-4)',
    'trestbps': 'Resting blood pressure (mm Hg)',
    'chol': 'Serum cholesterol (mg/dl)',
    'fbs': 'Fasting blood sugar > 120 mg/dl (1 = true, 0 = false)',
    'restecg': 'Resting ECG results (0-2)',
    'thalach': 'Maximum heart rate achieved',
    'exang': 'Exercise induced angina (1 = yes, 0 = no)',
    'oldpeak': 'ST depression induced by exercise',
    'slope': 'Slope of peak exercise ST segment (1-3)',
    'ca': 'Number of major vessels colored by fluoroscopy (0-3)',
    'thal': 'Thalassemia (3 = normal, 6 = fixed defect, 7 = reversible defect)',
    # Phrased so it holds for both the graded (0-4) and the binary (0/1) coding.
    'target': 'Heart disease diagnosis (0 = no disease, > 0 = disease present)'
}

print("=== Feature Descriptions ===")
for feature, description in feature_dict.items():
    print(f"{feature}: {description}")

Alternative data source (Kaggle): install the `kaggle` package first, then download the dataset with the Kaggle CLI.

In [None]:
!pip install --upgrade kaggle
!kaggle datasets download -v -d ronitf/heart-disease-uci --force

In [None]:
# Colab only: open the upload dialog, then report the size of each received file
from google.colab import files

uploaded = files.upload()
for name, content in uploaded.items():
    print(f'User uploaded file "{name}" with length {len(content)} bytes')

In [None]:
# Create a simple visualization to understand the data

# Any target value above 0 denotes disease. Derive an explicit 0/1 flag so both
# panels and their labels are correct whether `target` is graded or binary
# (filtering on `target == 1` alone would drop graded classes above 1).
has_disease = (df['target'] > 0).astype(int)

plt.figure(figsize=(12, 5))

# Subplot 1: Target distribution
plt.subplot(1, 2, 1)
# value_counts() orders by frequency, not by label; reindex so the first bar is
# always class 0 and the colours match the histogram legend on the right.
class_counts = has_disease.value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='bar', color=['lightblue', 'lightcoral'])
plt.title('Heart Disease Distribution')
plt.xlabel('Target (0=No Disease, 1=Disease)')
plt.ylabel('Count')
plt.xticks(rotation=0)

# Subplot 2: Age distribution by target
plt.subplot(1, 2, 2)
df.loc[has_disease == 0, 'age'].hist(alpha=0.7, label='No Disease', color='lightblue')
df.loc[has_disease == 1, 'age'].hist(alpha=0.7, label='Disease', color='lightcoral')
plt.title('Age Distribution by Heart Disease Status')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()

plt.tight_layout()
plt.show()

Task 2: Data preprocessing — cleaning and preparing the dataset

In [None]:
# Data quality assessment: shape, duplicate rows, missing values and dtypes
print("=== DATA QUALITY ASSESSMENT ===\n")

# 1. Check dataset shape
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {df.shape[0]}")
print(f"Number of features: {df.shape[1] - 1} (excluding target)")

# 2. Check for duplicates (fully identical rows) and drop them if present
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")
if duplicates > 0:
    print("Removing duplicates...")
    df = df.drop_duplicates()
    print(f"New shape after removing duplicates: {df.shape}")

# 3. Check for missing values (detailed): count and percentage per column
print("\n=== Missing Values Analysis ===")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_table = pd.DataFrame({
    'Missing Count': missing_data,
    'Percentage': missing_percent
})
columns_with_gaps = missing_table[missing_table['Missing Count'] > 0]
if columns_with_gaps.empty:
    # Say so explicitly instead of printing a bare "Empty DataFrame" repr.
    print("No missing values found.")
else:
    print(columns_with_gaps)

# 4. Check data types
print("\n=== Data Types Check ===")
print(df.dtypes)