In [None]:
# Install required packages 
!pip install kaggle
!pip install pandas numpy scikit-learn matplotlib seaborn

# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below - consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (this seeds NumPy's global generator; the train/test split later in the
# notebook passes its own random_state explicitly)
np.random.seed(42)

print("All libraries imported successfully!")

In [None]:
# Load the processed Cleveland heart-disease data directly from the UCI repository
# (pandas is already imported as `pd` in the first cell).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# The file has no header row, so the column names are supplied manually,
# following the order given in the dataset description.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
]

# Missing entries are written as '?' in the file; read them as NaN.
df = pd.read_csv(url, names=column_names, na_values="?")

# The raw UCI target is a severity grade from 0 (no disease) to 4, while the
# rest of this notebook treats it as a binary label (0 = no disease,
# 1 = disease). Collapse every non-zero grade to 1 here so that counts,
# ratios and plots downstream are all computed on the same binary target.
df['target'] = (df['target'] > 0).astype(int)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")

In [None]:
# First look at the frame loaded above (`df`).

# 1. View first few rows
print("=== First 5 rows of the dataset ===")
print(df.head())
print("\n")

# 2. Check dataset info (data types, non-null counts)
# df.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
print("=== Dataset Information ===")
df.info()
print("\n")

# 3. Summary statistics
print("=== Summary Statistics ===")
print(df.describe())
print("\n")

# 4. Check for missing values
print("=== Missing Values Check ===")
print(df.isnull().sum())
print("\n")

# 5. Check target variable distribution
print("=== Target Variable Distribution ===")
print(df['target'].value_counts())
# Share of patients with any non-zero target value. Using `> 0` instead of
# summing the column keeps the figure correct even when the target still
# holds the raw 0-4 severity grades.
print(f"Percentage with heart disease: {((df['target'] > 0).mean() * 100):.1f}%")

In [None]:
# Check for missing values again after loading with '?' as NA
print("=== Missing values after loading with na_values='?' ===")
print(df.isnull().sum())
print("\n")

# Handle missing values: fill the gaps in 'ca' and 'thal' with the column mode.
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on df[col]: the in-place form operates on a
# temporary object and does not update `df` under pandas Copy-on-Write.
for col in ['ca', 'thal']:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)
        print(f"Filled missing values in '{col}' with the mode: {mode_value}")

print("\n=== Missing values after handling ===")
print(df.isnull().sum())

In [None]:
# Convert columns to appropriate data types.
# 'ca' and 'thal' were read as float because they contained missing values;
# now that the gaps are filled they can be stored as integers, like 'target'.
for col in ['ca', 'thal', 'target']:
    df[col] = df[col].astype(int)

print("=== Data types after conversion ===")
# df.info() prints its own report and returns None, so it is called directly
# rather than through print().
df.info()

In [None]:
# Bar chart of value counts for every categorical (coded) column, on a 3x3 grid
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

fig, axes = plt.subplots(3, 3, figsize=(15, 10))
bar_palette = sns.color_palette('viridis')

# One panel per feature; the grid has exactly as many cells as features
for ax, col in zip(axes.flat, categorical_features):
    df[col].value_counts().plot(kind='bar', color=bar_palette, rot=0, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
# Histogram with KDE overlay for each continuous column
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

fig = plt.figure(figsize=(15, 10))
hist_palette = sns.color_palette('viridis')

# Panels are added one at a time so the unused sixth cell of the 2x3 grid stays blank
for i, col in enumerate(numerical_features):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.histplot(df[col], kde=True, color=hist_palette[i], ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Feature dictionary for reference.
# The code ranges below follow the encoding of the UCI processed.cleveland.data
# file loaded in this notebook (cp 1-4, slope 1-3, thal 3/6/7), which differs
# from the 0-based re-encoding used by some redistributed copies of the data.
feature_dict = {
    'age': 'Age in years',
    'sex': 'Sex (1 = male, 0 = female)',
    'cp': 'Chest pain type (1-4: 1 = typical angina, 2 = atypical angina, 3 = non-anginal pain, 4 = asymptomatic)',
    'trestbps': 'Resting blood pressure (mm Hg)',
    'chol': 'Serum cholesterol (mg/dl)',
    'fbs': 'Fasting blood sugar > 120 mg/dl (1 = true, 0 = false)',
    'restecg': 'Resting ECG results (0-2)',
    'thalach': 'Maximum heart rate achieved',
    'exang': 'Exercise induced angina (1 = yes, 0 = no)',
    'oldpeak': 'ST depression induced by exercise',
    'slope': 'Slope of peak exercise ST segment (1-3: 1 = upsloping, 2 = flat, 3 = downsloping)',
    'ca': 'Number of major vessels colored by fluoroscopy (0-3)',
    'thal': 'Thalassemia (3 = normal, 6 = fixed defect, 7 = reversable defect)',
    'target': 'Heart disease presence (0 = no disease, > 0 = disease; raw UCI values range 0-4)'
}

print("=== Feature Descriptions ===")
for feature, description in feature_dict.items():
    print(f"{feature}: {description}")

Alternative data source (optional): install the `kaggle` package first, then download the dataset with the Kaggle CLI.

In [None]:
# Optional: upgrade the kaggle CLI and download the 'ronitf/heart-disease-uci' dataset.
# NOTE(review): the visible cells load `df` from the UCI URL and never read the
# file downloaded here, so this cell looks optional - confirm before relying on it.
# NOTE(review): the kaggle CLI presumably needs API credentials to be configured
# beforehand - verify in the target environment.
!pip install --upgrade kaggle
!kaggle datasets download -v -d ronitf/heart-disease-uci --force

In [None]:
# Optional manual upload (Google Colab only).
# google.colab exists only inside Colab; guarding the import keeps
# "Restart & Run All" working in any other Jupyter environment.
try:
    from google.colab import files
except ImportError:
    files = None
    print("google.colab is not available - skipping the manual file upload.")

# `uploaded` is always defined so later code can inspect it safely.
uploaded = files.upload() if files is not None else {}
for fn in uploaded.keys():
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

In [None]:
# Overview figure: class balance (left) and age distribution per class (right).
fig, (ax_counts, ax_age) = plt.subplots(1, 2, figsize=(12, 5))

# Any non-zero target value counts as disease, so the figure is correct whether
# the column holds a binary label or the raw 0-4 severity grades.
has_disease = df['target'] > 0
status_colors = {0: 'lightblue', 1: 'lightcoral'}  # same colours in both panels

# Subplot 1: Target distribution.
# Sort by label (not by frequency) so bar order and colours are tied to the
# class rather than to whichever class happens to be more common.
class_counts = has_disease.astype(int).value_counts().sort_index()
class_counts.plot(
    kind='bar',
    color=[status_colors[label] for label in class_counts.index],
    rot=0,
    ax=ax_counts,
)
ax_counts.set_title('Heart Disease Distribution')
ax_counts.set_xlabel('Target (0=No Disease, 1=Disease)')
ax_counts.set_ylabel('Count')

# Subplot 2: Age distribution by target
df.loc[~has_disease, 'age'].hist(alpha=0.7, label='No Disease', color=status_colors[0], ax=ax_age)
df.loc[has_disease, 'age'].hist(alpha=0.7, label='Disease', color=status_colors[1], ax=ax_age)
ax_age.set_title('Age Distribution by Heart Disease Status')
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Count')
ax_age.legend()

fig.tight_layout()
plt.show()

Task 2: Data preprocessing (cleaning and preparing the dataset)

In [None]:
# Data quality assessment: size, duplicate rows, missing values and dtypes
print("=== DATA QUALITY ASSESSMENT ===\n")

# 1. Dataset dimensions
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {n_rows}")
print(f"Number of features: {n_cols - 1} (excluding target)")

# 2. Exact duplicate rows are counted and, if present, dropped
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")
if duplicates > 0:
    print("Removing duplicates...")
    df = df.drop_duplicates()
    print(f"New shape after removing duplicates: {df.shape}")

# 3. Missing values per column, as absolute count and share of rows
print("\n=== Missing Values Analysis ===")
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_table = pd.DataFrame({
    'Missing Count': missing_data,
    'Percentage': missing_percent
})
# Only columns that actually have gaps are shown
print(missing_table.loc[missing_data > 0])

# 4. Column dtypes
print("\n=== Data Types Check ===")
print(df.dtypes)

In [None]:
print("=== FEATURE ANALYSIS ===\n")

# Per-feature profile: dtype, cardinality, range and the ten most common values
for column in df.columns:
    if column == 'target':
        continue  # the label is profiled separately, not as a feature

    series = df[column]
    print(f"\n--- {column.upper()} ---")
    print(f"Data type: {series.dtype}")
    print(f"Unique values: {series.nunique()}")
    print(f"Min: {series.min()}, Max: {series.max()}")
    print(f"Value counts:\n{series.value_counts().head(10)}")

# The gaps in 'ca' and 'thal' were already filled earlier in this notebook;
# the helper below is a generic safety net for any missing values that remain

In [None]:
def handle_missing_values(df):
    """Fill missing values column by column and report what was done.

    Columns with dtype int64/float64 are filled with their median; every
    other column is filled with its most frequent value (mode).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Columns that contain missing values are replaced on
        this same object.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, with the gaps filled.
    """
    print("=== HANDLING MISSING VALUES ===")

    # Count the gaps once, BEFORE filling, so the report shows how many values
    # were actually replaced (counting afterwards would always give 0).
    missing = df.isnull().sum()

    if missing.sum() == 0:
        print("No missing values found!")
        return df

    for column in df.columns:
        n_missing = int(missing[column])
        if n_missing == 0:
            continue

        print(f"\nHandling missing values in {column}:")

        if df[column].dtype in ['int64', 'float64']:
            # Numerical feature: the median is more robust to outliers than the mean
            fill_value = df[column].median()
            strategy = "median"
        else:
            # Categorical feature: use the most frequent value
            modes = df[column].mode()
            if modes.empty:
                # Column is entirely missing, so there is nothing to impute from
                print("  - Skipped: column has no observed values")
                continue
            fill_value = modes[0]
            strategy = "mode"

        # Assign the filled Series back instead of fillna(inplace=True) on
        # df[column]; the in-place form acts on a temporary object and is not
        # guaranteed to update `df`.
        df[column] = df[column].fillna(fill_value)
        print(f"  - Filled {n_missing} missing values with {strategy}: {fill_value}")

    return df

# Apply the function and rebind `df` to the frame it returns
df = handle_missing_values(df)

# Outlier detection and handling 
# Detect outliers using statistical methods

In [None]:

def detect_outliers(df, features, plot=True):
    """Detect outliers with the 1.5 * IQR rule and report them per feature.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; it is not modified.
    features : iterable of str
        Column names to check. 'target' and columns whose dtype is not
        int64/float64 are skipped.
    plot : bool, default True
        When True, draw a histogram (with the bounds marked) and a boxplot
        for every feature that has outliers. Pass False for a text-only
        report.

    Returns
    -------
    list
        Index labels of the rows flagged as an outlier in at least one
        feature, without duplicates, in order of first detection.
    """
    print("=== OUTLIER DETECTION ===")

    # dict used as an insertion-ordered set of flagged row labels
    flagged = {}

    for feature in features:
        if feature == 'target' or df[feature].dtype not in ['int64', 'float64']:
            continue

        # Calculate IQR
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1

        # Define outlier bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Find outliers
        outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
        if len(outliers) == 0:
            continue

        # Record the flagged rows so the caller actually receives them
        flagged.update(dict.fromkeys(outliers.index.tolist()))

        print(f"\n{feature}:")
        print(f"  - Lower bound: {lower_bound:.2f}")
        print(f"  - Upper bound: {upper_bound:.2f}")
        print(f"  - Outliers found: {len(outliers)} ({len(outliers)/len(df)*100:.1f}%)")

        if plot:
            # Visualize outliers
            plt.figure(figsize=(10, 4))

            plt.subplot(1, 2, 1)
            df[feature].hist(bins=30, alpha=0.7)
            plt.axvline(lower_bound, color='red', linestyle='--', label=f'Lower bound: {lower_bound:.1f}')
            plt.axvline(upper_bound, color='red', linestyle='--', label=f'Upper bound: {upper_bound:.1f}')
            plt.title(f'{feature} Distribution with Outlier Bounds')
            plt.legend()

            plt.subplot(1, 2, 2)
            df.boxplot(column=feature)
            plt.title(f'{feature} Boxplot')

            plt.tight_layout()
            plt.show()

    return list(flagged)

# Apply outlier detection to every numeric column of the frame
# (detect_outliers itself skips 'target'; its return value is the cell output)
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
detect_outliers(df, numerical_features)

# Feature engineering and selection
# Create new features that might be useful

In [None]:

print("=== FEATURE ENGINEERING ===")

# All bins below are left-closed / right-open (right=False), i.e. [low, high),
# so each threshold value falls into the category its label names:
# age 40 -> '40-50', chol 240 -> 'High', trestbps 120 -> 'Elevated'.
# With the pd.cut default (right-closed bins) those values land one bin lower.

# 1. Age groups
df['age_group'] = pd.cut(df['age'],
                         bins=[0, 40, 50, 60, 70, 100],
                         labels=['<40', '40-50', '50-60', '60-70', '70+'],
                         right=False)

# 2. Cholesterol categories: < 200 desirable, 200-239 borderline, >= 240 high
df['chol_category'] = pd.cut(df['chol'],
                             bins=[0, 200, 240, 1000],
                             labels=['Desirable', 'Borderline', 'High'],
                             right=False)

# 3. Blood pressure categories (resting systolic pressure only).
# NOTE(review): with right-open bins a reading of exactly 180 is labelled
# 'Crisis'; adjust the last edge if 180 should still count as 'Stage2'.
df['bp_category'] = pd.cut(df['trestbps'],
                           bins=[0, 120, 130, 140, 180, 300],
                           labels=['Normal', 'Elevated', 'Stage1', 'Stage2', 'Crisis'],
                           right=False)

# 4. Heart rate efficiency (thalach vs age)
df['heart_rate_efficiency'] = df['thalach'] / df['age']

print("New features created:")
print("- age_group: Categorized age ranges")
print("- chol_category: Cholesterol levels based on medical standards")
print("- bp_category: Blood pressure categories")
print("- heart_rate_efficiency: Ratio of max heart rate to age")

# Data normalization and scaling
# Separate the features from the target


In [None]:
# Separate the feature matrix from the label
X = df.drop('target', axis=1)
y = df['target']

print("=== FEATURE SCALING ===")

# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Handle categorical features (if any)
if categorical_features:
    X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    print(f"After one-hot encoding: {X.shape[1]} features")

# Scale numerical features WITHOUT leaking test-set statistics.
# The scaler's mean/std must come from the training rows only. The row
# partition is derived here with the same settings as the train/test split
# cell (test_size=0.2, random_state=42, stratify=y); for a given number of
# rows and labels those settings select the same rows, so the rows used to
# fit the scaler are the rows that end up in X_train.
# Keep these settings in sync with the split cell.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_positions][numerical_features])

# Apply the training-set transformation to every row
X_scaled = X.copy()
X_scaled[numerical_features] = scaler.transform(X[numerical_features])

print("\nScaling completed!")
print(f"Original feature range example (age): {X['age'].min():.1f} to {X['age'].max():.1f}")
print(f"Scaled feature range example (age): {X_scaled['age'].min():.2f} to {X_scaled['age'].max():.2f}")

In [None]:
# Train-test split (train_test_split is imported in the notebook's first cell)
print("=== TRAIN-TEST SPLIT ===")

# Split the data (80% train, 20% test), stratified on the label so both parts
# keep the class proportions of the full dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training set distribution: {y_train.value_counts().to_dict()}")
print(f"Test set distribution: {y_test.value_counts().to_dict()}")

# Verify the split maintains the class distribution.
# The positive ratio is the share of rows with a non-zero label; using `> 0`
# rather than sum()/len() keeps it a true proportion even if the label still
# holds the raw 0-4 severity grades.
train_ratio = (y_train > 0).mean()
test_ratio = (y_test > 0).mean()
print("\nClass balance check:")
print(f"Training set positive ratio: {train_ratio:.3f}")
print(f"Test set positive ratio: {test_ratio:.3f}")

# Final Data verification 
# Create a comprehensive verification report


In [None]:
print("=== FINAL VERIFICATION ===\n")

def verify_preprocessing(X_train, X_test, y_train, y_test):
    """Print a sanity report for the preprocessed train/test data.

    The report covers the shapes of all four objects, the number of missing
    values in both feature frames, the mean/std of the first three numeric
    training columns, and the label counts of both splits.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames of the training and test split.
    y_train, y_test : pandas.Series
        Labels matching the two feature frames.

    Returns
    -------
    None
        The function only prints; it does not raise on a failed check.
    """
    # 1. Check shapes
    print("1. Shape Verification:")
    print(f"   X_train: {X_train.shape}")
    print(f"   X_test: {X_test.shape}")
    print(f"   y_train: {y_train.shape}")
    print(f"   y_test: {y_test.shape}")

    # 2. Check for missing values
    print("\n2. Missing Values Check:")
    print(f"   X_train missing: {X_train.isnull().sum().sum()}")
    print(f"   X_test missing: {X_test.isnull().sum().sum()}")

    # 3. Check scaling: standardized columns should show mean ~0 and std ~1
    print("\n3. Scaling Verification (first 3 numerical features):")
    numerical_cols = X_train.select_dtypes(include=[np.number]).columns[:3]
    for col in numerical_cols:
        print(f"   {col}: mean={X_train[col].mean():.3f}, std={X_train[col].std():.3f}")

    # 4. Check target distribution
    print("\n4. Target Distribution:")
    print(f"   Training: {y_train.value_counts().to_dict()}")
    print(f"   Testing: {y_test.value_counts().to_dict()}")

    # The check mark was stored as mis-decoded bytes ("âœ…"); restored to U+2705
    print("\n✅ Preprocessing completed successfully!")

# Run verification on the split produced by the train/test split cell
verify_preprocessing(X_train, X_test, y_train, y_test)


In [None]:
import pickle

# Bundle the split data together with the column order of the feature frame
preprocessed_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    feature_names=list(X_train.columns),
)

# Persist the fitted scaler first, then the data bundle, one pickle file each
for target_path, payload in (
    ('scaler.pkl', scaler),
    ('preprocessed_data.pkl', preprocessed_data),
):
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle)

print("Preprocessed data saved!")
print("Files created: scaler.pkl, preprocessed_data.pkl")

# Summary Visualization


In [None]:
# NOTE(review): this cell repeats the previous save cell statement for
# statement - it rewrites scaler.pkl and preprocessed_data.pkl with the same
# objects. It sits under a "Summary Visualization" heading but draws nothing;
# consider replacing it with the intended plot or removing it.
import pickle

# Save the scaler for later use
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the preprocessed data
# (the split frames/series plus the column order of X_train)
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': X_train.columns.tolist()
}

with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)

print("Preprocessed data saved!")
print("Files created: scaler.pkl, preprocessed_data.pkl")