In [None]:
# Install required packages 
!pip install kaggle
!pip install pandas numpy scikit-learn matplotlib seaborn

# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

# Pin NumPy's global random generator so repeated runs give the same results
SEED = 42
np.random.seed(SEED)

print("All libraries imported successfully!")

In [None]:
# Fetch the processed Cleveland heart-disease data straight from the UCI repository
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# The file has no header row, so the column names are supplied by hand,
# following the dataset description.
column_names = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'target',
]

# Missing entries are written as '?' in the file; read them in as NaN
df = pd.read_csv(
    url,
    names=column_names,
    na_values="?",
)

print(
    "Dataset loaded successfully!",
    f"Dataset shape: {df.shape}",
    sep="\n",
)

In [None]:
# Exploratory overview of `df`, the DataFrame loaded in the previous cell.

# 1. View first few rows
print("=== First 5 rows of the dataset ===")
print(df.head())
print("\n")

# 2. Check dataset info (data types, non-null counts)
print("=== Dataset Information ===")
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\n")

# 3. Summary statistics
print("=== Summary Statistics ===")
print(df.describe())
print("\n")

# 4. Check for missing values
print("=== Missing Values Check ===")
print(df.isnull().sum())
print("\n")

# 5. Check target variable distribution
print("=== Target Variable Distribution ===")
print(df['target'].value_counts())
# A row counts as "heart disease" when target > 0. Summing the raw column only
# yields a percentage if target is strictly 0/1; the UCI description of the
# Cleveland file gives the diagnosis as 0 (no disease) through 4, in which case
# the raw sum would overstate the share. The comparison is correct either way.
disease_pct = (df['target'] > 0).mean() * 100
print(f"Percentage with heart disease: {disease_pct:.1f}%")

In [None]:
# Check for missing values again after loading with '?' as NA
print("=== Missing values after loading with na_values='?' ===")
print(df.isnull().sum())
print("\n")

# Handle missing values: fill the gaps in 'ca' and 'thal' with each column's mode
for col in ['ca', 'thal']:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        # Assign the filled column back to the frame. `df[col].fillna(...,
        # inplace=True)` operates on an intermediate Series: it triggers a
        # chained-assignment FutureWarning on pandas 2.x and leaves `df`
        # unchanged once Copy-on-Write is active.
        df[col] = df[col].fillna(mode_value)
        print(f"Filled missing values in '{col}' with the mode: {mode_value}")

print("\n=== Missing values after handling ===")
print(df.isnull().sum())