In [None]:
# Install required packages
%pip install openml 

%pip install pandas 

%pip install scikit-learn 

%pip install matplotlib 

%pip install seaborn

In [None]:
import openml
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier  # Added XGBoost


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
dataset_id = 45547  # OpenML dataset ID passed to openml.datasets.get_dataset
csv_filename = "cardiovascular_disease_dataset.csv"  # local cache of the raw data
RANDOM_STATE = 42  # shared seed so the split and the stochastic models are reproducible

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
# Only the loading step is guarded: the "Error fetching dataset" message is
# accurate only for failures that happen here. The exception is re-raised so
# the notebook stops instead of continuing without `df` and failing later with
# an unrelated NameError.
try:
    if os.path.exists(csv_filename):
        # Reuse the local copy to avoid re-downloading.
        print("Loading dataset from local CSV...")
        df = pd.read_csv(csv_filename)
    else:
        print("Fetching dataset from OpenML...")
        dataset = openml.datasets.get_dataset(dataset_id)
        # get_data() returns a 4-tuple; only the first element (the frame) is used.
        df = dataset.get_data()[0]
        df.to_csv(csv_filename, index=False)  # Save the raw data locally
except Exception as e:
    print(f"\nError fetching dataset: {e}")
    raise

# Display first few rows
print("\nDataset Preview:")
print(df.head())

# Show column names for verification
print("\nDataset Columns:")
print(df.columns)

# Check for missing values (reported before imputation)
print("\nMissing Values Summary:")
print(df.isnull().sum())

# ---------------------------------------------------------------------------
# Data cleaning and corrections
# ---------------------------------------------------------------------------
# Convert age from days to years (integer division, so ages are whole years)
df['age'] = df['age'] // 365

# Impute missing values with the per-column median. numeric_only=True keeps
# this working when the frame contains non-numeric columns, for which
# DataFrame.median() raises in recent pandas versions.
df = df.fillna(df.median(numeric_only=True))

# Store the target compactly as a small integer
df = df.astype({"cardio": "int8"})

# Drop rows with unrealistic blood pressure, height or weight values.
# This runs BEFORE feature engineering so that the ratio below is never
# computed for rows with ap_lo == 0; the resulting frame is the same as
# filtering afterwards because the filters only look at raw columns.
plausible_rows = (
    (df['ap_hi'] > 50) & (df['ap_hi'] < 250)
    & (df['ap_lo'] > 30) & (df['ap_lo'] < 180)
    & (df['height'] > 100) & (df['height'] < 230)
    & (df['weight'] > 30) & (df['weight'] < 150)
)
# .copy() gives an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
df = df.loc[plausible_rows].copy()

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
# height is divided by 100 before squaring (presumably cm -> m; verify units)
df['bmi'] = df['weight'] / (df['height'] / 100) ** 2
df['blood_pressure_ratio'] = df['ap_hi'] / df['ap_lo']

# Display basic statistics of numerical features
print("\nDataset Summary Statistics:")
print(df.describe())

# ---------------------------------------------------------------------------
# Exploratory Data Analysis (EDA)
# ---------------------------------------------------------------------------
sns.set_style("whitegrid")

# Histogram for Age Distribution
plt.figure(figsize=(10, 5))
sns.histplot(df['age'], bins=30, kde=True)
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

# Correlation Heatmap (numeric columns only; corr() rejects non-numeric
# columns in recent pandas versions unless numeric_only=True is passed)
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Matrix")
plt.show()

# Boxplot for Blood Pressure vs. Cardiovascular Disease
plt.figure(figsize=(10, 5))
sns.boxplot(x=df['cardio'], y=df['ap_hi'], hue=df['cardio'], palette="Set2", legend=False)
plt.title("Systolic Blood Pressure vs. Cardiovascular Disease")
plt.xlabel("Cardiovascular Disease (0 = No, 1 = Yes)")
plt.ylabel("Systolic Blood Pressure (ap_hi)")
plt.show()

# ---------------------------------------------------------------------------
# Machine learning models
# ---------------------------------------------------------------------------
X = df.drop(columns=['cardio'])
y = df['cardio']

# Split FIRST, then fit the scaler on the training rows only. Fitting the
# scaler on the full dataset (as before) leaks test-set mean/variance into
# training and makes the reported test accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so any later cell that references X_scaled still works; it is the full
# feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Train models. The stochastic estimators share RANDOM_STATE so the printed
# accuracies are reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(f"{name} Accuracy: {accuracy:.4f}")




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

# **Hyperparameter Tuning using GridSearchCV**
# One search space per model; each key must also be a key of the `models`
# dict built in the previous cell, because the estimator is looked up by name.
param_grid = {
    "Random Forest": dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    "Gradient Boosting": dict(
        learning_rate=[0.01, 0.1, 0.2],
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
    ),
    "XGBoost": dict(
        learning_rate=[0.01, 0.1, 0.2],
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
    ),
}

# Run a 5-fold, accuracy-scored grid search for every model that has a search
# space and keep the best estimator found for each.
best_models = {}
for name, params in param_grid.items():
    grid_search = GridSearchCV(
        estimator=models[name],
        param_grid=params,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,  # use all available cores
    )
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# **Feature Selection using SelectKBest**
# Score each feature against the target with f_classif and keep the top 5.
feature_selector = SelectKBest(f_classif, k=5)
feature_selector.fit(X_train, y_train)
X_selected = feature_selector.transform(X_train)

# X_train is a plain array, so the boolean support mask is applied to
# X.columns to recover the names of the selected features.
selected_mask = feature_selector.get_support()
print("\nSelected top 5 features:", X.columns[selected_mask])