# Decision Tree

In [27]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioDecisionTreeClassifier:
    """
    A reusable Decision Tree classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio (stratified on the target)
    - Optional feature scaling (scaler fitted on the training split only)
    - Comprehensive performance metrics
    - Export predictions to CSV

    Expected call order: load_data() -> preprocess_data() -> train() ->
    evaluate() / predict() / export_predictions(). run_full_pipeline()
    chains all of these steps.
    """

    def __init__(self, test_size=0.2, random_state=42, max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, scale_features=True,
                 target_col='cardio', drop_cols=None):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        max_depth : int, default=None
            Maximum depth of the tree
        min_samples_split : int, default=2
            Minimum samples required to split a node
        min_samples_leaf : int, default=1
            Minimum samples required at a leaf node
        scale_features : bool, default=True
            Whether to scale features
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)
        """
        # Configuration
        self.test_size = test_size
        self.random_state = random_state
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.scale_features = scale_features
        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        # State populated by load_data() / preprocess_data() / train() / evaluate()
        self.model = None
        self.scaler = None
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self):
        """
        Preprocess the data for training.
        Uses `self.target_col` and `self.drop_cols` from initialization.

        Splits the loaded data into stratified train/test sets and, when
        `scale_features` is enabled, standardizes both splits with a scaler
        fitted on the training split only.

        Raises:
        -------
        ValueError : If no data has been loaded
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[self.target_col])
        y = df[self.target_col]

        # Remember the training column order; prediction inputs are aligned to it.
        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the Decision Tree classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def _prepare_features(self, X):
        """Align raw input to the training columns and scale it if configured."""
        is_frame = isinstance(X, pd.DataFrame)
        if is_frame:
            # Selecting by name drops extra columns (IDs, target) and restores
            # the training column order; a missing column raises KeyError.
            X = X[self.feature_names]

        if self.scale_features and self.scaler is not None:
            scaled = self.scaler.transform(X)
            if is_frame:
                return pd.DataFrame(scaled, columns=self.feature_names, index=X.index)
            return scaled
        return X

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame, default=None
            Raw (unscaled) data to predict. If None, uses test set.
            Extra columns (e.g. ID or target columns) are ignored and the
            remaining columns are reordered to match the training features.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model is not trained, or X lacks a feature column
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            # The stored test split was already aligned/scaled in preprocess_data().
            return self.model.predict(self.X_test)

        try:
            features = self._prepare_features(X)
        except KeyError as err:
            raise ValueError(f"Input is missing required feature columns: {err}") from err

        return self.model.predict(features)

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model is not trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                      target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions for the *entire* loaded dataset to a new CSV file.

        Note that the loaded dataset includes the rows the model was trained
        on, so the exported predictions are not a held-out evaluation.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target (when present in the loaded data) and predicted columns.

        Returns:
        --------
        pd.DataFrame : The dataframe that was written to `output_filepath`

        Raises:
        -------
        ValueError : If the model is not trained or no data is loaded
        KeyError : If the loaded data lacks one of the training feature columns
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        predictions = self.model.predict(self._prepare_features(self.data))

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame(index=self.data.index)
            # Unlabeled data (loaded after training) has no target column to echo.
            if self.target_col in self.data.columns:
                output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath, include_all_data=True):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        include_all_data : bool, default=True
            Passed to export_predictions to control output file content.

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data()
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath, include_all_data=include_all_data)

        print("Pipeline completed successfully!")
        return metrics


# Example: train and evaluate the decision tree end to end
if __name__ == "__main__":
    # Every data-handling and model option is fixed when the object is built.
    clf = CardioDecisionTreeClassifier(
        target_col='cardio',
        drop_cols=['id'],  # ID column is excluded from the features
        scale_features=True,
        test_size=0.2,
        random_state=42,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
    )

    # The pipeline itself only needs the input and output paths.
    metrics = clf.run_full_pipeline('train.csv', 'cardio_train_pred.csv')


Starting full pipeline...

Data loaded successfully: 68520 rows, 13 columns
Data preprocessed: Train size = 54816, Test size = 13704
Model trained successfully!

MODEL PERFORMANCE METRICS
Accuracy:  0.7211
Precision: 0.7326
Recall:    0.6865
F1-Score:  0.7088

Confusion Matrix:
[[5231 1698]
 [2124 4651]]

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.71      0.75      0.73      6929
     Disease       0.73      0.69      0.71      6775

    accuracy                           0.72     13704
   macro avg       0.72      0.72      0.72     13704
weighted avg       0.72      0.72      0.72     13704


Predictions exported to: cardio_train_pred.csv
Pipeline completed successfully!


# Logistic Regression


In [32]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioLogisticRegressionClassifier:
    """
    A reusable Logistic Regression classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio (stratified on the target)
    - Optional feature scaling (scaler fitted on the training split only)
    - Comprehensive performance metrics
    - Export predictions to CSV

    Expected call order: load_data() -> preprocess_data() -> train() ->
    evaluate() / predict() / export_predictions(). run_full_pipeline()
    chains all of these steps.
    """

    def __init__(self, test_size=0.2, random_state=42, C=1.0,
                 solver='lbfgs', max_iter=100, scale_features=True,
                 target_col='cardio', drop_cols=None):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        C : float, default=1.0
            Inverse of regularization strength; smaller values specify stronger regularization
        solver : str, default='lbfgs'
            Algorithm to use in the optimization problem
        max_iter : int, default=100
            Maximum number of iterations taken for the solvers to converge
        scale_features : bool, default=True
            Whether to scale features
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)
        """
        # Configuration
        self.test_size = test_size
        self.random_state = random_state
        self.C = C
        self.solver = solver
        self.max_iter = max_iter
        self.scale_features = scale_features
        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        # State populated by load_data() / preprocess_data() / train() / evaluate()
        self.model = None
        self.scaler = None
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self):
        """
        Preprocess the data for training.
        Uses `self.target_col` and `self.drop_cols` from initialization.

        Splits the loaded data into stratified train/test sets and, when
        `scale_features` is enabled, standardizes both splits with a scaler
        fitted on the training split only.

        Raises:
        -------
        ValueError : If no data has been loaded
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[self.target_col])
        y = df[self.target_col]

        # Remember the training column order; prediction inputs are aligned to it.
        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the Logistic Regression classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = LogisticRegression(
            C=self.C,
            solver=self.solver,
            max_iter=self.max_iter,
            random_state=self.random_state
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def _prepare_features(self, X):
        """Align raw input to the training columns and scale it if configured."""
        is_frame = isinstance(X, pd.DataFrame)
        if is_frame:
            # Selecting by name drops extra columns (IDs, target) and restores
            # the training column order; a missing column raises KeyError.
            X = X[self.feature_names]

        if self.scale_features and self.scaler is not None:
            scaled = self.scaler.transform(X)
            if is_frame:
                return pd.DataFrame(scaled, columns=self.feature_names, index=X.index)
            return scaled
        return X

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame, default=None
            Raw (unscaled) data to predict. If None, uses test set.
            Extra columns (e.g. ID or target columns) are ignored and the
            remaining columns are reordered to match the training features.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model is not trained, or X lacks a feature column
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            # The stored test split was already aligned/scaled in preprocess_data().
            return self.model.predict(self.X_test)

        try:
            features = self._prepare_features(X)
        except KeyError as err:
            raise ValueError(f"Input is missing required feature columns: {err}") from err

        return self.model.predict(features)

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model is not trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                      target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions for the *entire* loaded dataset to a new CSV file.

        Note that the loaded dataset includes the rows the model was trained
        on, so the exported predictions are not a held-out evaluation.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target (when present in the loaded data) and predicted columns.

        Returns:
        --------
        pd.DataFrame : The dataframe that was written to `output_filepath`

        Raises:
        -------
        ValueError : If the model is not trained or no data is loaded
        KeyError : If the loaded data lacks one of the training feature columns
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        predictions = self.model.predict(self._prepare_features(self.data))

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame(index=self.data.index)
            # Unlabeled data (loaded after training) has no target column to echo.
            if self.target_col in self.data.columns:
                output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath, include_all_data=True):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        include_all_data : bool, default=True
            Passed to export_predictions to control output file content.

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data()
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath, include_all_data=include_all_data)

        print("Pipeline completed successfully!")
        return metrics


# Example: train and evaluate the logistic regression end to end
if __name__ == "__main__":
    # Every data-handling and model option is fixed when the object is built.
    clf = CardioLogisticRegressionClassifier(
        target_col='cardio',
        drop_cols=['id'],  # ID column is excluded from the features
        scale_features=True,
        test_size=0.3,
        random_state=12,
        solver='lbfgs',
        C=1.0,
        max_iter=10000,  # raised well above the default of 100 for convergence
    )

    # The pipeline itself only needs the input and output paths.
    metrics = clf.run_full_pipeline('cardio_train.csv', 'cardio_train_pred_lr.csv')


Starting full pipeline...

Data loaded successfully: 70000 rows, 13 columns
Data preprocessed: Train size = 49000, Test size = 21000
Model trained successfully!

MODEL PERFORMANCE METRICS
Accuracy:  0.7147
Precision: 0.7348
Recall:    0.6714
F1-Score:  0.7017

Confusion Matrix:
[[7963 2543]
 [3448 7046]]

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.70      0.76      0.73     10506
     Disease       0.73      0.67      0.70     10494

    accuracy                           0.71     21000
   macro avg       0.72      0.71      0.71     21000
weighted avg       0.72      0.71      0.71     21000


Predictions exported to: cardio_train_pred_lr.csv
Pipeline completed successfully!


# Random Forest


In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioRandomForestClassifier:
    """
    A reusable Random Forest classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio (stratified on the target)
    - Optional feature scaling (scaler fitted on the training split only)
    - Comprehensive performance metrics
    - Export predictions to CSV

    Expected call order: load_data() -> preprocess_data() -> train() ->
    evaluate() / predict() / export_predictions(). run_full_pipeline()
    chains all of these steps.
    """

    def __init__(self, test_size=0.2, random_state=42, n_estimators=100,
                 max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 scale_features=True, target_col='cardio', drop_cols=None):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        n_estimators : int, default=100
            Number of trees in the forest
        max_depth : int, default=None
            Maximum depth of the tree
        min_samples_split : int, default=2
            Minimum samples required to split a node
        min_samples_leaf : int, default=1
            Minimum samples required at a leaf node
        scale_features : bool, default=True
            Whether to scale features
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)
        """
        # Configuration
        self.test_size = test_size
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.scale_features = scale_features
        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        # State populated by load_data() / preprocess_data() / train() / evaluate()
        self.model = None
        self.scaler = None
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self):
        """
        Preprocess the data for training.
        Uses `self.target_col` and `self.drop_cols` from initialization.

        Splits the loaded data into stratified train/test sets and, when
        `scale_features` is enabled, standardizes both splits with a scaler
        fitted on the training split only.

        Raises:
        -------
        ValueError : If no data has been loaded
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[self.target_col])
        y = df[self.target_col]

        # Remember the training column order; prediction inputs are aligned to it.
        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the Random Forest classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def _prepare_features(self, X):
        """Align raw input to the training columns and scale it if configured."""
        is_frame = isinstance(X, pd.DataFrame)
        if is_frame:
            # Selecting by name drops extra columns (IDs, target) and restores
            # the training column order; a missing column raises KeyError.
            X = X[self.feature_names]

        if self.scale_features and self.scaler is not None:
            scaled = self.scaler.transform(X)
            if is_frame:
                return pd.DataFrame(scaled, columns=self.feature_names, index=X.index)
            return scaled
        return X

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame, default=None
            Raw (unscaled) data to predict. If None, uses test set.
            Extra columns (e.g. ID or target columns) are ignored and the
            remaining columns are reordered to match the training features.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model is not trained, or X lacks a feature column
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            # The stored test split was already aligned/scaled in preprocess_data().
            return self.model.predict(self.X_test)

        try:
            features = self._prepare_features(X)
        except KeyError as err:
            raise ValueError(f"Input is missing required feature columns: {err}") from err

        return self.model.predict(features)

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model is not trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                      target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions for the *entire* loaded dataset to a new CSV file.

        Note that the loaded dataset includes the rows the model was trained
        on, so the exported predictions are not a held-out evaluation.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target (when present in the loaded data) and predicted columns.

        Returns:
        --------
        pd.DataFrame : The dataframe that was written to `output_filepath`

        Raises:
        -------
        ValueError : If the model is not trained or no data is loaded
        KeyError : If the loaded data lacks one of the training feature columns
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        predictions = self.model.predict(self._prepare_features(self.data))

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame(index=self.data.index)
            # Unlabeled data (loaded after training) has no target column to echo.
            if self.target_col in self.data.columns:
                output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath, include_all_data=True):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        include_all_data : bool, default=True
            Passed to export_predictions to control output file content.

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data()
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath, include_all_data=include_all_data)

        print("Pipeline completed successfully!")
        return metrics


# Example: train and evaluate the random forest end to end
if __name__ == "__main__":
    # Every data-handling and model option is fixed when the object is built.
    clf = CardioRandomForestClassifier(
        target_col='cardio',
        drop_cols=['id'],  # ID column is excluded from the features
        scale_features=True,
        test_size=0.2,
        random_state=42,
        n_estimators=100,
        max_depth=1000,
        min_samples_split=20,
        min_samples_leaf=10,
    )

    # The pipeline itself only needs the input and output paths.
    metrics = clf.run_full_pipeline('cardio_train.csv', 'cardio_train_pred_rf.csv')


Starting full pipeline...

Data loaded successfully: 70000 rows, 13 columns
Data preprocessed: Train size = 56000, Test size = 14000
Model trained successfully!

MODEL PERFORMANCE METRICS
Accuracy:  0.7313
Precision: 0.7492
Recall:    0.6950
F1-Score:  0.7210

Confusion Matrix:
[[5376 1628]
 [2134 4862]]

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.72      0.77      0.74      7004
     Disease       0.75      0.69      0.72      6996

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000


Predictions exported to: cardio_train_pred_rf.csv
Pipeline completed successfully!


# Naive Bayes


In [35]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioNaiveBayesClassifier:
    """
    A reusable Gaussian Naive Bayes classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio
    - Optional feature scaling
    - Comprehensive performance metrics
    - Export predictions to CSV
    """

    def __init__(self, test_size=0.2, random_state=42, var_smoothing=1e-9,
                 scale_features=True, target_col='cardio', drop_cols=None):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        var_smoothing : float, default=1e-9
            Value forwarded to GaussianNB(var_smoothing=...)
        scale_features : bool, default=True
            Whether to scale features
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)
        """
        self.test_size = test_size
        self.random_state = random_state
        self.var_smoothing = var_smoothing
        self.scale_features = scale_features
        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        # State filled in by load_data() / preprocess_data() / train() / evaluate().
        self.model = None
        self.scaler = None
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self):
        """
        Preprocess the data for training.
        Uses `self.target_col` and `self.drop_cols` from initialization.

        Raises:
        -------
        ValueError : If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[self.target_col])
        y = df[self.target_col]

        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            # The scaler is fitted on the training split only and then
            # applied to the test split.
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the Naive Bayes classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called.
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = GaussianNB(
            var_smoothing=self.var_smoothing
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def _prepare_features(self, X):
        """Align raw features to the training columns and apply the fitted scaler."""
        if not isinstance(X, pd.DataFrame):
            # Array-like input: attach the training column names.
            X = pd.DataFrame(X, columns=self.feature_names)
        elif self.feature_names is not None:
            # Drop extra columns (ID, target, ...) and restore training order.
            X = X[self.feature_names]

        if self.scale_features and self.scaler is not None:
            X = pd.DataFrame(
                self.scaler.transform(X),
                columns=X.columns,
                index=X.index
            )
        return X

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame or array-like, default=None
            Raw (unscaled) data to predict. Extra columns are ignored and the
            training column order is restored. If None, uses the test set.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            # The stored test split was already scaled in preprocess_data().
            return self.model.predict(self.X_test)

        return self.model.predict(self._prepare_features(X))

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                    target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions for the *entire* loaded dataset to a new CSV file.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target and predicted columns.

        Returns:
        --------
        pd.DataFrame : The frame that was written to disk

        Raises:
        -------
        ValueError : If the model has not been trained or no data is loaded.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        # Same column selection and scaling path as predict() on external data.
        predictions = self.model.predict(self._prepare_features(self.data))

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame()
            output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath, include_all_data=True):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        include_all_data : bool, default=True
            Passed to export_predictions to control output file content.

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data()
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath, include_all_data=include_all_data)

        print("Pipeline completed successfully!")
        return metrics


# Example usage
if __name__ == "__main__":
    # All model and preprocessing options are supplied to the constructor,
    # so the pipeline call below only needs the input and output paths.
    clf = CardioNaiveBayesClassifier(
        var_smoothing=1e-9,
        test_size=0.2, random_state=42,
        scale_features=True,
        target_col='cardio',
        drop_cols=['id'],
    )

    # Load, preprocess, train, evaluate and export in a single call.
    metrics = clf.run_full_pipeline(
        output_filepath='cardio_train_pred_nb.csv',
        input_filepath='train.csv',
    )


Starting full pipeline...

Data loaded successfully: 68520 rows, 13 columns
Data preprocessed: Train size = 54816, Test size = 13704
Model trained successfully!

MODEL PERFORMANCE METRICS
Accuracy:  0.7072
Precision: 0.7501
Recall:    0.6114
F1-Score:  0.6737

Confusion Matrix:
[[5549 1380]
 [2633 4142]]

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.68      0.80      0.73      6929
     Disease       0.75      0.61      0.67      6775

    accuracy                           0.71     13704
   macro avg       0.71      0.71      0.70     13704
weighted avg       0.71      0.71      0.70     13704


Predictions exported to: cardio_train_pred_nb.csv
Pipeline completed successfully!


# Gradient Boosting


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioGradientBoostingClassifier:
    """
    A reusable Gradient Boosting classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio
    - Optional feature scaling
    - Comprehensive performance metrics
    - Export predictions to CSV
    """

    def __init__(self, test_size=0.2, random_state=42, n_estimators=100,
                 learning_rate=0.1, max_depth=3, scale_features=True,
                 target_col='cardio', drop_cols=None):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility (split and model)
        n_estimators : int, default=100
            The number of boosting stages to perform
        learning_rate : float, default=0.1
            Learning rate shrinks the contribution of each tree by learning_rate
        max_depth : int, default=3
            Maximum depth of the individual regression estimators
        scale_features : bool, default=True
            Whether to scale features
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)
        """
        self.test_size = test_size
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.scale_features = scale_features
        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        # State filled in by load_data() / preprocess_data() / train() / evaluate().
        self.model = None
        self.scaler = None
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self):
        """
        Preprocess the data for training.
        Uses `self.target_col` and `self.drop_cols` from initialization.

        Raises:
        -------
        ValueError : If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[self.target_col])
        y = df[self.target_col]

        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            # The scaler is fitted on the training split only and then
            # applied to the test split.
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the Gradient Boosting classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called.
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = GradientBoostingClassifier(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            random_state=self.random_state
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def _prepare_features(self, X):
        """Align raw features to the training columns and apply the fitted scaler."""
        if not isinstance(X, pd.DataFrame):
            # Array-like input: attach the training column names.
            X = pd.DataFrame(X, columns=self.feature_names)
        elif self.feature_names is not None:
            # Drop extra columns (ID, target, ...) and restore training order.
            X = X[self.feature_names]

        if self.scale_features and self.scaler is not None:
            X = pd.DataFrame(
                self.scaler.transform(X),
                columns=X.columns,
                index=X.index
            )
        return X

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame or array-like, default=None
            Raw (unscaled) data to predict. Extra columns are ignored and the
            training column order is restored. If None, uses the test set.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            # The stored test split was already scaled in preprocess_data().
            return self.model.predict(self.X_test)

        return self.model.predict(self._prepare_features(X))

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                    target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions for the *entire* loaded dataset to a new CSV file.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target and predicted columns.

        Returns:
        --------
        pd.DataFrame : The frame that was written to disk

        Raises:
        -------
        ValueError : If the model has not been trained or no data is loaded.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        # Same column selection and scaling path as predict() on external data.
        predictions = self.model.predict(self._prepare_features(self.data))

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame()
            output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath, include_all_data=True):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        include_all_data : bool, default=True
            Passed to export_predictions to control output file content.

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data()
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath, include_all_data=include_all_data)

        print("Pipeline completed successfully!")
        return metrics


# Example usage
if __name__ == "__main__":
    # All model and preprocessing options are supplied to the constructor,
    # so the pipeline call below only needs the input and output paths.
    # NOTE(review): the output recorded for this cell stops after the
    # preprocessing message, so training with n_estimators=1000 and
    # max_depth=100 may not have finished -- confirm these values are intended.
    clf = CardioGradientBoostingClassifier(
        n_estimators=1000, learning_rate=0.1, max_depth=100,
        test_size=0.2, random_state=42,
        scale_features=True,
        target_col='cardio',
        drop_cols=['id'],
    )

    # Load, preprocess, train, evaluate and export in a single call.
    metrics = clf.run_full_pipeline(
        output_filepath='cardio_train_pred_gb.csv',
        input_filepath='train.csv',
    )


Starting full pipeline...

Data loaded successfully: 68520 rows, 13 columns
Data preprocessed: Train size = 54816, Test size = 13704


# SVM 


In [12]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioSupportVectorClassifier:
    """
    A reusable Support Vector Machine classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio
    - Optional feature scaling
    - Comprehensive performance metrics
    - Export predictions to CSV
    """

    def __init__(self, test_size=0.2, random_state=42, C=1.0,
                 kernel='rbf', gamma='scale', scale_features=True):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        C : float, default=1.0
            Regularization parameter. The strength of the regularization is inversely proportional to C
        kernel : str, default='rbf'
            Specifies the kernel type to be used in the algorithm
        gamma : float or str, default='scale'
            Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        scale_features : bool, default=True
            Whether to scale features
        """
        self.test_size = test_size
        self.random_state = random_state
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.scale_features = scale_features

        # State filled in by load_data() / preprocess_data() / train() / evaluate().
        self.model = None
        self.scaler = None
        # Initialised here so the "no data loaded" guards raise ValueError
        # rather than AttributeError when load_data() was never called.
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}
        self.drop_cols = None
        # Overwritten by preprocess_data(); export_predictions() relies on it.
        self.target_col = 'cardio'

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self, target_col='cardio', drop_cols=None):
        """
        Preprocess the data for training.

        Parameters:
        -----------
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)

        Raises:
        -------
        ValueError : If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        # Remember both choices so export_predictions() rebuilds the same features.
        self.drop_cols = drop_cols if drop_cols else []
        self.target_col = target_col

        df = self.data.copy()
        if drop_cols:
            df = df.drop(columns=drop_cols, errors='ignore')

        X = df.drop(columns=[target_col])
        y = df[target_col]

        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            # The scaler is fitted on the training split only and then
            # applied to the test split.
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the Support Vector Machine classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called.
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = SVC(
            C=self.C,
            kernel=self.kernel,
            gamma=self.gamma,
            random_state=self.random_state
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def _prepare_features(self, X):
        """Align raw features to the training columns and apply the fitted scaler."""
        if not isinstance(X, pd.DataFrame):
            # Array-like input: attach the training column names.
            X = pd.DataFrame(X, columns=self.feature_names)
        elif self.feature_names is not None:
            # Drop extra columns (ID, target, ...) and restore training order.
            X = X[self.feature_names]

        if self.scale_features and self.scaler is not None:
            X = pd.DataFrame(
                self.scaler.transform(X),
                columns=X.columns,
                index=X.index
            )
        return X

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame or array-like, default=None
            Raw (unscaled) data to predict. Extra columns are ignored and the
            training column order is restored. If None, uses the test set.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            # The stored test split was already scaled in preprocess_data().
            return self.model.predict(self.X_test)

        return self.model.predict(self._prepare_features(X))

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                    target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions for the entire loaded dataset to a new CSV file.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target and predicted columns.

        Returns:
        --------
        pd.DataFrame : The frame that was written to disk

        Raises:
        -------
        ValueError : If the model has not been trained or no data is loaded.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        # Rebuild the feature frame with the same drops used for training;
        # DataFrame.drop returns a new frame, so self.data is left untouched.
        X_full = self.data
        if self.drop_cols:
            X_full = X_full.drop(columns=self.drop_cols, errors='ignore')
        X_full = X_full.drop(columns=[self.target_col], errors='ignore')

        predictions = self.model.predict(self._prepare_features(X_full))

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame()
            output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath,
                          target_col='cardio', drop_cols=None,
                          include_all_data=True):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        target_col : str, default='cardio'
            Name of target variable
        drop_cols : list, default=None
            Columns to drop before training
        include_all_data : bool, default=True
            Passed to export_predictions to control output file content.

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data(target_col=target_col, drop_cols=drop_cols)
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath, include_all_data=include_all_data)

        print("Pipeline completed successfully!")
        return metrics


# Example usage
if __name__ == "__main__":
    # Model hyperparameters go to the constructor; the columns to drop are
    # passed to the pipeline call for this class.
    clf = CardioSupportVectorClassifier(
        C=1.0, kernel='rbf', gamma='scale',
        test_size=0.2, random_state=42,
        scale_features=True,
    )

    # Load, preprocess, train, evaluate and export in a single call.
    metrics = clf.run_full_pipeline(
        drop_cols=['id'],
        output_filepath='cardio_train_pred.csv',
        input_filepath='cardio_train.csv',
    )

Starting full pipeline...

Data loaded successfully: 70000 rows, 13 columns
Data preprocessed: Train size = 56000, Test size = 14000
Model trained successfully!

MODEL PERFORMANCE METRICS
Accuracy:  0.7248
Precision: 0.7394
Recall:    0.6938
F1-Score:  0.7159

Confusion Matrix:
[[5293 1711]
 [2142 4854]]

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.71      0.76      0.73      7004
     Disease       0.74      0.69      0.72      6996

    accuracy                           0.72     14000
   macro avg       0.73      0.72      0.72     14000
weighted avg       0.73      0.72      0.72     14000


Predictions exported to: cardio_train_pred.csv
Pipeline completed successfully!


# KNN

In [14]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioKNeighborsClassifier:
    """
    A reusable K-Nearest Neighbors classifier for cardiovascular disease prediction.

    Features:
    - Train/test split with customizable ratio
    - Optional feature scaling
    - Comprehensive performance metrics
    - Export predictions to CSV

    Typical call order is ``load_data`` -> ``preprocess_data`` -> ``train`` ->
    ``evaluate`` -> ``export_predictions``; ``run_full_pipeline`` performs all
    five steps in one call.
    """

    def __init__(self, test_size=0.2, random_state=42, n_neighbors=5,
                 weights='uniform', metric='euclidean', scale_features=True):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        n_neighbors : int, default=5
            Number of neighbors to use by default for kneighbors queries
        weights : str, default='uniform'
            Weight function used in prediction
        metric : str, default='euclidean'
            Distance metric to use for the tree
        scale_features : bool, default=True
            Whether to scale features
        """
        self.test_size = test_size
        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.scale_features = scale_features

        # Must exist before load_data() so that the "No data loaded" guard
        # in preprocess_data() raises ValueError instead of AttributeError.
        self.data = None
        # Remembered by preprocess_data() so export_predictions() handles
        # the same target column.
        self.target_col = 'cardio'

        self.model = None
        self.scaler = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_names = None
        self.metrics = {}
        self.drop_cols = None

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self, target_col='cardio', drop_cols=None):
        """
        Preprocess the data for training.

        Splits the loaded data into stratified train/test sets and, when
        ``scale_features`` is True, standardises both using statistics
        fitted on the training split only.

        Parameters:
        -----------
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)

        Raises:
        -------
        ValueError : If no data has been loaded yet
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[target_col])
        y = df[target_col]

        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            # Fit on the training split only so the test split does not
            # leak into the scaling statistics.
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the K-Nearest Neighbors classifier.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        self.model = KNeighborsClassifier(
            n_neighbors=self.n_neighbors,
            weights=self.weights,
            metric=self.metric
        )

        self.model.fit(self.X_train, self.y_train)
        print("Model trained successfully!")

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame, default=None
            Raw (unscaled) feature data to predict. If None, uses the
            already-preprocessed test set.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            features = self.X_test
        elif self.scale_features and self.scaler is not None:
            # Caller-supplied data is raw, so apply the fitted scaler once.
            features = pd.DataFrame(
                self.scaler.transform(X),
                columns=X.columns,
                index=X.index
            )
        else:
            features = X

        return self.model.predict(features)

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                   target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions to a new CSV file.

        Predictions are produced for every loaded row, i.e. for rows used
        in training as well as the held-out test rows.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target and predicted columns.

        Returns:
        --------
        pd.DataFrame : The frame that was written to disk

        Raises:
        -------
        ValueError : If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        X_full = self.data.copy()

        if self.drop_cols:
            X_full = X_full.drop(columns=self.drop_cols, errors='ignore')

        # Drop the target chosen in preprocess_data(), not a hard-coded name.
        X_full = X_full.drop(columns=[self.target_col], errors='ignore')

        # predict() applies the fitted scaler (if any) to raw input itself.
        predictions = self.predict(X_full)

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame()
            output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath,
                         target_col='cardio', drop_cols=None):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        target_col : str, default='cardio'
            Name of target variable
        drop_cols : list, default=None
            Columns to drop before training

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data(target_col=target_col, drop_cols=drop_cols)
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath)

        print("Pipeline completed successfully!")
        return metrics


# Example usage
if __name__ == "__main__":
    # Configure the KNN wrapper; arguments are keyword-only here, so their
    # written order is irrelevant.
    clf = CardioKNeighborsClassifier(
        n_neighbors=3, weights='uniform', metric='euclidean',
        scale_features=True,
        test_size=0.2, random_state=42,
    )

    # Load, preprocess, train, evaluate and export in a single call; the
    # 'id' column is dropped before the features are built.
    metrics = clf.run_full_pipeline(
        drop_cols=['id'],
        input_filepath='cardio_train.csv',
        output_filepath='cardio_train_predknn.csv',
    )

Starting full pipeline...

Data loaded successfully: 70000 rows, 13 columns
Data preprocessed: Train size = 56000, Test size = 14000
Model trained successfully!

MODEL PERFORMANCE METRICS
Accuracy:  0.6325
Precision: 0.6363
Recall:    0.6176
F1-Score:  0.6268

Confusion Matrix:
[[4534 2470]
 [2675 4321]]

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.63      0.65      0.64      7004
     Disease       0.64      0.62      0.63      6996

    accuracy                           0.63     14000
   macro avg       0.63      0.63      0.63     14000
weighted avg       0.63      0.63      0.63     14000


Predictions exported to: cardio_train_predknn.csv
Pipeline completed successfully!


# DL Algorithms


In [18]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


class CardioMLPClassifier:
    """
    A reusable Multi-Layer Perceptron (MLP) deep learning classifier for cardiovascular disease prediction using PyTorch.

    Features:
    - Train/test split with customizable ratio
    - Optional feature scaling
    - Comprehensive performance metrics
    - Export predictions to CSV

    Typical call order is ``load_data`` -> ``preprocess_data`` -> ``train`` ->
    ``evaluate`` -> ``export_predictions``; ``run_full_pipeline`` performs all
    five steps in one call.
    """

    def __init__(self, test_size=0.2, random_state=42, hidden_sizes=None,
                 lr=0.001, epochs=100, batch_size=32, scale_features=True):
        """
        Initialize the classifier.

        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility (used for the train/test split)
        hidden_sizes : list, default=None
            List of hidden layer sizes. None means [64, 32].
        lr : float, default=0.001
            Learning rate for the optimizer
        epochs : int, default=100
            Number of training epochs
        batch_size : int, default=32
            Batch size for training
        scale_features : bool, default=True
            Whether to scale features
        """
        self.test_size = test_size
        self.random_state = random_state
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutable) across every instance that uses the default.
        self.hidden_sizes = [64, 32] if hidden_sizes is None else list(hidden_sizes)
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.scale_features = scale_features

        # Must exist before load_data() so that the "No data loaded" guard
        # in preprocess_data() raises ValueError instead of AttributeError.
        self.data = None
        # Remembered by preprocess_data() so export_predictions() handles
        # the same target column.
        self.target_col = 'cardio'

        self.model = None
        self.scaler = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_train_tensor = None
        self.y_train_tensor = None
        self.feature_names = None
        self.metrics = {}
        self.drop_cols = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def load_data(self, filepath):
        """
        Load data from CSV file.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file

        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data

    def preprocess_data(self, target_col='cardio', drop_cols=None):
        """
        Preprocess the data for training.

        Splits the loaded data into stratified train/test sets and, when
        ``scale_features`` is True, standardises both using statistics
        fitted on the training split only.

        Parameters:
        -----------
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)

        Raises:
        -------
        ValueError : If no data has been loaded yet
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        self.target_col = target_col
        self.drop_cols = drop_cols if drop_cols else []

        df = self.data.copy()
        if self.drop_cols:
            df = df.drop(columns=self.drop_cols, errors='ignore')

        X = df.drop(columns=[target_col])
        y = df[target_col]

        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )

        if self.scale_features:
            # Fit on the training split only so the test split does not
            # leak into the scaling statistics.
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )

        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")

    def train(self):
        """
        Train the MLP classifier.

        Builds a stack of Linear+ReLU layers sized by ``hidden_sizes``
        followed by a 2-unit output layer, and optimises it with Adam on
        cross-entropy loss for ``epochs`` passes over the training data.

        Raises:
        -------
        ValueError : If preprocess_data() has not been called
        """
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        # Convert to tensors; explicit dtypes make this independent of the
        # dtype pandas happened to infer for the columns.
        X_train_tensor = torch.tensor(
            np.asarray(self.X_train.values), dtype=torch.float32
        ).to(self.device)
        y_train_tensor = torch.tensor(
            np.asarray(self.y_train.values), dtype=torch.long
        ).to(self.device)

        self.X_train_tensor = X_train_tensor
        self.y_train_tensor = y_train_tensor

        # Define model
        layers = []
        prev_size = len(self.feature_names)
        for size in self.hidden_sizes:
            layers.append(nn.Linear(prev_size, size))
            layers.append(nn.ReLU())
            prev_size = size
        layers.append(nn.Linear(prev_size, 2))  # Binary classification

        self.model = nn.Sequential(*layers).to(self.device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # DataLoader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            running_loss = 0.0
            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = self.model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            # Progress is reported every 50th epoch only.
            if (epoch + 1) % 50 == 0:
                avg_loss = running_loss / len(train_loader)
                print(f'Epoch [{epoch+1}/{self.epochs}], Loss: {avg_loss:.4f}')

        print("Model trained successfully!")

    def predict(self, X=None):
        """
        Make predictions on data.

        Parameters:
        -----------
        X : pd.DataFrame, default=None
            Raw (unscaled) feature data to predict. If None, uses the
            already-preprocessed test set. When scaling is enabled the
            fitted scaler is applied here, so callers must not pre-scale.

        Returns:
        --------
        np.ndarray : Predictions

        Raises:
        -------
        ValueError : If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if X is None:
            features = self.X_test.values
        elif self.scale_features and self.scaler is not None:
            features = self.scaler.transform(X)
        else:
            features = X.values

        self.model.eval()
        with torch.no_grad():
            input_tensor = torch.tensor(
                np.asarray(features), dtype=torch.float32
            ).to(self.device)
            outputs = self.model(input_tensor)
            # Index of the larger of the two logits is the predicted class.
            predicted = outputs.argmax(dim=1)
            return predicted.cpu().numpy()

    def evaluate(self):
        """
        Evaluate model performance on test set.

        Returns:
        --------
        dict : Dictionary containing all performance metrics

        Raises:
        -------
        ValueError : If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        y_pred = self.predict()

        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }

        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred,
                                   target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")

        return self.metrics

    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Export predictions to a new CSV file.

        Predictions are produced for every loaded row, i.e. for rows used
        in training as well as the held-out test rows.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target and predicted columns.

        Returns:
        --------
        pd.DataFrame : The frame that was written to disk

        Raises:
        -------
        ValueError : If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        X_full = self.data.copy()

        if self.drop_cols:
            X_full = X_full.drop(columns=self.drop_cols, errors='ignore')

        # Drop the target chosen in preprocess_data(), not a hard-coded name.
        X_full = X_full.drop(columns=[self.target_col], errors='ignore')

        # Hand predict() the RAW features: it applies the fitted scaler
        # itself. Scaling here as well would standardise the inputs twice
        # and silently corrupt every exported prediction.
        predictions = self.predict(X_full)

        if include_all_data:
            output_df = self.data.copy()
        else:
            output_df = pd.DataFrame()
            output_df[self.target_col] = self.data[self.target_col]

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df

    def run_full_pipeline(self, input_filepath, output_filepath,
                         target_col='cardio', drop_cols=None):
        """
        Run the complete pipeline: load, preprocess, train, evaluate, and export.

        Parameters:
        -----------
        input_filepath : str
            Path to input CSV file
        output_filepath : str
            Path for output CSV file with predictions
        target_col : str, default='cardio'
            Name of target variable
        drop_cols : list, default=None
            Columns to drop before training

        Returns:
        --------
        dict : Performance metrics
        """
        print("Starting full pipeline...\n")

        self.load_data(input_filepath)
        self.preprocess_data(target_col=target_col, drop_cols=drop_cols)
        self.train()
        metrics = self.evaluate()
        self.export_predictions(output_filepath)

        print("Pipeline completed successfully!")
        return metrics


class CardioTabularCNNClassifier:
    """
    A reusable 1D Convolutional Neural Network (CNN) classifier for tabular cardiovascular disease prediction using PyTorch.
    Treats features as a 1D sequence.
    
    Features:
    - Train/test split with customizable ratio
    - Optional feature scaling
    - Comprehensive performance metrics
    - Export predictions to CSV
    """
    
    def __init__(self, test_size=0.2, random_state=42, filters=[32, 64], 
                 kernel_sizes=[3, 3], lr=0.001, epochs=100, batch_size=32, 
                 scale_features=True):
        """
        Initialize the classifier.
        
        Parameters:
        -----------
        test_size : float, default=0.2
            Proportion of dataset to include in test split
        random_state : int, default=42
            Random state for reproducibility
        filters : list, default=[32, 64]
            List of number of filters for each conv layer
        kernel_sizes : list, default=[3, 3]
            List of kernel sizes for each conv layer
        lr : float, default=0.001
            Learning rate for the optimizer
        epochs : int, default=100
            Number of training epochs
        batch_size : int, default=32
            Batch size for training
        scale_features : bool, default=True
            Whether to scale features
        """

        self.test_size = test_size
        self.random_state = random_state
        self.filters = filters
        self.kernel_sizes = kernel_sizes
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.scale_features = scale_features
        
        self.model = None
        self.scaler = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_train_tensor = None
        self.y_train_tensor = None
        self.feature_names = None
        self.metrics = {}
        self.drop_cols = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
    def load_data(self, filepath):
        """
        Load data from CSV file.
        
        Parameters:
        -----------
        filepath : str
            Path to the CSV file
            
        Returns:
        --------
        pd.DataFrame : Loaded dataframe
        """
        self.data = pd.read_csv(filepath)
        print(f"Data loaded successfully: {self.data.shape[0]} rows, {self.data.shape[1]} columns")
        return self.data
    
    def preprocess_data(self, target_col='cardio', drop_cols=None):
        """
        Preprocess the data for training.
        
        Parameters:
        -----------
        target_col : str, default='cardio'
            Name of the target variable column
        drop_cols : list, default=None
            List of columns to drop (e.g., ID columns)
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")
        
        self.drop_cols = drop_cols if drop_cols else []
        
        df = self.data.copy()
        if drop_cols:
            df = df.drop(columns=drop_cols, errors='ignore')
        
        X = df.drop(columns=[target_col])
        y = df[target_col]
        
        self.feature_names = X.columns.tolist()
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )
        
        if self.scale_features:
            self.scaler = StandardScaler()
            self.X_train = pd.DataFrame(
                self.scaler.fit_transform(self.X_train),
                columns=self.feature_names,
                index=self.X_train.index
            )
            self.X_test = pd.DataFrame(
                self.scaler.transform(self.X_test),
                columns=self.feature_names,
                index=self.X_test.index
            )
        
        print(f"Data preprocessed: Train size = {len(self.X_train)}, Test size = {len(self.X_test)}")
    
    def train(self):
        """Train the Tabular CNN classifier."""
        if self.X_train is None:
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")
        
        # Convert to tensors with channel dimension for Conv1D
        X_train_tensor = torch.FloatTensor(self.X_train.values).unsqueeze(1).to(self.device)  # (N, 1, features)
        y_train_tensor = torch.LongTensor(self.y_train.values).to(self.device)
        
        self.X_train_tensor = X_train_tensor
        self.y_train_tensor = y_train_tensor
        
        # Define model
        input_size = len(self.feature_names)
        conv_layers = []
        prev_channels = 1
        prev_size = input_size
        for f, k in zip(self.filters, self.kernel_sizes):
            conv_layers.append(nn.Conv1d(prev_channels, f, kernel_size=k, padding=(k-1)//2))
            conv_layers.append(nn.ReLU())
            conv_layers.append(nn.MaxPool1d(2))
            prev_channels = f
            prev_size = prev_size // 2  # Approximate after pooling
        
        # Flatten size calculation (approximate)
        flatten_size = prev_channels * (input_size // 4)  # Assuming two poolings halve twice
        
        self.model = nn.Sequential(
            *conv_layers,
            nn.Flatten(),
            nn.Linear(flatten_size, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        ).to(self.device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        
        # DataLoader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        
        self.model.train()
        for epoch in range(self.epochs):
            running_loss = 0.0
            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = self.model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            
            if (epoch + 1) % 50 == 0:
                avg_loss = running_loss / len(train_loader)
                print(f'Epoch [{epoch+1}/{self.epochs}], Loss: {avg_loss:.4f}')
        
        print("Model trained successfully!")
    
    def predict(self, X=None):
        """
        Make predictions on data.
        
        Parameters:
        -----------
        X : pd.DataFrame, default=None
            Data to predict. If None, uses test set.
            
        Returns:
        --------
        np.ndarray : Predictions
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        
        self.model.eval()
        with torch.no_grad():
            if X is None:
                input_tensor = torch.FloatTensor(self.X_test.values).unsqueeze(1).to(self.device)
            else:
                if self.scale_features and self.scaler is not None:
                    X_scaled = pd.DataFrame(
                        self.scaler.transform(X),
                        columns=X.columns,
                        index=X.index
                    )
                    input_tensor = torch.FloatTensor(X_scaled.values).unsqueeze(1).to(self.device)
                else:
                    input_tensor = torch.FloatTensor(X.values).unsqueeze(1).to(self.device)
            
            outputs = self.model(input_tensor)
            _, predicted = torch.max(outputs.data, 1)
            return predicted.cpu().numpy()
    
    def evaluate(self):
        """
        Evaluate model performance on test set.
        
        Returns:
        --------
        dict : Dictionary containing all performance metrics
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")
        
        y_pred = self.predict()
        
        self.metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, average='binary'),
            'recall': recall_score(self.y_test, y_pred, average='binary'),
            'f1_score': f1_score(self.y_test, y_pred, average='binary'),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }
        
        print("\n" + "="*50)
        print("MODEL PERFORMANCE METRICS")
        print("="*50)
        print(f"Accuracy:  {self.metrics['accuracy']:.4f}")
        print(f"Precision: {self.metrics['precision']:.4f}")
        print(f"Recall:    {self.metrics['recall']:.4f}")
        print(f"F1-Score:  {self.metrics['f1_score']:.4f}")
        print("\nConfusion Matrix:")
        print(self.metrics['confusion_matrix'])
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred, 
                                   target_names=['No Disease', 'Disease']))
        print("="*50 + "\n")
        
        return self.metrics
    
    def export_predictions(self, output_filepath, include_all_data=True):
        """
        Predict on every row of the loaded data and write the result to CSV.

        The feature matrix is rebuilt from ``self.data`` by removing
        ``self.drop_cols`` (if any) and the target column. When scaling is
        enabled and a scaler is present, the features are transformed with
        ``self.scaler`` before being handed to ``self.predict``.

        Parameters:
        -----------
        output_filepath : str
            Path for the output CSV file
        include_all_data : bool, default=True
            If True, includes all original columns. If False, only includes
            the target and predicted columns.

        Returns:
        --------
        pandas.DataFrame : The exported frame, including a 'predicted' column

        Raises:
        -------
        ValueError : If ``self.model`` is None (model not trained)
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        # Honour a configured target column instead of hard-coding 'cardio'.
        # NOTE(review): assumes the instance stores it as `self.target_col`;
        # when that attribute is missing or None the historical 'cardio'
        # default is used, so existing behaviour is unchanged.
        target_col = getattr(self, 'target_col', None) or 'cardio'

        # DataFrame.drop returns a new frame, so self.data is never mutated
        # and no defensive copy is required beforehand.
        X_full = self.data
        if self.drop_cols:
            X_full = X_full.drop(columns=self.drop_cols, errors='ignore')
        X_full = X_full.drop(columns=[target_col], errors='ignore')

        if self.scale_features and self.scaler is not None:
            # Re-wrap the transformed values so predict() still receives a
            # DataFrame with the feature names and the original row index.
            X_full = pd.DataFrame(
                self.scaler.transform(X_full),
                columns=self.feature_names,
                index=X_full.index
            )

        predictions = self.predict(X_full)

        if include_all_data:
            output_df = self.data.copy()
        else:
            # Only the ground-truth column, keeping the original row index.
            output_df = self.data[[target_col]].copy()

        output_df['predicted'] = predictions

        output_df.to_csv(output_filepath, index=False)
        print(f"Predictions exported to: {output_filepath}")

        return output_df
    
    def run_full_pipeline(self, input_filepath, output_filepath, 
                         target_col='cardio', drop_cols=None):
        """
        Execute every stage in order: load -> preprocess -> train ->
        evaluate -> export.

        Parameters:
        -----------
        input_filepath : str
            CSV file to read the data from
        output_filepath : str
            CSV file the predictions are written to
        target_col : str, default='cardio'
            Target variable name, forwarded to preprocess_data
        drop_cols : list, default=None
            Columns to drop, forwarded to preprocess_data

        Returns:
        --------
        dict : Whatever evaluate() returned (the performance metrics)
        """
        print("Starting full pipeline...\n")

        # Data stages
        self.load_data(input_filepath)
        self.preprocess_data(target_col=target_col, drop_cols=drop_cols)

        # Model stages
        self.train()
        performance = self.evaluate()

        # Persist predictions; the frame returned by the export is not needed here
        self.export_predictions(output_filepath)

        print("Pipeline completed successfully!")
        return performance


# Example usage: train and export predictions with both neural classifiers
if __name__ == "__main__":
    # Hyper-parameters shared by the two demo runs
    common_settings = dict(
        test_size=0.2,
        random_state=42,
        lr=0.001,
        epochs=100,
        batch_size=32,
        scale_features=True,
    )
    training_csv = 'cardio_train.csv'

    # MLP Example
    print("Running MLP Pipeline...")
    mlp_clf = CardioMLPClassifier(hidden_sizes=[64, 32], **common_settings)
    mlp_metrics = mlp_clf.run_full_pipeline(
        input_filepath=training_csv,
        output_filepath='cardio_train_mlp_pred.csv',
        drop_cols=['id'],
    )

    # CNN Example
    print("\nRunning Tabular CNN Pipeline...")
    cnn_clf = CardioTabularCNNClassifier(
        filters=[32, 64],
        kernel_sizes=[3, 3],
        **common_settings,
    )
    cnn_metrics = cnn_clf.run_full_pipeline(
        input_filepath=training_csv,
        output_filepath='cardio_train_cnn_pred.csv',
        drop_cols=['id'],
    )

Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement torch (from versions: none)
ERROR: No matching distribution found for torch


ModuleNotFoundError: No module named 'torch'