In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pandas as pd


class DecisionTreeClassifierModel:
    """Split, optionally oversample, train, and evaluate a decision tree classifier.

    The steps are exposed as separate methods (``split_data``, ``apply_smote``,
    ``train_model``, ``predict``, ``evaluate_model``, ``plot_roc_curve``) and
    chained in that order by ``run_pipeline``. Each step stores its results on
    the instance so later steps can read them.

    The second column of ``predict_proba`` is used as the positive-class score,
    so a two-class target is assumed.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target_column: str = "TenYearCHD",
        test_size: float = 0.2,
        random_state: int = 42,
        max_depth: int = 5,
        use_smote: bool = True,
    ) -> None:
        """
        Initializes the DecisionTreeClassifierModel.

        Args:
            df: Data containing the feature columns and the target column.
            target_column: Name of the column in ``df`` to predict.
            test_size: Passed to ``train_test_split`` as ``test_size``.
            random_state: Seed forwarded to the split, SMOTE, and the tree.
            max_depth: Maximum depth of the decision tree.
            use_smote: When True, ``apply_smote`` resamples the training data.
        """
        self.df = df
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state
        self.max_depth = max_depth
        self.use_smote = use_smote
        # Everything below is populated by the pipeline steps.
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.y_pred = None
        self.y_proba = None
        self.evaluation_metrics = None
        self.roc_curve_data = None

    def _smote_label(self) -> str:
        """Return the wording used in printed headers and plot titles."""
        return "with SMOTE" if self.use_smote else "without SMOTE"

    @staticmethod
    def _is_unset(value) -> bool:
        """Return True when an optional names argument was effectively not given.

        ``None``, ``False`` and empty sequences count as unset. ``len`` is used
        instead of truthiness because NumPy arrays and pandas Index objects
        raise on ``bool(...)``. ``True`` is kept as a real value because
        ``plot_tree`` accepts ``class_names=True``.
        """
        if value is None or value is False:
            return True
        if value is True:
            return False
        return len(value) == 0

    def split_data(self) -> None:
        """Splits the data into stratified training and testing sets."""
        X = self.df.drop(columns=self.target_column)
        y = self.df[self.target_column]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
        )

    def apply_smote(self) -> None:
        """Applies SMOTE to the training data when ``use_smote`` is True.

        Raises:
            ValueError: If SMOTE is enabled but the data has not been split yet.
        """
        if not self.use_smote:
            print("SMOTE not applied.")
            return
        if self.X_train is None or self.y_train is None:
            raise ValueError("Training data is not available. Call split_data() first.")
        smote = SMOTE(random_state=self.random_state)
        # Only the training split is resampled; the test split is left as-is.
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)
        print("SMOTE applied to the training data.")

    def train_model(self) -> None:
        """Builds and trains the Decision Tree model.

        Raises:
            ValueError: If the data has not been split yet.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("Training data is not available. Call split_data() first.")
        self.model = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
        self.model.fit(self.X_train, self.y_train)
        print("Decision Tree model trained.")

    def predict(self) -> None:
        """Stores test-set class predictions and positive-class probabilities.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Call train_model() first.")
        self.y_pred = self.model.predict(self.X_test)
        # Column 1 of predict_proba is used as the positive-class score.
        self.y_proba = self.model.predict_proba(self.X_test)[:, 1]

    def evaluate_model(self) -> None:
        """Computes, stores, and prints the evaluation metrics for the test set.

        Raises:
            ValueError: If predictions have not been made yet.
        """
        if self.y_pred is None or self.y_proba is None:
            raise ValueError("Model predictions have not been made yet. Call predict() first.")

        self.evaluation_metrics = {
            "Accuracy": accuracy_score(self.y_test, self.y_pred),
            "Precision": precision_score(self.y_test, self.y_pred),
            "Recall": recall_score(self.y_test, self.y_pred),
            "F1 Score": f1_score(self.y_test, self.y_pred),
            "AUC-ROC": roc_auc_score(self.y_test, self.y_proba),
        }

        # The headers reflect whether SMOTE was actually used.
        label = self._smote_label()
        print(f"\nEvaluation Metrics (Decision Tree {label}):")
        for metric, value in self.evaluation_metrics.items():
            print(f"{metric}: {value:.4f}")
        print(f"\nClassification Report (Decision Tree {label}):")
        print(classification_report(self.y_test, self.y_pred))

    def plot_roc_curve(self) -> None:
        """Plots the Receiver Operating Characteristic (ROC) curve.

        Raises:
            ValueError: If probabilities have not been predicted yet.
        """
        if self.y_proba is None:
            raise ValueError("Model probabilities have not been predicted yet. Call predict() first.")

        fpr, tpr, _ = roc_curve(self.y_test, self.y_proba)
        self.roc_curve_data = (fpr, tpr)
        # Computed here so the plot does not depend on evaluate_model() having run.
        auc = roc_auc_score(self.y_test, self.y_proba)

        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}", color='darkorange')
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve - Decision Tree ({self._smote_label()})")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def run_pipeline(self) -> None:
        """Runs the complete pipeline: split data, apply SMOTE (optional), train model, predict, evaluate, and plot ROC curve."""
        self.split_data()
        self.apply_smote()
        self.train_model()
        self.predict()
        self.evaluate_model()
        self.plot_roc_curve()

    def visualize_decision_tree(self, feature_names=None, class_names=None) -> None:
        """Visualizes the trained Decision Tree.

        Args:
            feature_names: Names for the features; defaults to the training
                data's column names.
            class_names: Names for the classes; defaults to
                ``['No CHD', 'CHD']``.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Call train_model() first.")

        if self._is_unset(feature_names):
            feature_names = self.X_train.columns
        if self._is_unset(class_names):
            class_names = ['No CHD', 'CHD']

        plt.figure(figsize=(20, 10))
        plot_tree(
            self.model,
            # A plain list is passed rather than a pandas Index or array.
            feature_names=list(feature_names),
            class_names=class_names,
            filled=True,
            rounded=True,
            fontsize=10
        )
        plt.title("Decision Tree Visualization")
        plt.show()

    def main():
    # Load dataset
    df = pd.read_csv("framingham.csv")

    # Initialize cleaning class with the dataset
    cleaner = FraminghamDataCleaning(df=df)

    # Impute missing values
    df = cleaner.impute_nulls()

    # Remove outliers (interactive prompt inside)
    df = cleaner.review_and_remove_outliers(column_cat=["male", "education", "currentSmoker"], df=df)

    # Scale numeric columns
    df = cleaner.scale_numeric_columns(df=df, columns_cat=["male", "education", "currentSmoker"])

    # Now pass the cleaned `df` to your model
    model = DecisionTreeClassifierModel(
        df=df,
        target_column="TenYearCHD",
        max_depth=5,
        use_smote=True
    )

    model.run_pipeline()
    model.visualize_decision_tree()

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()



: 