In [52]:
# Import necessary libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [53]:
# Loading the dataset
file_path = os.path.join("..","data", "heart_attack_prediction", "heart.csv")
df = pd.read_csv(file_path)

In [54]:
df.FastingBS.value_counts()

0    704
1    214
Name: FastingBS, dtype: int64

In [55]:
# Splitting the dataset into categorical and numerical columns
cat_columns = ['Sex', 'ChestPainType', 'RestingECG','ExerciseAngina', 'ST_Slope']
num_columns = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]

# Since FastingBS is already a catagorical column with only 0, and 1 instead of normalizing it keep it as it is
cat_num_columns = ["FastingBS"]

In [56]:
# Splitting into data (X) and labels (y)
X = df.drop("HeartDisease", axis = 1)
y = df["HeartDisease"]
y.value_counts()


1    508
0    410
Name: HeartDisease, dtype: int64

The data is a little unbalanced. Hence, I'll use two approaches:
* Split the data randomly without taking into account class imbalance. 
* Split the data while accounting for class imbalance.


## Random Splitting

In [57]:
def data_split(X, y, test_size = 0.2, randomized = True):
    if randomized:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [58]:
X_train, X_test, y_train, y_test = data_split(X, y, test_size = 0.2, randomized = True)
    
# One hotencoding the catagorical variables
cat_data_train = pd.get_dummies(X_train[cat_columns],drop_first=True).reset_index().drop("index", axis = 1)
cat_data_test = pd.get_dummies(X_train[cat_columns],drop_first=True).reset_index().drop("index", axis = 1)

# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
num_data_train = pd.DataFrame(scaler.fit_transform(X_train[num_columns]), columns=num_columns)
num_data_test = pd.DataFrame(scaler.transform(X_test[num_columns]), columns=num_columns)

# Concatinating all data to get the final dataset for Machine learning
train_data = pd.concat([cat_data_train, num_data_train, X_train[cat_num_columns].reset_index().drop("index", axis = 1)], axis = 1)
test_data = pd.concat([cat_data_test, num_data_test, X_test[cat_num_columns].reset_index().drop("index", axis = 1)], axis = 1)

In [59]:
# Splitting into test and train set so that all normalizations can be done on train set alone
X_train, X_test, y_train, y_test = data_split(X, y, test_size = 0.2, randomized = True)

In [60]:
x = pd.Series()
x.empty

  x = pd.Series()


True

In [61]:
def data_preprocess(X_train, y_train, X_test = pd.Series(), y_test = pd.Series()):
    if not X_test.empty: 
        # One hotencoding the catagorical variables
        cat_data_train = pd.get_dummies(X_train[cat_columns],drop_first=True).reset_index().drop("index", axis = 1)
        cat_data_test = pd.get_dummies(X_test[cat_columns],drop_first=True).reset_index().drop("index", axis = 1)

        # Standardize features by removing the mean and scaling to unit variance
        scaler = StandardScaler()
        num_data_train = pd.DataFrame(scaler.fit_transform(X_train[num_columns]), columns=num_columns)
        num_data_test = pd.DataFrame(scaler.transform(X_test[num_columns]), columns=num_columns)

        # Concatinating all data to get the final dataset for Machine learning
        train_data = pd.concat([cat_data_train, num_data_train, X_train[cat_num_columns].reset_index().drop("index", axis = 1)], axis = 1)
        test_data = pd.concat([cat_data_test, num_data_test, X_test[cat_num_columns].reset_index().drop("index", axis = 1)], axis = 1)
        
        # Returning the test and tran data alonw with labels
        return train_data, test_data, y_train.reset_index().drop("index", axis = 1), y_test.reset_index().drop("index", axis = 1)
    else:
        # One hotencoding the catagorical variables
        cat_data_train = pd.get_dummies(X_train[cat_columns],drop_first=True)

        # Standardize features by removing the mean and scaling to unit variance
        scaler = StandardScaler()
        num_data_train = pd.DataFrame(scaler.fit_transform(X_train[num_columns]), columns=num_columns)

        # Concatinating all data to get the final dataset for Machine learning
        train_data = pd.concat([cat_data_train, num_data_train, X_train[cat_num_columns]], axis = 1)
        
        # Returning the test and tran data alonw with labels
        return train_data, y_train

  def data_preprocess(X_train, y_train, X_test = pd.Series(), y_test = pd.Series()):
  def data_preprocess(X_train, y_train, X_test = pd.Series(), y_test = pd.Series()):


In [62]:
train_data, test_data, y_train, y_test = data_preprocess(X_train, y_train, X_test, y_test)

In [70]:
# Starting with a simple decision tree classifier
clf = DecisionTreeClassifier(random_state=0)

In [71]:
clf.fit(train_data, y_train)

Get the dept of the full tree to use for pre pruning using max_depth feature.

In [73]:
max_depth = clf.get_depth()
print(max_depth)

14


In [65]:
y_pred = clf.predict(test_data)

In [66]:
acc = accuracy_score(y_test, y_pred)
acc

0.842391304347826

In [67]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81        77
           1       0.86      0.87      0.87       107

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184



In [68]:
X_processed, y = data_preprocess(X, y)
cross_val_score(clf, X_processed, y, cv=10)

array([0.81521739, 0.83695652, 0.75      , 0.7173913 , 0.82608696,
       0.77173913, 0.75      , 0.65217391, 0.71428571, 0.71428571])

In [None]:
plt.figure(figsize=(10,10))
plot_tree(clf)

## Performing Pruning

* Prepruning by controlling the max depth
* Post pruning using cost complexity pruning

### Prepruning

In [74]:
clfs = []
for depth in range(1, max_depth + 1):
    clf = DecisionTreeClassifier(max_depth= depth, random_state=0)
    clf.fit(train_data, y_train)
    clfs.append(clf)
    

In [None]:
def plot_scores(clfs, hyper_param = [], hyper_param_names = []):
    train_scores = [clf.score(train_data, y_train) for clf in clfs]
    test_scores = [clf.score(test_data, y_test) for clf in clfs]

    fig, ax = plt.subplots(figsize=(10,10))
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas[:-1], train_scores[:-1], marker="o", label="train", drawstyle="steps-post")
    ax.plot(ccp_alphas[:-1], test_scores[:-1], marker="o", label="test", drawstyle="steps-post")
    ax.legend()
    plt.show()


### Post pruning: Cost complexity pruning

In [46]:
clf_pruned = DecisionTreeClassifier(random_state=0)
path = clf_pruned.cost_complexity_pruning_path(train_data, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [48]:
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(train_data, y_train)
    clfs.append(clf)
    print(
        "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
            clfs[-1].tree_.node_count, ccp_alpha
        )
    )

Number of nodes in the last tree is: 231 with ccp_alpha: 0.0
Number of nodes in the last tree is: 223 with ccp_alpha: 0.0006698455949137152
Number of nodes in the last tree is: 211 with ccp_alpha: 0.0008763962585847
Number of nodes in the last tree is: 205 with ccp_alpha: 0.000899272488556757
Number of nodes in the last tree is: 201 with ccp_alpha: 0.0011353315168029065
Number of nodes in the last tree is: 197 with ccp_alpha: 0.0011677695601401326
Number of nodes in the last tree is: 189 with ccp_alpha: 0.0012534059945504086
Number of nodes in the last tree is: 185 with ccp_alpha: 0.0012569951702860113
Number of nodes in the last tree is: 181 with ccp_alpha: 0.001257597987843219
Number of nodes in the last tree is: 177 with ccp_alpha: 0.0012650836901518104
Number of nodes in the last tree is: 173 with ccp_alpha: 0.0012715712988192556
Number of nodes in the last tree is: 169 with ccp_alpha: 0.001298603001600274
Number of nodes in the last tree is: 165 with ccp_alpha: 0.00130563124432334

# Support Vector Machines

In [75]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Create an SVM classifier with a radial basis function (RBF) kernel
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr')

# Train the SVM classifier
svm_classifier.fit(train_data, y_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(test_data)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("\nClassification Report:\n", report)

Accuracy: 0.8532608695652174

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.82      0.82        77
           1       0.87      0.88      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184



  y = column_or_1d(y, warn=True)


In [76]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import numpy as np

def perform_cross_validation(X, y, kernel_type='linear', num_folds=5):
    """
    Perform cross-validation for Support Vector Classification (SVC) with different kernels.

    Parameters:
    - X: Input features.
    - y: Target variable.
    - kernel_type: Type of kernel to be used ('linear', 'poly', 'rbf', 'sigmoid', or 'custom').
    - num_folds: Number of folds for cross-validation.

    Returns:
    - Mean accuracy across folds.
    """

    if kernel_type == 'custom':
        # If using a custom kernel, define your custom kernel function here
        def custom_kernel(x, y):
            # Your custom kernel function
            return np.dot(x, y.T)

        clf = SVC(kernel=custom_kernel)
    else:
        clf = SVC(kernel=kernel_type)

    # Perform cross-validation and get accuracy scores for each fold
    scores = cross_val_score(clf, X, y, cv=num_folds)

    # Return the mean accuracy across folds
    return np.mean(scores)

# Example usage:
# Assuming you have your features (X) and target variable (y) ready

# Example with Linear Kernel
linear_accuracy = perform_cross_validation(X_processed, y, kernel_type='linear')
print(f'Linear Kernel Accuracy: {linear_accuracy:.4f}')

# Example with Polynomial Kernel
poly_accuracy = perform_cross_validation(X_processed, y, kernel_type='poly')
print(f'Polynomial Kernel Accuracy: {poly_accuracy:.4f}')

# Example with RBF Kernel
rbf_accuracy = perform_cross_validation(X_processed, y, kernel_type='rbf')
print(f'RBF Kernel Accuracy: {rbf_accuracy:.4f}')

# Example with Sigmoid Kernel
sigmoid_accuracy = perform_cross_validation(X_processed, y, kernel_type='sigmoid')
print(f'Sigmoid Kernel Accuracy: {sigmoid_accuracy:.4f}')

# Example with Custom Kernel
custom_accuracy = perform_cross_validation(X_processed, y, kernel_type='custom')
print(f'Custom Kernel Accuracy: {custom_accuracy:.4f}')

Linear Kernel Accuracy: 0.8267
Polynomial Kernel Accuracy: 0.8256
RBF Kernel Accuracy: 0.8311
Sigmoid Kernel Accuracy: 0.7527
Custom Kernel Accuracy: 0.8267
