In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump
from sklearn import metrics
import pandas as pd
import time

In [26]:
def evaluate_classification(model, name, X_train, X_test, y_train, y_test):
    """Print accuracy, weighted precision and weighted recall for a fitted model.

    Each split is predicted exactly once and those predictions are reused for
    all three metrics, so ``model.predict`` runs twice in total.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    name : str
        Label inserted into every printed metric line.
    X_train, X_test :
        Feature sets handed to ``model.predict``.
    y_train, y_test :
        True labels compared against the corresponding predictions.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # perf_counter is a monotonic clock intended for measuring short intervals.
    start_time = time.perf_counter()

    scores = {}
    for split, features, labels in (
        ("Training", X_train, y_train),
        ("Test", X_test, y_test),
    ):
        predictions = model.predict(features)  # predict once, score three times
        scores[split] = (
            ("Accuracy", metrics.accuracy_score(labels, predictions)),
            ("Precision", metrics.precision_score(
                labels, predictions, average='weighted', zero_division=1)),
            ("Recall", metrics.recall_score(
                labels, predictions, average='weighted', zero_division=1)),
        )

    elapsed = time.perf_counter() - start_time
    print(f"Evaluation took {elapsed:.2f} seconds")

    print("Training Set Metrics:")
    for metric, value in scores["Training"]:
        print("Training {} {}: {:.2f}%".format(metric, name, value * 100))

    print("\nTest Set Metrics:")
    for metric, value in scores["Test"]:
        print("Test {} {}: {:.2f}%".format(metric, name, value * 100))
    

In [27]:
# Load the dataset. A forward slash works on every OS and avoids the
# invalid "\h" escape sequence that a backslash separator would create.
df = pd.read_csv("data/heart.csv")

In [28]:
    # Target is the 'output' column; every remaining column is a feature
    y = df['output']
    X = df.drop(columns='output')

In [36]:
    # Pipeline stages, applied in order: scale the features, then classify.
    steps = [
        # Step 1: feature scaling
        ('scaler', StandardScaler()),
        # Step 2: random forest classifier (random_state pinned to a fixed seed)
        ('classifier', RandomForestClassifier(
            n_estimators=150,
            min_samples_split=10,
            min_samples_leaf=4,
            random_state=7,
        )),
    ]

In [37]:
# Assemble the (name, estimator) stages into a single Pipeline object
pipeline = Pipeline(steps=steps)

In [38]:
    # Hold out 20% of the rows for testing; random_state is fixed at 7
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=7,
    )

In [39]:
# Display the training features (bare last expression -> rich notebook output)
X_train

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
245,48,1,0,124,274,0,0,166,0,0.5,1,0,3
267,49,1,2,118,149,0,0,126,0,0.8,2,3,2
268,54,1,0,122,286,0,0,116,1,3.2,1,2,2
150,66,1,0,160,228,0,0,138,0,2.3,2,0,1
168,63,1,0,130,254,0,0,147,0,1.4,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,61,1,0,120,260,0,1,140,1,3.6,1,1,3
67,45,0,1,130,234,0,0,175,0,0.6,1,0,2
25,71,0,1,160,302,0,1,162,0,0.4,2,2,2
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2


In [40]:
    # Train every pipeline stage on the training split
    pipeline.fit(X=X_train, y=y_train)


In [41]:
    # Persist the fitted pipeline to disk with joblib
    dump(pipeline, filename='model_pipeline.joblib')

['model_pipeline.joblib']

In [42]:
    # Report train/test metrics for the fitted pipeline
    evaluate_classification(
        model=pipeline,
        name="RandomForest",
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

Evaluation took 0.07 seconds
Training Set Metrics:
Training Accuracy RandomForest: 93.39%
Training Precision RandomForest: 93.54%
Training Recall RandomForest: 93.39%

Test Set Metrics:
Test Accuracy RandomForest: 72.13%
Test Precision RandomForest: 74.12%
Test Recall RandomForest: 72.13%
