In [1]:
import pandas as pd # type: ignore
import numpy as np  # type: ignore
from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.preprocessing import StandardScaler, LabelEncoder  # type: ignore
from sklearn.ensemble import RandomForestClassifier  # type: ignore
from sklearn.linear_model import LogisticRegression  # type: ignore
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score  # type: ignore

# Load the dataset
df = pd.read_csv("heart.csv")

# Encode categorical variables
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Split features and target
X = df.drop(columns=["HeartDisease"])  # Features
y = df["HeartDisease"]  # Target

# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Train Logistic Regression Model
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Model Evaluation
log_pred = log_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

print(f"Logistic Regression Accuracy: {accuracy_score(y_test, log_pred):.2f}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred):.2f}")
print("Logistic Regression Report:\n", classification_report(y_test, log_pred))
print("Random Forest Report:\n", classification_report(y_test, rf_pred))
print(f"Logistic Regression AUC: {roc_auc_score(y_test, log_pred):.2f}")
print(f"Random Forest AUC: {roc_auc_score(y_test, rf_pred):.2f}")

# Predict on New Data
new_data = np.array([[60, 1, 2, 130, 200, 1, 1, 150, 1, 2.5, 1]])  # Example values
new_data_scaled = scaler.transform(new_data)
prediction = rf_model.predict(new_data_scaled)
print("Heart Disease Prediction (1=Risk, 0=No Risk):", prediction[0])





Logistic Regression Accuracy: 0.87
Random Forest Accuracy: 0.88
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.90      0.79      0.84        82
           1       0.85      0.93      0.89       102

    accuracy                           0.87       184
   macro avg       0.88      0.86      0.87       184
weighted avg       0.87      0.87      0.87       184

Random Forest Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86        82
           1       0.88      0.89      0.89       102

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.88      0.87       184

Logistic Regression AUC: 0.86
Random Forest AUC: 0.87
Heart Disease Prediction (1=Risk, 0=No Risk): 1




In [2]:
import joblib

# Save the Random Forest model
joblib.dump(rf_model, 'rf_model.pkl')

# Save the Label Encoder
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']

In [3]:
# New data for prediction
new_data = pd.DataFrame({
    'Age': [40],
    'Sex': ['F'],
    'ChestPainType': ['ATA'],
    'RestingBP': [140],
    'Cholesterol': [289],
    'FastingBS': [0],
    'RestingECG': ['Normal'],
    'MaxHR': [172],
    'ExerciseAngina': ['N'],
    'Oldpeak': [0],
    'ST_Slope': ['Up']
})

# Encode categorical variables in the new data
for col in categorical_cols:
    if col in new_data.columns:
        new_data[col] = new_data[col].map(lambda s: 'Unknown' if s not in le.classes_ else s)
        le.classes_ = np.append(le.classes_, 'Unknown')
        new_data[col] = le.transform(new_data[col])

# Convert to numeric type
new_data_encoded = new_data.astype(float)

# Standardize the new data
new_data_scaled = scaler.transform(new_data_encoded)

# Predict using the Random Forest model
prediction = rf_model.predict(new_data_scaled)
print("Heart Disease Prediction (1=Risk, 0=No Risk):", prediction[0])

Heart Disease Prediction (1=Risk, 0=No Risk): 0
