In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# 1. Load and Prepare Data
df = pd.read_csv("heart.csv")

# Features and Target
X = df.drop("target", axis=1)
y = df["target"]

print("Heart Dataset (first 5 rows of features):")
print(X.head())
print("\nTarget Variable (first 5 rows):")
print(y.head())

Heart Dataset (first 5 rows of features):
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  
0   2     3  
1   0     3  
2   0     3  
3   1     3  
4   3     2  

Target Variable (first 5 rows):
0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64


In [5]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("Scaled Heart Data (first 5 rows):")
print(X_scaled_df.head())

Scaled Heart Data (first 5 rows):
        age       sex        cp  trestbps      chol       fbs   restecg  \
0 -0.268437  0.661504 -0.915755 -0.377636 -0.659332 -0.418878  0.891255   
1 -0.158157  0.661504 -0.915755  0.479107 -0.833861  2.387330 -1.004049   
2  1.716595  0.661504 -0.915755  0.764688 -1.396233 -0.418878  0.891255   
3  0.724079  0.661504 -0.915755  0.936037 -0.833861 -0.418878  0.891255   
4  0.834359 -1.511706 -0.915755  0.364875  0.930822  2.387330  0.891255   

    thalach     exang   oldpeak     slope        ca      thal  
0  0.821321 -0.712287 -0.060888  0.995433  1.209221  1.089852  
1  0.255968  1.403928  1.727137 -2.243675 -0.731971  1.089852  
2 -1.048692  1.403928  1.301417 -2.243675 -0.731971  1.089852  
3  0.516900 -0.712287 -0.912329  0.995433  0.238625  1.089852  
4 -1.874977 -0.712287  0.705408 -0.624121  2.179817 -0.522122  


In [6]:
# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Training data shape: (717, 13), (717,)
Testing data shape: (308, 13), (308,)


In [7]:
# 3. Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

print("Random Forest Model Trained Successfully!")

Random Forest Model Trained Successfully!


In [8]:
# 4. Make Predictions
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

print("Predictions on Test Set (first 10):")
predictions_df = pd.DataFrame({
    'Actual_Label': y_test.reset_index(drop=True),
    'Predicted_Label': y_pred
})
print(predictions_df.head(10))

Predictions on Test Set (first 10):
   Actual_Label  Predicted_Label
0             0                0
1             0                0
2             1                1
3             1                1
4             1                1
5             1                1
6             0                0
7             1                1
8             0                0
9             1                1


In [9]:
# 5. Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)
print("\n" + "="*50 + "\n")

# Feature Importance
print("Feature Importances:")
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

Model Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       150
           1       1.00      0.98      0.99       158

    accuracy                           0.99       308
   macro avg       0.99      0.99      0.99       308
weighted avg       0.99      0.99      0.99       308



Feature Importances:
     Feature  Importance
2         cp    0.140129
7    thalach    0.131526
11        ca    0.116020
9    oldpeak    0.105527
12      thal    0.099114
0        age    0.098053
4       chol    0.081334
3   trestbps    0.067787
8      exang    0.061730
10     slope    0.037089
1        sex    0.032196
6    restecg    0.019197
5        fbs    0.010297


In [10]:
# 6. Classify a New Sample
print("Classifying a New Sample with Random Forest:")

# Example sample (manually picked based on average values)
new_sample = pd.DataFrame([[57, 1, 0, 130, 236, 0, 2, 174, 0, 0.0, 1, 1, 2]],
                          columns=X.columns)

print(f"\nNew Sample to classify:\n{new_sample}")

# Scale the new sample
new_sample_scaled = scaler.transform(new_sample)
new_sample_scaled_df = pd.DataFrame(new_sample_scaled, columns=X.columns)

print(f"\nNew Sample Scaled:\n{new_sample_scaled_df}")

# Predict class and probability
new_pred_label = rf_model.predict(new_sample_scaled)[0]
new_pred_proba = rf_model.predict_proba(new_sample_scaled)[0]

label_map = {0: "No Heart Disease", 1: "Heart Disease"}
print(f"\nThe new sample is predicted to be: '{label_map[new_pred_label]}'")
print("\nProbabilities:")
print(f"  No Heart Disease (0): {new_pred_proba[0]:.4f}")
print(f"  Heart Disease    (1): {new_pred_proba[1]:.4f}")
print("\n" + "="*50 + "\n")


Classifying a New Sample with Random Forest:

New Sample to classify:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   57    1   0       130   236    0        2      174      0      0.0      1   

   ca  thal  
0   1     2  

New Sample Scaled:
        age       sex        cp  trestbps      chol       fbs   restecg  \
0  0.282961  0.661504 -0.915755 -0.092055 -0.193921 -0.418878  2.786558   

    thalach     exang   oldpeak     slope        ca      thal  
0  1.082252 -0.712287 -0.912329 -0.624121  0.238625 -0.522122  

The new sample is predicted to be: 'No Heart Disease'

Probabilities:
  No Heart Disease (0): 0.7800
  Heart Disease    (1): 0.2200


