In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Remote location of the CSV file that is loaded below.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

# The file carries no header row, so the column labels are supplied by hand.
# The last entry ('target') is the label column; the rest are used as features.
column_names = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'target',
]

# Read every line as data (header=None) and label the columns with the names above.
df = pd.read_csv(url, header=None, names=column_names)

# Quick sanity check: show the first five rows.
print(df.head(5))


In [4]:
# Missing values are encoded as the literal string '?'; turn them into NaN.
# Rebinding `df` (rather than mutating with inplace=True) leaves the frame
# loaded above untouched.
df = df.replace('?', np.nan)

# 'ca' and 'thal' are the columns converted from text to numbers and then
# imputed with their own median.
# The result is assigned back to df[col]: calling
# df[col].fillna(..., inplace=True) operates on a temporary selection, which
# raises a FutureWarning in pandas 2.x and is a silent no-op under
# Copy-on-Write, leaving the NaNs in place.
for col in ['ca', 'thal']:
    df[col] = pd.to_numeric(df[col])
    df[col] = df[col].fillna(df[col].median())

# Fail loudly here instead of inside model fitting if imputation did not work
# (for example, a column that is entirely missing has a NaN median).
assert not df[['ca', 'thal']].isna().any().any(), "missing values remain in 'ca'/'thal'"


In [5]:
# Predictors: every column except the label.
X = df.drop(columns='target')

# Label: collapse the raw 'target' values to a binary flag with a vectorised
# comparison. Any value greater than 0 becomes 1 (presence of heart disease),
# everything else becomes 0 (absence).
y = (df['target'] > 0).astype('int64')


In [6]:
# Learn the per-column scaling statistics from the feature matrix.
# NOTE(review): the statistics are computed from every row of X, before any
# train/test split has been made.
scaler = StandardScaler().fit(X)

# Apply the learned scaling to the same rows.
X_scaled = scaler.transform(X)


In [7]:
# Split the raw (unscaled) features first. The same test_size and random_state
# are used, so the rows assigned to each side are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Fitting on the full dataset would let information from the
# held-out rows leak into preprocessing and bias the evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [8]:
# Baseline classifier: a random forest of 100 trees with a fixed seed so that
# repeated runs build the same forest.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_settings)

# Fit on the training split. Left as the last expression so the notebook
# still displays the fitted estimator.
model.fit(X_train, y_train)


In [9]:
# Class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Compute the three evaluation summaries up front...
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# ...then report them: overall accuracy, the confusion matrix, and the
# per-class precision/recall/F1 table.
print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{cm}')
print(f'Classification Report:\n{report}')


Accuracy: 0.87
Confusion Matrix:
[[26  3]
 [ 5 27]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        29
           1       0.90      0.84      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [10]:
# Importance scores exposed by the fitted forest, one per input column.
importances = model.feature_importances_

# Feature labels are every column name except the trailing 'target'.
# This relies on column_names listing the features in the same order as the
# columns the model was trained on.
feature_names = column_names[:-1]

# Pair each label with its score, then rank from most to least important.
unsorted_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances,
})
feature_importance_df = unsorted_importance_df.sort_values(
    by='Importance', ascending=False
)

print(feature_importance_df)


     Feature  Importance
2         cp    0.127280
7    thalach    0.116200
9    oldpeak    0.113342
11        ca    0.112550
0        age    0.105167
12      thal    0.095484
4       chol    0.079107
3   trestbps    0.074830
10     slope    0.055162
8      exang    0.051821
1        sex    0.035816
6    restecg    0.021773
5        fbs    0.011468


In [18]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 3 forest sizes x 4 depth limits = 12 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Forest that the search configures for each candidate.
# NOTE(review): this seed (32) differs from the 42 used for the baseline
# forest and the train/test split; confirm that is intentional.
base_forest = RandomForestClassifier(random_state=32)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split; fit() returns the fitted search object.
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='accuracy',
    cv=5
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_:.2f}')


Best Parameters: {'max_depth': None, 'n_estimators': 200}
Best Score: 0.82


In [19]:
import joblib

# Persist the fitted scaler next to the model. The forest was trained on
# scaler-transformed features, so anyone loading the model also needs the
# same scaler to prepare raw inputs before calling predict().
joblib.dump(scaler, 'heart_disease_scaler.pkl')

# Save the model. This stays the last expression so the cell output (the list
# holding the model file name) is unchanged.
# NOTE(review): this persists the baseline `model`, not the estimator found by
# the grid search; confirm that is the intended artefact.
joblib.dump(model, 'heart_disease_prediction_model.pkl')


['heart_disease_prediction_model.pkl']