In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Load the data
data = pd.read_csv('hearts.csv')




In [4]:
# Display the first few rows to ensure the data is loaded correctly
print(data.head())





   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  risk_factor_count  
0   2     3       0                  0  
1   0     3       0                  2  
2   0     3       0                  1  
3   1     3       0                  0  
4   3     2       0                  1  


In [5]:
# Generate new features
# Age Grouping
data['age_group'] = pd.cut(data['age'], bins=[0, 40, 50, 60, 70, 100], labels=['<40', '40-50', '50-60', '60-70', '>70'])

# Calculate risk factor count from 'fbs' and 'exang'
data['risk_factor_count'] = data[['fbs', 'exang']].sum(axis=1)

# Confirm the new features are created
print(data[['age', 'age_group', 'fbs', 'exang', 'risk_factor_count']].head())


   age age_group  fbs  exang  risk_factor_count
0   52     50-60    0      0                  0
1   53     50-60    1      1                  2
2   70     60-70    0      1                  1
3   61     60-70    0      0                  0
4   62     60-70    1      0                  1


In [6]:
# Convert categorical features to numerical using one-hot encoding
data = pd.get_dummies(data, columns=['age_group', 'cp', 'restecg', 'slope', 'thal'], drop_first=True)

# Prepare the data for modeling
X = data.drop(['target'], axis=1)
y = data['target']


In [7]:
# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for dimensionality reduction (optional step for visualizing importance of features)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)


In [8]:
# Feature importance using Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance)


              Feature  Importance
8                  ca    0.117200
5             thalach    0.106101
7             oldpeak    0.098033
22             thal_2    0.077613
0                 age    0.077420
23             thal_3    0.074365
3                chol    0.067359
2            trestbps    0.067018
6               exang    0.057745
20            slope_2    0.035743
15               cp_2    0.034819
1                 sex    0.030488
9   risk_factor_count    0.028622
19            slope_1    0.027480
17          restecg_1    0.019487
16               cp_3    0.016033
14               cp_1    0.015083
11    age_group_50-60    0.011823
12    age_group_60-70    0.011593
10    age_group_40-50    0.009431
4                 fbs    0.006405
21             thal_1    0.006143
13      age_group_>70    0.003052
18          restecg_2    0.000943


In [9]:
# Selecting important features (based on an arbitrary threshold)
important_features = feature_importance[feature_importance['Importance'] > 0.05]['Feature']
X_important = data[important_features]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_important, y, test_size=0.3, random_state=42)



In [10]:
# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.9902597402597403
