In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read the training CSV and discard the 'Unnamed: 0' column when the file
# contains one (errors='ignore' makes the drop a no-op otherwise).
df = pd.read_csv('fhs_train.csv').drop(columns=['Unnamed: 0'], errors='ignore')

# Preview the first rows, then show how many values are missing per column.
print(df.head())
print(df.isnull().sum())

In [None]:
# Impute and scale the predictor columns only. The outcome label is kept out
# of both transformers: median-imputing it would invent labels for rows whose
# outcome is unknown, and standardizing it turns a class label into an
# arbitrary float that has to be re-thresholded later.
# Assumes 'TenYearCHD' is coded 0/1, so a later `> 0` recode of the target
# still yields the same classes -- TODO confirm against the data dictionary.
target_col = 'TenYearCHD'
feature_cols = [col for col in df.columns if col != target_col]

# Rows without a known outcome cannot be used for supervised learning.
df_labeled = df.dropna(subset=[target_col]).reset_index(drop=True)

# Fill missing predictor values with each column's median.
imputer = SimpleImputer(strategy='median')
df_filled = pd.DataFrame(
    imputer.fit_transform(df_labeled[feature_cols]), columns=feature_cols
)
df_filled[target_col] = df_labeled[target_col]
df_filled = df_filled[df.columns]  # restore the original column order

# Standardize the predictors (zero mean, unit variance); target passes through.
# NOTE(review): both transformers are fitted on all rows, i.e. before any
# train/test split, so held-out rows influence the medians/means. Fit them on
# the training rows only if an unbiased test estimate is required.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_filled[feature_cols]), columns=feature_cols
)
df_scaled[target_col] = df_filled[target_col]
df_scaled = df_scaled[df.columns]  # same column layout as the input frame

In [None]:
# Separate the outcome column from the predictor columns.
y = df_scaled['TenYearCHD']
X = df_scaled.drop(columns=['TenYearCHD'])

# Recode the outcome as 0/1: strictly positive values become 1, the rest 0.
y = y.gt(0).astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a seeded decision tree on the training portion.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Draw the fitted tree on an explicit 20x10 inch figure.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree_model,
    filled=True,
    feature_names=X.columns.tolist(),
    class_names=['No CHD', 'Yes CHD'],
    ax=ax,
)
plt.show()

In [None]:
# Order the feature positions by the fitted tree's importance score, highest first.
importances = tree_model.feature_importances_
indices = np.argsort(importances)[::-1]

# One line per feature: rank, column position, importance value, column name.
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. feature {idx} ({importances[idx]}): {X_train.columns[idx]}")