# DATA PREPARATION

In [1]:
import pandas as pd
# Load the raw CSV once; `heart_dataset` is kept untouched so later cells can
# always go back to the data exactly as read from disk.
heart_dataset: pd.DataFrame = pd.read_csv("~/git/thesis_thallasemia/heartv1.csv", header=0)

# Build the working frame without mutating the raw one:
#   * 'target' -> 'heart disease diagnosis'  (the former prediction target)
#   * 'thal'   -> 'target(thal)'             (the new prediction target)
#   * 'sex'    -> 'is_male'                  (then encoded as 1 = male, 0 = female)
# Series.map leaves NaN for any value other than 'male' / 'female'.
df: pd.DataFrame = (
    heart_dataset
    .rename(columns={
        'target': 'heart disease diagnosis',
        'thal': 'target(thal)',
        'sex': 'is_male',
    })
    .assign(is_male=lambda frame: frame['is_male'].map({'male': 1, 'female': 0}))
)

# FEATURE IMPORTANCE (DATA PREP)

In [2]:
from sklearn.model_selection import train_test_split

# Independent frame for the random-forest feature-importance analysis.
# dropna() returns a new frame, so `df` itself is left unchanged. Any row with
# a missing value is removed, including rows where the is_male mapping in the
# data-preparation cell produced NaN.
df_feature_importance_RF: pd.DataFrame = df.dropna()

# Features are every column except the prediction target 'target(thal)'.
# Note that 'heart disease diagnosis' therefore stays in the feature set.
x: pd.DataFrame = df_feature_importance_RF.drop(columns=['target(thal)'])
# Selecting a single column yields a Series, not a DataFrame.
y: pd.Series = df_feature_importance_RF['target(thal)']

# Train/test split with scikit-learn's default test size; the fixed seed
# keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# FEATURE IMPORTANCE (RANDOM FOREST)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split (fit() returns the estimator
# itself, so construction and training can be chained). The seed is fixed so
# the importances are reproducible.
clf = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Pair each feature column with the forest's importance score and rank the
# features from most to least important.
importances = clf.feature_importances_
feature_names = x.columns
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print(feature_importance_df)

                    feature  importance
13    Heart Disease Risnume    0.120602
9                   oldpeak    0.104883
14  heart disease diagnosis    0.099129
7                  thalach     0.094688
4                      chol    0.093843
3                resting_BP    0.087437
12   Max Heart Rate Reserve    0.086227
1                       age    0.078456
2                        cp    0.054921
0                   is_male    0.053685
11                       ca    0.033178
10                    slope    0.029241
6                   restecg    0.024376
8                     exang    0.024129
5                       fbs    0.015204


# SHAPLEY VALUES

In [None]:
import shap


# Build an explainer for the fitted forest, using the training split as the
# background data, then explain every row of the held-out split.
explainer = shap.Explainer(clf, x_train)
shap_values = explainer(x_test)

# Show the raw SHAP values for the first test instance.
print(shap_values[0].values)

# Overview plot of the SHAP values across the whole test split.
# NOTE(review): the model predicts 'target(thal)'; if that target has more
# than two classes the SHAP values may carry an extra per-class axis —
# confirm the installed shap version's summary_plot accepts that shape.
shap.summary_plot(shap_values, x_test)