# DATA PREPARATION

In [2]:
import pandas as pd
heart_dataset:pd.DataFrame = pd.read_csv("~/git/thesis_thallasemia/heartv1.csv", header=0)
df:pd.DataFrame = heart_dataset.copy()

# renamed the headers(target to "heart disease diagnosis(old target)" and thal to "target(thal)")
df.rename(columns={
	'target': 'heart disease diagnosis',
	'thal': 'target(thal)',
	'sex': 'is_male'
}, inplace=True)

# Map sex to binary
df['is_male'] = df['is_male'].map({'male': 1, 'female': 0})
df

Unnamed: 0,is_male,age,cp,resting_BP,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,target(thal),Max Heart Rate Reserve,Heart Disease Risnume,heart disease diagnosis
0,1,65,0,130,254,0,0,147,0,1.4,1,1,3,10,11.44,0
1,1,53,0,140,261,0,0,186,1,0.0,2,0,2,-17,13.51,1
2,1,50,0,122,222,0,0,186,0,0.0,2,0,2,-14,9.46,1
3,0,47,1,112,160,0,1,138,0,0.0,1,0,2,37,8.34,1
4,1,47,0,142,309,0,0,147,1,0.0,1,3,3,28,13.43,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030,0,45,2,145,195,0,1,180,1,3.0,1,0,2,-5,15.90,1
1031,1,53,0,125,210,0,1,165,0,0.8,1,1,3,2,13.20,0
1032,0,60,1,130,240,1,0,150,1,2.2,2,2,1,10,18.80,1
1033,1,67,2,155,205,0,1,140,0,1.1,0,0,2,13,16.10,0


# CORRELATION MATRIX

In [15]:
# correlation_matrix = df.corr(method="spearman").round(2)
# correlation_matrix.style.background_gradient()
df.corr().round(2)

Unnamed: 0,is_male,age,cp,resting_BP,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,target(thal),Max Heart Rate Reserve,Heart Disease Risnume,heart disease diagnosis
is_male,1.0,-0.1,-0.04,-0.07,-0.19,0.02,-0.05,-0.05,0.13,0.08,-0.03,0.1,0.2,0.1,-0.01,-0.29
age,-0.1,1.0,-0.07,0.27,0.22,0.12,-0.13,-0.39,0.08,0.21,-0.17,0.27,0.07,-0.0,0.59,-0.23
cp,-0.04,-0.07,1.0,0.04,-0.08,0.08,0.04,0.3,-0.4,-0.17,0.13,-0.18,-0.16,-0.3,-0.26,0.43
resting_BP,-0.07,0.27,0.04,1.0,0.13,0.18,-0.12,-0.04,0.06,0.19,-0.13,0.1,0.06,-0.07,0.4,-0.14
chol,-0.19,0.22,-0.08,0.13,1.0,0.03,-0.15,-0.02,0.07,0.06,-0.01,0.07,0.1,-0.07,0.38,-0.1
fbs,0.02,0.12,0.08,0.18,0.03,1.0,-0.11,-0.01,0.05,0.01,-0.05,0.14,-0.05,-0.04,0.44,-0.03
restecg,-0.05,-0.13,0.04,-0.12,-0.15,-0.11,1.0,0.05,-0.07,-0.05,0.08,-0.08,-0.02,0.0,-0.18,0.13
thalach,-0.05,-0.39,0.3,-0.04,-0.02,-0.01,0.05,1.0,-0.37,-0.34,0.39,-0.2,-0.1,-0.92,-0.41,0.42
exang,0.13,0.08,-0.4,0.06,0.07,0.05,-0.07,-0.37,1.0,0.32,-0.26,0.11,0.19,0.37,0.71,-0.43
oldpeak,0.08,0.21,-0.17,0.19,0.06,0.01,-0.05,-0.34,0.32,1.0,-0.57,0.22,0.19,0.28,0.34,-0.43


In [5]:
# Extract the row for 'target(thal)' and sort it
df.corr().round(2).loc['target(thal)'].reindex(
    df.corr().round(2).loc['target(thal)'].abs().sort_values(ascending=False).index
)

target(thal)               1.00
heart disease diagnosis   -0.34
is_male                    0.20
exang                      0.19
oldpeak                    0.19
cp                        -0.16
Heart Disease Risnume      0.16
ca                         0.14
chol                       0.10
thalach                   -0.10
slope                     -0.10
Max Heart Rate Reserve     0.08
age                        0.07
resting_BP                 0.06
fbs                       -0.05
restecg                   -0.02
Name: target(thal), dtype: float64

# PREDICTIVE POWER SCORE

In [16]:
import ppscore as pps

pps_matrix = pps.matrix(df)
pps_target = pps.predictors(df, y="target(thal)")
print(pps_target)

                          x             y   ppscore        case  \
0     Heart Disease Risnume  target(thal)  0.725841  regression   
1                      chol  target(thal)  0.372011  regression   
2                  thalach   target(thal)  0.158328  regression   
3    Max Heart Rate Reserve  target(thal)  0.091912  regression   
4   heart disease diagnosis  target(thal)  0.068732  regression   
5                   oldpeak  target(thal)  0.064626  regression   
6                resting_BP  target(thal)  0.008096  regression   
7                   is_male  target(thal)  0.000000  regression   
8                       age  target(thal)  0.000000  regression   
9                        cp  target(thal)  0.000000  regression   
10                      fbs  target(thal)  0.000000  regression   
11                  restecg  target(thal)  0.000000  regression   
12                    exang  target(thal)  0.000000  regression   
13                    slope  target(thal)  0.000000  regressio

# FEATURE IMPORTANCE (DATA PREP)

In [17]:
from sklearn.model_selection import train_test_split

df_feature_importance_RF:pd.DataFrame = df.copy()

# Drop rows with missing values if any
df_feature_importance_RF.dropna(inplace=True)

# Define features and target
x:pd.DataFrame = df_feature_importance_RF.drop(columns=['target(thal)'])  # or just keep the column you want as target
y:pd.DataFrame = df_feature_importance_RF['target(thal)']  # or use 'heart disease diagnosis(old target)' depending on what you're analyzing

# Train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# FEATURE IMPORTANCE (RANDOM FOREST)

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Train model
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)

# Get feature importance
importances = clf.feature_importances_
feature_names = x.columns
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(feature_importance_df)

                    feature  importance
13    Heart Disease Risnume    0.120602
9                   oldpeak    0.104883
14  heart disease diagnosis    0.099129
7                  thalach     0.094688
4                      chol    0.093843
3                resting_BP    0.087437
12   Max Heart Rate Reserve    0.086227
1                       age    0.078456
2                        cp    0.054921
0                   is_male    0.053685
11                       ca    0.033178
10                    slope    0.029241
6                   restecg    0.024376
8                     exang    0.024129
5                       fbs    0.015204


# FEATURE IMPORTANCE (DECISION TREES)

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Train model
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

# Feature importance
importances = tree.feature_importances_
feature_names = x.columns
feature_importance_df = pd.DataFrame({
    'importance': importances,
    'feature': feature_names
}).sort_values(by='importance', ascending=False)

print(feature_importance_df)

    importance                  feature
14    0.209875  heart disease diagnosis
13    0.129596    Heart Disease Risnume
4     0.114734                     chol
12    0.107855   Max Heart Rate Reserve
7     0.104914                 thalach 
9     0.101584                  oldpeak
1     0.062238                      age
3     0.054817               resting_BP
0     0.036372                  is_male
11    0.033122                       ca
5     0.023714                      fbs
2     0.011880                       cp
10    0.009300                    slope
6     0.000000                  restecg
8     0.000000                    exang
