# DATA PREPARATION

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw heart dataset; the first CSV row holds the column names.
heart_dataset: pd.DataFrame = pd.read_csv("~/git/thesis_thallasemia/heartv1.csv", header=0)

# Build the working frame as a new object, leaving `heart_dataset` untouched:
#   * 'target' (the old target) becomes the feature 'heart disease diagnosis'
#   * 'thal' becomes the new prediction target 'target(thal)'
#   * 'sex' becomes 'is_male', encoded as male -> 1 / female -> 0
#     (any other value maps to NaN and is removed by the dropna below)
#   * rows containing any missing value are dropped
df: pd.DataFrame = (
    heart_dataset
    .rename(columns={
        'sex': 'is_male',
        'thal': 'target(thal)',
        'target': 'heart disease diagnosis',
    })
    .assign(is_male=lambda frame: frame['is_male'].map({'male': 1, 'female': 0}))
    .dropna()
)

# Separate the target column from the columns used to predict it.
target_column = 'target(thal)'
y: pd.Series = df[target_column]
x: pd.DataFrame = df.drop(columns=[target_column])

# Reproducible train/test split (fixed seed, library-default split proportions).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
# Feature subset chosen from the correlation matrix.
features_cm = ["heart disease diagnosis", "is_male", "exang", "oldpeak"]

# Independent copy of the training data restricted to those columns.
x_train_cm: pd.DataFrame = x_train[features_cm].copy()
x_train_cm

Unnamed: 0,heart disease diagnosis,is_male,exang,oldpeak
306,0,0,1,4.0
221,1,0,0,0.0
235,0,1,0,2.4
342,0,1,1,3.6
308,0,1,0,0.0
...,...,...,...,...
87,1,0,0,1.8
330,0,0,0,0.0
466,1,0,0,1.6
121,0,1,0,0.0


In [None]:
# Feature subset chosen from the ppscore analysis.
# NOTE(review): "thalach " is spelled with a trailing space; presumably this
# matches the raw CSV header -- verify before stripping it.
features_pp = [
    "Heart Disease Risnume",
    "chol",
    "thalach ",
    "Max Heart Rate Reserve",
    "heart disease diagnosis",
    "oldpeak",
]

# Independent copy of the training data restricted to those columns.
# (The original indexed with an empty `[]`, which is a syntax error.)
x_train_pp: pd.DataFrame = x_train[features_pp].copy()
x_train_pp

Unnamed: 0,Heart Disease Risnume,chol,thalach,Max Heart Rate Reserve,heart disease diagnosis,oldpeak
306,17.48,288,133,31,0,4.0
221,13.95,265,130,19,1,0.0
235,12.82,322,109,41,0,2.4
342,13.52,166,125,34,0,3.6
308,9.74,230,160,8,0,0.0
...,...,...,...,...,...,...
87,12.09,239,151,0,1,1.8
330,12.00,330,169,-10,0,0.0
466,9.74,214,158,8,1,1.6
121,11.30,300,171,-9,0,0.0


In [4]:
# Feature subset chosen from the random forest feature importances.
# These 8 features account for ~76.6% of total importance.
features_rf = [
    "Heart Disease Risnume",
    "oldpeak",
    "heart disease diagnosis",
    "thalach ",
    "chol",
    "resting_BP",
    "Max Heart Rate Reserve",
    "age",
]

# Independent copy of the training data restricted to those columns.
x_train_rf: pd.DataFrame = x_train[features_rf].copy()
x_train_rf

Unnamed: 0,Heart Disease Risnume,oldpeak,heart disease diagnosis,thalach,chol,resting_BP,Max Heart Rate Reserve,age
306,17.48,4.0,0,133,288,200,31,58
221,13.95,0.0,1,130,265,110,19,73
235,12.82,2.4,0,109,322,130,41,72
342,13.52,3.6,0,125,166,138,34,63
308,9.74,0.0,0,160,230,112,8,54
...,...,...,...,...,...,...,...,...
87,12.09,1.8,1,151,239,140,0,71
330,12.00,0.0,0,169,330,130,-10,63
466,9.74,1.6,1,158,214,110,8,56
121,11.30,0.0,0,171,300,125,-9,60


In [5]:
# Feature subset chosen from a feature importance ranking, keeping features
# at or above a reasonable cutoff of ~0.05.
# NOTE(review): the `_dt` suffix suggests these importances come from a
# decision tree rather than the random forest named in the original comment
# -- confirm.
# The selection is currently the same set as the random forest one (in a
# different column order), but would have differed with a smaller cutoff.
features_dt = [
    "heart disease diagnosis",
    "Heart Disease Risnume",
    "chol",
    "Max Heart Rate Reserve",
    "thalach ",
    "oldpeak",
    "age",
    "resting_BP",
]

# Independent copy of the training data restricted to those columns.
x_train_dt: pd.DataFrame = x_train[features_dt].copy()
x_train_dt

Unnamed: 0,heart disease diagnosis,Heart Disease Risnume,chol,Max Heart Rate Reserve,thalach,oldpeak,age,resting_BP
306,0,17.48,288,31,133,4.0,58,200
221,1,13.95,265,19,130,0.0,73,110
235,0,12.82,322,41,109,2.4,72,130
342,0,13.52,166,34,125,3.6,63,138
308,0,9.74,230,8,160,0.0,54,112
...,...,...,...,...,...,...,...,...
87,1,12.09,239,0,151,1.8,71,140
330,0,12.00,330,-10,169,0.0,63,130
466,1,9.74,214,8,158,1.6,56,110
121,0,11.30,300,-9,171,0.0,60,125


# TRAINING OF ALL THE RANDOM FORESTS
One random forest is trained for each set of relevant features.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest trained on the correlation-matrix feature subset.
model_cm = RandomForestClassifier(random_state=42)
model_cm.fit(x_train_cm, y_train)

# The test frame must be restricted to the same columns the model was fitted
# on; passing the full-width `x_test` makes scikit-learn reject the input
# because its features do not match those seen during fit.
pred_cm = model_cm.predict(x_test[features_cm])

In [None]:
# Random forest trained on the ppscore feature subset. `fit` returns the
# fitted estimator itself, so the result can be bound directly.
model_pp = RandomForestClassifier(random_state=42).fit(x_train_pp, y_train)
model_pp

In [None]:
# Random forest trained on the random-forest-importance feature subset.
# `fit` returns the fitted estimator itself, so the result can be bound directly.
model_rf = RandomForestClassifier(random_state=42).fit(x_train_rf, y_train)
model_rf

In [None]:
# Random forest trained on the `x_train_dt` feature subset.
# `fit` returns the fitted estimator itself, so the result can be bound directly.
model_dt = RandomForestClassifier(random_state=42).fit(x_train_dt, y_train)
model_dt