In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('./heart_attack_prediction_dataset.csv')

In [3]:
df['Heart Attack Risk'].value_counts()

Heart Attack Risk
0    5624
1    3139
Name: count, dtype: int64

In [4]:
df = df.drop(columns='Patient ID')

In [5]:
random_state = 42
target = 'Heart Attack Risk'

In [6]:
df

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,10.806373,235420,19.655895,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,28,Female,120,157/102,73,1,0,0,1,0,...,3.833038,217881,23.993866,617,4,9,Canada,North America,Northern Hemisphere,0
8760,47,Male,250,161/75,105,0,1,1,1,1,...,2.375214,36998,35.406146,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,36,Male,178,119/67,60,1,0,1,0,0,...,0.029104,209943,27.294020,114,2,8,Brazil,South America,Southern Hemisphere,0


In [9]:
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, fmt='.2f')
plt.title("Корреляционная матрица признаков")
plt.show()

ValueError: could not convert string to float: 'Male'

<Figure size 1200x800 with 0 Axes>

In [17]:
train, test = train_test_split(df, test_size=0.2, random_state=random_state)

In [18]:
train, val = train_test_split(train, test_size=0.25, random_state=random_state)

In [19]:
train.to_parquet('train.parquet')

test.to_parquet('test.parquet')

val.to_parquet('val.parquet')

# FIRST

In [24]:
train = pd.read_parquet('train.parquet')
val = pd.read_parquet('val.parquet')
test = pd.read_parquet('test.parquet')

In [7]:
categorical_columns = train.select_dtypes(include=['object', 'category']).columns

train_1 = train.drop(columns=categorical_columns)
val_1 = val.drop(columns=categorical_columns)
test_1 = test.drop(columns=categorical_columns)

In [8]:
categorical_columns

Index(['Sex', 'Blood Pressure', 'Diet', 'Country', 'Continent', 'Hemisphere'], dtype='object')

In [11]:
model = LogisticRegression()
model.fit(train_1.drop(columns=[target]), train_1[target])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
print(f"TRAIN : {roc_auc_score(train_1[target], model.predict_proba(train_1.drop(columns=[target]))[:, 1])}")
print(f"VAL : {roc_auc_score(val_1[target], model.predict_proba(val_1.drop(columns=[target]))[:, 1])}")
print(f"TEST : {roc_auc_score(test_1[target], model.predict_proba(test_1.drop(columns=[target]))[:, 1])}")

TRAIN : 0.5091452521405606
VAL : 0.48433262561924983
TEST : 0.5198598726114649


In [13]:
print(f"TRAIN : {f1_score(train_1[target], model.predict(train_1.drop(columns=[target])))}")
print(f"VAL : {f1_score(val_1[target], model.predict(val_1.drop(columns=[target])))}")
print(f"TEST : {f1_score(test_1[target], model.predict(test_1.drop(columns=[target])))}")

TRAIN : 0.0
VAL : 0.0
TEST : 0.0


In [16]:
y_pred = model.predict(test_1.drop(columns=[target]))

accuracy = accuracy_score(test_1[target], y_pred)
precision = precision_score(test_1[target], y_pred)
recall = recall_score(test_1[target], y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.64
Precision: 0.00
Recall: 0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
feature_importance = model.coef_[0] 

feature_names = test_1.drop(columns=[target]).columns
importance_table = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(importance_table)

                            Feature    Importance
8           Exercise Hours Per Week  8.304711e-04
3                          Diabetes  2.017930e-04
0                               Age  1.519793e-04
1                       Cholesterol  1.365166e-04
15                    Triglycerides  7.073199e-05
13                           Income  2.340747e-07
10                   Medication Use -2.492817e-04
9           Previous Heart Problems -2.631311e-04
4                    Family History -3.971105e-04
5                           Smoking -5.791587e-04
7               Alcohol Consumption -8.556932e-04
6                           Obesity -9.419037e-04
2                        Heart Rate -3.511246e-03
16  Physical Activity Days Per Week -4.452408e-03
12          Sedentary Hours Per Day -4.599085e-03
17              Sleep Hours Per Day -6.387217e-03
11                     Stress Level -6.922978e-03
14                              BMI -1.064506e-02


# SECOND

In [37]:
train = pd.read_parquet('train.parquet')
val = pd.read_parquet('val.parquet')
test = pd.read_parquet('test.parquet')

In [38]:
y_train = train[target]
train = train.drop(columns=[target])
y_val = val[target]
val = val.drop(columns=[target])
y_test = test[target]
test = test.drop(columns=[target])


train = pd.get_dummies(train)
val = pd.get_dummies(val)
test = pd.get_dummies(test)

train, val = train.align(val, join='left', axis=1, fill_value=0)
train, test = train.align(test, join='left', axis=1, fill_value=0)

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)
val_scaled = scaler.transform(val)
test_scaled = scaler.transform(test)

poly = PolynomialFeatures(degree=1)
train_poly = poly.fit_transform(train_scaled)
val_poly = poly.transform(val_scaled)
test_poly = poly.transform(test_scaled)

In [39]:
model = Ridge()
model.fit(train_poly, y_train)

In [40]:
print(f"TRAIN : {roc_auc_score(y_train, model.predict(train_poly))}")
print(f"VAL : {roc_auc_score(y_val, model.predict(val_poly))}")
print(f"TEST : {roc_auc_score(y_test, model.predict(test_poly))}")

TRAIN : 0.9423766637568662
VAL : 0.5008110403397028
TEST : 0.48679971691436663


In [28]:
y_train_pred = model.predict(train_poly)
y_val_pred = model.predict(val_poly)
y_test_pred = model.predict(test_poly)

y_train_pred = (y_train_pred > 0.5).astype(int)
y_val_pred = (y_val_pred > 0.5).astype(int)
y_test_pred = (y_test_pred > 0.5).astype(int)

train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)

print("TRAIN:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"F1 Score: {train_f1:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"Recall: {train_recall:.2f}")

print("\nTEST:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"F1 Score: {test_f1:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")

TRAIN:
Accuracy: 0.85
F1 Score: 0.78
Precision: 0.81
Recall: 0.75

TEST:
Accuracy: 0.57
F1 Score: 0.29
Precision: 0.36
Recall: 0.24


# THIRD

In [38]:
train = pd.read_parquet('train.parquet')
val = pd.read_parquet('val.parquet')
test = pd.read_parquet('test.parquet')

In [39]:
y_train = train[target]
train = train.drop(columns=[target])
y_val = val[target]
val = val.drop(columns=[target])
y_test = test[target]
test = test.drop(columns=[target])


train = pd.get_dummies(train)
val = pd.get_dummies(val)
test = pd.get_dummies(test)

train, val = train.align(val, join='left', axis=1, fill_value=0)
train, test = train.align(test, join='left', axis=1, fill_value=0)


poly = PolynomialFeatures(degree=1)
train_poly = poly.fit_transform(train)
val_poly = poly.transform(val)
test_poly = poly.transform(test)

pca = PCA(n_components=15)
train_pca = pca.fit_transform(train_poly)
val_pca = pca.transform(val_poly)
test_pca = pca.transform(test_poly)

LR = GradientBoostingClassifier(n_estimators = 20, max_depth = 6, learning_rate = 0.3).fit(train_pca, y_train)

In [42]:
y_train_pred = LR.predict(train_pca)
y_val_pred = LR.predict(val_pca)
y_test_pred = LR.predict(test_pca)

y_train_proba = LR.predict_proba(train_pca)[:, 1]
y_val_proba = LR.predict_proba(val_pca)[:, 1]
y_test_proba = LR.predict_proba(test_pca)[:, 1]

train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_roc_auc = roc_auc_score(y_train, y_train_proba)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_proba)

val_roc_auc = roc_auc_score(y_val, y_val_proba)

print("TRAIN METRICS:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"F1-Score: {train_f1:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"ROC-AUC: {train_roc_auc:.2f}")

print("\nTEST METRICS:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"F1-Score: {test_f1:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"ROC-AUC: {test_roc_auc:.2f}")

print("\nVAL METRICS:")
print(f"ROC-AUC: {val_roc_auc:.2f}")

TRAIN METRICS:
Accuracy: 0.79
F1-Score: 0.60
Recall: 0.43
Precision: 0.99
ROC-AUC: 0.91

TEST METRICS:
Accuracy: 0.61
F1-Score: 0.18
Recall: 0.12
Precision: 0.36
ROC-AUC: 0.49

VAL METRICS:
ROC-AUC: 0.51


# FOURTH