In [37]:
import pandas as pd
import numpy as np

In [39]:
train = pd.read_csv("Heart_Attack_training_dataset.csv")

In [41]:
test = pd.read_csv("Hear_Attack_evaluation_dataset.csv")

In [43]:
train[['bp_sys', 'bp_dia']] = train['bp'].str.split('/', expand=True).astype(float)

In [45]:
test[['bp_sys', 'bp_dia']] = test['bp'].str.split('/', expand=True).astype(float)

In [47]:
for df in [train, test]:
    df['pulse_pressure'] = df['bp_sys'] - df['bp_dia']
    df['MAP'] = df['bp_dia'] + df['pulse_pressure'] / 3

In [49]:
binary_cols = ['diabetes', 'family_history', 'smoking', 
               'obesity', 'alcohol', 'prev_heart_prob', 'med_use']

In [51]:
for df in [train, test]:
    for col in binary_cols:
      
        df[col] = df[col].fillna(0)
        
        
        df[col] = df[col].map({'Yes': 1, 'No': 0, 
                               'Y': 1, 'N': 0, 
                               'True': 1, 'False': 0, 
                               1: 1, 0: 0})  
        
        
        df[col] = df[col].astype(int)

In [53]:
train[binary_cols].head()

Unnamed: 0,diabetes,family_history,smoking,obesity,alcohol,prev_heart_prob,med_use
0,0,0,1,0,0,0,0
1,1,1,1,1,1,1,0
2,1,0,0,0,0,1,1
3,1,1,1,0,1,1,0
4,1,1,1,1,0,1,0


In [55]:
test[binary_cols].head()

Unnamed: 0,diabetes,family_history,smoking,obesity,alcohol,prev_heart_prob,med_use
0,1,1,1,0,0,0,0
1,0,0,1,1,0,1,1
2,0,0,1,0,0,1,0
3,1,0,1,1,0,0,0
4,1,1,1,0,0,0,0


In [57]:
sex_map = {'Male': 1, 'Female': 0}
for df in [train, test]:
    df['sex'] = df['sex'].map(sex_map).astype(int)

In [59]:
print("Train diet values:", train['diet'].unique())

Train diet values: ['Average' 'Unhealthy' 'Healthy']


In [61]:
print("Test diet values:", test['diet'].unique())


Test diet values: ['Healthy' 'Unhealthy' 'Average']


In [63]:
diet_map = {'Poor': 0, 'Average': 1, 'Healthy': 2} 

In [65]:
for df in [train, test]:
    df['diet'] = df['diet'].map(diet_map)
    df['diet'] = df['diet'].fillna(-1).astype(int)

In [67]:
print("Class distribution in train dataset:")
print(train['heart_attack_risk'].value_counts())

Class distribution in train dataset:
heart_attack_risk
0    5224
1    2739
Name: count, dtype: int64


In [69]:
print("\nClass percentages:")
print(train['heart_attack_risk'].value_counts(normalize=True) * 100)


Class percentages:
heart_attack_risk
0    65.603416
1    34.396584
Name: proportion, dtype: float64


In [71]:
from sklearn.impute import SimpleImputer

In [73]:
num_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_cols.remove('heart_attack_risk') 

In [75]:
median_imputer = SimpleImputer(strategy='median')


In [77]:
train[num_cols] = median_imputer.fit_transform(train[num_cols])
test[num_cols] = median_imputer.transform(test[num_cols])

In [79]:
cat_cols = train.select_dtypes(include=['object']).columns.tolist()

In [81]:
for col in cat_cols:
   
    mode_value = train[col].mode()[0]  
    train[col] = train[col].fillna(mode_value)
    test[col] = test[col].fillna(mode_value) 

In [83]:
top_k = 10
top_countries = train['country'].value_counts().nlargest(top_k).index.tolist()

In [85]:
def encode_country(df, top_countries):
    df['country_enc'] = df['country'].apply(lambda x: x if x in top_countries else 'Other')
    # Frequency encoding
    freq = df['country_enc'].value_counts() / len(df)
    df['country_enc'] = df['country_enc'].map(freq)
    return df

In [87]:
train = encode_country(train, top_countries)
test = encode_country(test, top_countries)


In [89]:
train = train.drop(columns=['country'])
test = test.drop(columns=['country'])

In [91]:
bins = [0, 40, 60, np.inf]
labels = ['<40', '40-60', '60+']
for df in [train, test]:
    df['age_bucket'] = pd.cut(df['age'], bins=bins, labels=labels)

In [93]:
age_map = {'<40': 0, '40-60': 1, '60+': 2}
for df in [train, test]:
    df['age_bucket'] = df['age_bucket'].map(age_map)

In [95]:
for df in [train, test]:
    df['age_bmi'] = df['age'] * df['bmi']


In [97]:
def bmi_category(bmi):
    if bmi < 18.5:
        return 0  
    elif bmi < 25:
        return 1 
    elif bmi < 30:
        return 2  
    else:
        return 3 


In [99]:
for df in [train, test]:
    df['bmi_cat'] = df['bmi'].apply(bmi_category)

In [101]:
for df in [train, test]:
    df['chol_bin'] = df['chol'].apply(lambda x: 1 if x > 200 else 0) 
    df['trig_bin'] = df['triglycerides'].apply(lambda x: 1 if x > 150 else 0) 

In [103]:
for df in [train, test]:
    df['exercise_per_day'] = df['exercise_hr_wk'] / (df['phys_act_days'] + 1e-6)

In [105]:
for df in [train, test]:
    df['sedentary_sleep_ratio'] = df['sedentary_hr'] / (df['sleep_hr'] + 1e-6)


In [107]:
for df in [train, test]:
    df['income_log'] = np.log1p(df['income'])

In [109]:
cont_mean_target = train.groupby('continent')['heart_attack_risk'].mean().to_dict()
for df in [train, test]:
    df['continent_te'] = df['continent'].map(cont_mean_target)


In [111]:
train = train.drop(columns=['continent'])
test = test.drop(columns=['continent'])

In [113]:
!pip install scikit-learn imbalanced-learn category_encoders




In [114]:
import sklearn

In [115]:
print(sklearn.__version__)

1.7.2


In [119]:
!pip install --upgrade scikit-learn




In [120]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [121]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [122]:
import sklearn
import category_encoders as ce

print("sklearn:", sklearn.__version__)


sklearn: 1.7.2


In [127]:
!pip install imbalanced-learn




In [128]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier


In [129]:
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [133]:
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [135]:
target_encoder = ce.TargetEncoder()

In [137]:
numeric_features = ['age', 'bmi', 'chol', 'triglycerides', 'exercise_hr_wk', 
                    'phys_act_days', 'sleep_hr', 'sedentary_hr', 'pulse_pressure', 
                    'MAP', 'age_bmi', 'exercise_per_day', 'sedentary_sleep_ratio', 
                    'income_log']

In [139]:
low_card_cat_features = ['sex', 'diet', 'bmi_cat', 'age_bucket']  # one-hot

high_card_cat_features = ['continent']

In [141]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat_low', categorical_pipeline, low_card_cat_features)
    # Note: high-card features like 'continent' can be target-encoded inside CV only
])

In [143]:
clf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')

pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # applies only on training fold
    ('classifier', clf)
])

In [145]:
from sklearn.model_selection import cross_val_score

X = train.drop(columns=['heart_attack_risk'])
y = train['heart_attack_risk']

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y, cv=cv, scoring='recall')
print("Recall scores per fold:", scores)
print("Mean recall:", scores.mean())

Recall scores per fold: [0.10766423 0.12956204 0.10218978 0.10786106 0.13321168]
Mean recall: 0.116097759511069


In [147]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

In [149]:
target = 'heart_attack_risk'

# Define StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [151]:
logreg_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # only applied to training folds
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])

In [153]:
rf_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state=42))
])


In [155]:
X = train.drop(columns=[target])
y = train[target]

# Evaluate Logistic Regression with recall
logreg_scores = cross_val_score(logreg_pipeline, X, y, cv=skf, scoring='recall')
print("Logistic Regression Recall Scores:", logreg_scores)
print("Mean Recall:", logreg_scores.mean())


Logistic Regression Recall Scores: [0.48905109 0.51824818 0.54014599 0.50457038 0.52737226]
Mean Recall: 0.5158775804320848


In [159]:
rf_scores = cross_val_score(rf_pipeline, X, y, cv=skf, scoring='recall')
print("Random Forest Recall Scores:", rf_scores)
print("Mean Recall:", rf_scores.mean())

Random Forest Recall Scores: [0.10766423 0.12956204 0.10218978 0.10786106 0.13321168]
Mean Recall: 0.116097759511069


In [163]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.5 MB 330.3 kB/s eta 0:00:05
   - -------------------------------------- 0.1/1.5 MB 563.7 kB/s eta 0:00:03
   -- ------------------------------------- 0.1/1.5 MB 590.8 kB/s eta 0:00:03
   ----- ---------------------------------- 0.2/1.5 MB 798.5 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.5 MB 801.7 kB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.5 MB 936.6 kB/s eta 0:00:02
   ---------- ----------------------------- 0.4/1.5 MB 998.3 kB/s eta 0:00:02
   ----------- ---------------------------- 0.4/1.5 MB 1.0 MB/s eta 0:00:02
   -------------- ------------------------- 0.5/1.5 MB 1.1 MB/s eta 0:00:01
   -------------

In [165]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [167]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# LightGBM pipeline with preprocessing + SMOTE
lgbm_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(random_state=42))
])


In [169]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300, 400],
    'classifier__max_depth': [3, 5, 7, 10, -1],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__num_leaves': [20, 31, 50, 100],
    'classifier__min_data_in_leaf': [10, 20, 50, 100],
    'classifier__scale_pos_weight': [1, (y==0).sum()/(y==1).sum()]  # handle imbalance
}


In [171]:
scoring = {
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': 'roc_auc'
}

In [173]:
random_search = RandomizedSearchCV(
    estimator=lgbm_pipeline,
    param_distributions=param_grid,
    n_iter=50,  # number of random combinations
    scoring=scoring,
    refit='recall',  # pick the best model based on recall
    cv=skf,
    verbose=2,
    n_jobs=-1,
    random_state=42
)


In [178]:
random_search.fit(X, y)

# Best model
print("Best Parameters:", random_search.best_params_)
print("Best Recall Score:", random_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Number of positive: 5224, number of negative: 5224
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5418
[LightGBM] [Info] Number of data points in the train set: 10448, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best Parameters: {'classifier__scale_pos_weight': 1.9072654253377146, 'classifier__num_leaves': 20, 'classifier__n_estimators': 200, 'classifier__min_data_in_leaf': 50, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.01}
Best Recall Score: 0.984662859125422


In [176]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X)

from sklearn.metrics import classification_report
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.01      0.03      5224
           1       0.35      1.00      0.51      2739

    accuracy                           0.35      7963
   macro avg       0.63      0.51      0.27      7963
weighted avg       0.71      0.35      0.20      7963





In [177]:
from sklearn.metrics import recall_score

In [183]:
from sklearn.ensemble import RandomForestClassifier

In [191]:
target = 'heart_attack_risk'
X = train.drop(columns=[target])
y = train[target]


In [193]:
# Identify categorical and numeric features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [195]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [197]:
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [199]:
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

In [201]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

In [203]:
X_train = train.drop(columns=['heart_attack_risk'])  # features
y_train = train['heart_attack_risk']                 # target
X_test = test.copy()                                 # test features (no target)


In [205]:
val_fraction = 0.2
val_size = int(len(X_train) * val_fraction)
X_val = X_train.iloc[:val_size]
y_val = y_train.iloc[:val_size]
X_train_rest = X_train.iloc[val_size:]
y_train_rest = y_train.iloc[val_size:]

In [207]:
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [227]:
proba = model_pipeline.predict_proba(X_val)[:, 1]
print(proba[:20])

[0.11 0.11 0.09 0.17 0.11 0.76 0.73 0.12 0.06 0.16 0.74 0.68 0.1  0.79
 0.15 0.13 0.73 0.11 0.15 0.69]


In [229]:
thresholds = np.arange(0.0, 1.01, 0.01)


best_thr = 0.5
best_recall = 0


In [231]:
for thr in thresholds:
    preds = (proba > thr).astype(int)
    rec = recall_score(y_val, preds)
    if rec > best_recall:
        best_recall = rec
        best_thr = thr

print(f"Best threshold: {best_thr}")
print(f"Recall at best threshold: {best_recall}")

Best threshold: 0.0
Recall at best threshold: 1.0


In [233]:
test_proba = model_pipeline.predict_proba(test)[:, 1]
final_preds = (test_proba > best_thr).astype(int)

In [235]:
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'heart_attack_risk': final_preds
})

In [237]:
submission.to_csv("EM24_Astra_Task1_Predictions.csv", index=False)
print("Submission CSV saved.")

Submission CSV saved.


In [239]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Compute predictions using best threshold
val_preds = (proba > best_thr).astype(int)

# Compute metrics
accuracy = accuracy_score(y_val, val_preds)
precision = precision_score(y_val, val_preds)
recall = recall_score(y_val, val_preds)
f1 = f1_score(y_val, val_preds)
roc_auc = roc_auc_score(y_val, proba)

# Display nicely
print("Validation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}")


Validation Metrics:
Accuracy : 0.3204
Precision: 0.3204
Recall   : 1.0000
F1-score : 0.4853
ROC-AUC  : 1.0000


In [241]:
import pandas as pd

metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"],
    "Value": [accuracy, precision, recall, f1, roc_auc]
})

metrics_df


Unnamed: 0,Metric,Value
0,Accuracy,0.320352
1,Precision,0.320352
2,Recall,1.0
3,F1-score,0.485252
4,ROC-AUC,1.0


In [245]:
!pip install matplotlib


Collecting matplotlib
  Downloading matplotlib-3.10.6-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.1-cp312-cp312-win_amd64.whl.metadata (114 kB)
     ---------------------------------------- 0.0/114.6 kB ? eta -:--:--
     --- ------------------------------------ 10.2/114.6 kB ? eta -:--:--
     --------- --------------------------- 30.7/114.6 kB 660.6 kB/s eta 0:00:01
     ----------------------------- ------- 92.2/114.6 kB 751.6 kB/s eta 0:00:01
     ------------------------------------ 114.6/114.6 kB 739.9 kB/s eta 0:00:00
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.6-cp312-cp312-win_amd64.whl 

In [243]:
import matplotlib.pyplot as plt

# Create a figure and hide axes
fig, ax = plt.subplots(figsize=(6, 2))
ax.axis('off')

# Add metrics text
metrics_text = (
    f"Accuracy : {accuracy:.4f}\n"
    f"Precision: {precision:.4f}\n"
    f"Recall   : {recall:.4f}\n"
    f"F1-score : {f1:.4f}\n"
    f"ROC-AUC  : {roc_auc:.4f}"
)

ax.text(0.5, 0.5, metrics_text, fontsize=14, ha='center', va='center')
plt.tight_layout()
plt.savefig("EM24_Astra_Task1_Metrics.png")
plt.show()


ModuleNotFoundError: No module named 'matplotlib'