In [None]:
#load Data
import pandas as pd
import numpy as np
import plotly.express as px

# Resolve where the dataset lives: Google Drive when the notebook runs on Colab,
# otherwise a copy of the CSV next to the notebook.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable on Colab; fall back to the local file.
    DATA_PATH = 'heart_2022_with_nans.csv'
else:
    drive.mount('/content/drive')
    DATA_PATH = '/content/drive/MyDrive/Epsilon/Epsilon Final Project/heart_2022_with_nans.csv'

#show all columns
pd.set_option('display.max_columns', None)

df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the raw frame exactly as loaded from the CSV.
df

# Data Exploring

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe(include='number').round(2)

In [None]:
# Summary (count, unique, top, freq) of the object-typed (categorical) columns.
df.describe(include='O')

In [None]:
# Remove five columns that are not used as features. WeightInKilograms goes
# because BMI is already present; the reason for dropping the other four is not
# recorded in this notebook.
df = df.drop(
    columns=[
        'WeightInKilograms',
        'State',
        'RaceEthnicityCategory',
        'HadAsthma',
        'HadSkinCancer',
    ]
)
# De-duplicate on the remaining columns.
df.drop_duplicates(inplace=True)


In [None]:
# Names of the numeric columns of the current frame.
Num_columns = df.select_dtypes(include='number').columns
Num_columns

In [None]:
# Names of the object-typed (categorical) columns of the current frame.
cat_columns = df.select_dtypes(include='object').columns

In [None]:
# List the distinct values of every categorical column, one separator line after each.
for col in cat_columns:
    print(f"{col}: {df[col].unique()}", "*" * 20, sep="\n")

In [None]:
# Preview the frame after the column drop and de-duplication.
df

## In depth check

In [None]:
# One interactive histogram per numeric column.
for col in Num_columns:
    px.histogram(df, x=col).show()

# Data Preprocessing

In [None]:
# Start from a de-duplicated frame.
df.drop_duplicates(inplace=True)

# Percentage of missing values per column, highest first (numeric, then categorical).
df_num_sorted = df[Num_columns].isna().mean().sort_values(ascending=False).mul(100)
df_cat_sorted = df[cat_columns].isna().mean().sort_values(ascending=False).mul(100)

# Columns with fewer than 5% missing values: their incomplete rows are dropped
# here; columns at or above 5% keep their NaNs.
cols_to_dropna = [
    *df_num_sorted[df_num_sorted < 5].index,
    *df_cat_sorted[df_cat_sorted < 5].index,
]

df.dropna(subset=cols_to_dropna, inplace=True)
df.drop_duplicates(inplace=True)

In [None]:
# Numeric columns that still contain missing values after the row drop.
df_num_sorted = df[Num_columns].isna().mean().sort_values(ascending=False).mul(100)
nan_col_num = df_num_sorted.loc[df_num_sorted.gt(0)].index
nan_col_num

In [None]:
# Categorical columns that still contain missing values after the row drop.
df_cat_sorted = df[cat_columns].isna().mean().sort_values(ascending=False).mul(100)
nan_col_cat = df_cat_sorted.loc[df_cat_sorted.gt(0)].index
nan_col_cat

In [None]:
# Remove exact duplicate rows in place.
df.drop_duplicates(inplace=True)

In [None]:
# Merge the two "does not currently use" answers into the single label "Not at all".
df["ECigaretteUsage"] = df["ECigaretteUsage"].replace(
    ["Never used e-cigarettes in my entire life", "Not at all (right now)"],
    "Not at all",
)

In [None]:
# Share of each category, as a percentage of ALL rows (rows with a missing value
# count in the denominator), for the categorical columns that still contain NaNs.
for col in nan_col_cat:
    print(f"{col}: {df[col].value_counts() / len(df[col]) * 100}", "*" * 20, sep="\n")

In [None]:
# Category shares (percent of non-missing rows) for every categorical column,
# inspected to decide which encoder suits each column.
for col in cat_columns:
    print(f"{col}: {df[col].value_counts(normalize=True).mul(100)}", "*" * 20, sep="\n")

# Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Target: HadHeartAttack encoded as 1 (Yes) / 0 (No); features: every other column.
y = df["HadHeartAttack"].map({'Yes': 1, 'No': 0})
x = df.drop(columns="HadHeartAttack")

# Stratified 80/20 split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# Recompute the column groups from the feature frame, which no longer contains the target.
cat_columns = x_train.select_dtypes(include='object').columns
Num_columns = x_train.select_dtypes(include='number').columns

## Class weight calculation

In [None]:
# Number of training rows per class label (0 = No, 1 = Yes).
df_count = y_train.value_counts()
df_count

In [None]:
# Training-row counts for label 0 (negative) and label 1 (positive).
negative, positive = df_count[0], df_count[1]

### XGBoost

In [None]:
# Ratio of negative to positive training rows, passed to XGBClassifier as scale_pos_weight.
XGBoost_scale_pos_weight = negative / positive
XGBoost_scale_pos_weight

### CatBoost

In [None]:
# Inverse-frequency class weights: total / (2 * class count), for label 0 (w0)
# and label 1 (w1). Passed to CatBoost as class_weights=[w0, w1].
total = negative + positive
w0, w1 = total / (2 * negative), total / (2 * positive)

In [None]:
# Cross-check of the class counts: bincount returns the number of 0 labels, then the number of 1 labels.
neg, pos = np.bincount(y_train)
neg,pos

In [None]:
# Categorical feature names fed to the encoders below.
cat_columns

In [None]:
# Numeric feature names fed to the numeric pipeline below.
Num_columns

## Categorical pipeline

In [None]:
#!pip install category_encoders

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from category_encoders import BinaryEncoder

# Group 1: every categorical column except the two handled separately below.
# Missing values are filled with the most frequent category, then one-hot encoded.
FrqImputerOHEncoder_col = [
    col for col in cat_columns
    if col not in ('TetanusLast10Tdap', 'AgeCategory')
]
FrqImputerOHEncoder_Pipeline = Pipeline(steps=[
    ('CategoryPipeline_Freq_Imputer', SimpleImputer(strategy='most_frequent')),
    ('CategoryPipeline_OneHot_Encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

# Group 2: TetanusLast10Tdap keeps "missing" as its own category ('unKnown')
# before one-hot encoding.
ConstImputerOHEncoder_col = ["TetanusLast10Tdap"]
ConstImputerOHEncoder_Pipeline = Pipeline(steps=[
    ('CategoryPipeline_Constant_Imputer', SimpleImputer(strategy='constant', fill_value='unKnown')),
    ('CategoryPipeline_OneHot_Encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

# A BinaryEncoder pipeline for 'State' used to live here; it is disabled because
# 'State' is no longer among the features.

# Group 3: AgeCategory is ordinal, encoded by position in this youngest-to-oldest list.
Ordenal_Encoder_col = ['AgeCategory']
Ordinal_Encoder_Pipeline = Pipeline(steps=[
    ("OrdinalEncoder", OrdinalEncoder(categories=[[
        'Age 18 to 24',
        'Age 25 to 29',
        'Age 30 to 34',
        'Age 35 to 39',
        'Age 40 to 44',
        'Age 45 to 49',
        'Age 50 to 54',
        'Age 55 to 59',
        'Age 60 to 64',
        'Age 65 to 69',
        'Age 70 to 74',
        'Age 75 to 79',
        'Age 80 or older',
    ]])),
])


## numeric pipeline

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler

# Numeric features: fill missing values with the column median, then scale with RobustScaler.
num_pipeline1 = Pipeline(steps=[
    ('Simple Impute', SimpleImputer(strategy='median')),
    ('Robust Scaler', RobustScaler()),
])


## Preprocessing Pipeline

In [None]:
from sklearn.compose import ColumnTransformer


# Route each column group to its pipeline; columns not listed are passed through unchanged.
preprocessing = ColumnTransformer(
    transformers=[
        ('num_pipeline1', num_pipeline1, Num_columns),
        ('FrqImputerOHEncoder_Pipeline', FrqImputerOHEncoder_Pipeline, FrqImputerOHEncoder_col),
        ('ConstImputerOHEncoder_Pipeline', ConstImputerOHEncoder_Pipeline, ConstImputerOHEncoder_col),
        # The Binary_Encoder_Pipeline entry is disabled.
        ('Ordinal_Encoder_Pipeline', Ordinal_Encoder_Pipeline, Ordenal_Encoder_col),
    ],
    remainder='passthrough',
)
preprocessing

## Models Pipeline

In [None]:
#!pip install catboost

In [None]:
#!pip install lightgbm

In [None]:
from sklearn.model_selection import cross_validate, RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
import plotly.express as px


# Candidate classifiers, each configured to compensate for class imbalance through
# its own weighting option. GaussianNB is given no such option here.
# KNN is currently disabled.
models = [
    ('Logistic Regression',
     LogisticRegression(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('Gaussian NB',
     GaussianNB()),
    ('Decision Tree',
     DecisionTreeClassifier(class_weight="balanced", random_state=42)),
    ('Random Forest',
     RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('XGBoost',
     XGBClassifier(scale_pos_weight=XGBoost_scale_pos_weight, random_state=42, n_jobs=-1)),
    # class_weights order must match the class indices (0 = No, 1 = Yes);
    # thread_count=-1 uses all available CPU cores.
    ('CatBoost',
     CatBoostClassifier(allow_writing_files=False, class_weights=[w0, w1], random_state=42, thread_count=-1)),
    ('LightGBM',
     LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1)),
]

In [None]:


# Cross-validate every class-weighted model on the training split, print its
# fold-averaged scores, then print its classification report on the test split.
scoring = ['precision', 'recall', 'f1']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []        # rows for every model; turned into results_df after the loop
model_results = []  # rows for the model currently being evaluated (printed, then cleared)
Test_results = []   # assigned but not used in this cell
for name, model in models:
    model_pipeline = Pipeline(steps=[('Preprocessing', preprocessing), ('Model', model)])

    scores = cross_validate(
        model_pipeline, x_train, y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        error_score='raise',
    )
    print(f"Model: {name}")

    # Fold-averaged score per metric. The 'test_<metric>' keys returned by
    # cross_validate are the validation-fold scores.
    for metric in scoring:
        train_row = {'Model': name, 'Metric': metric, 'Set': 'Train',
                     'Score': scores[f'train_{metric}'].mean()}
        validation_row = {'Model': name, 'Metric': metric, 'Set': 'Validation',
                          'Score': scores[f'test_{metric}'].mean()}
        results += [train_row, validation_row]
        model_results += [train_row, validation_row]

    print(pd.DataFrame(model_results))
    model_results = []
    print("***********************************************************************")

    # Fit on the full training split and report on the test split.
    model_pipeline.fit(x_train, y_train)
    y_pred = model_pipeline.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(report).transpose()
    print(test_report_df)
    print("***********************************************************************")
results_df = pd.DataFrame(results)

In [None]:
# Grouped bars, one facet per metric: Train vs Validation score for each model.
# results_df only holds 'Train' and 'Validation' rows (fold means from
# cross_validate), so the title says Validation rather than Test.
fig = px.bar(
    results_df,
    x='Model',
    y='Score',
    color='Set',
    facet_col='Metric',
    barmode='group',
    title='Model Comparison - Class Weights (Train vs Validation Performance)',
    text='Score'
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    title_x=0.5,
    title_font_size=22,
    legend_title_text='Dataset'
)

fig.show()

## SMOTE

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
# Oversampler inserted between preprocessing and the model in the next cell.
smote = SMOTE(random_state=42)

# Same candidates as before but without class weights: SMOTE handles the
# imbalance here. KNN is currently disabled.
models = [
    ('Logistic Regression',
     LogisticRegression(random_state=42, n_jobs=-1)),
    ('Gaussian NB',
     GaussianNB()),
    ('Decision Tree',
     DecisionTreeClassifier(random_state=42)),
    ('Random Forest',
     RandomForestClassifier(random_state=42, n_jobs=-1)),
    ('XGBoost',
     XGBClassifier(random_state=42, n_jobs=-1)),
    # thread_count=-1 uses all available CPU cores.
    ('CatBoost',
     CatBoostClassifier(allow_writing_files=False, random_state=42, thread_count=-1)),
    ('LightGBM',
     LGBMClassifier(random_state=42, n_jobs=-1)),
]

In [None]:
# Cross-validate every model with SMOTE oversampling in the pipeline, print its
# fold-averaged scores, then print its classification report on the test split.
scoring = ['precision', 'recall', 'f1']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []        # rows for every model; turned into results_df after the loop
model_results = []  # initialised here so the cell does not rely on state left by another cell
for name, model in models:
    # The imblearn pipeline is required here because SMOTE is a sampler step.
    model_pipeline = ImbPipeline(steps=[('Preprocessing', preprocessing), ('SMOTE', smote), ('Model', model)])

    scores = cross_validate(
        model_pipeline, x_train, y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        error_score='raise',
    )
    print(f"Model: {name}")

    # Fold-averaged score per metric. The 'test_<metric>' keys returned by
    # cross_validate are the validation-fold scores.
    for metric in scoring:
        train_row = {'Model': name, 'Metric': metric, 'Set': 'Train',
                     'Score': scores[f'train_{metric}'].mean()}
        validation_row = {'Model': name, 'Metric': metric, 'Set': 'Validation',
                          'Score': scores[f'test_{metric}'].mean()}
        results += [train_row, validation_row]
        model_results += [train_row, validation_row]

    print(pd.DataFrame(model_results))
    model_results = []
    print("***********************************************************************")

    # Fit on the full training split and report on the test split.
    model_pipeline.fit(x_train, y_train)
    y_pred = model_pipeline.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(report).transpose()
    print(test_report_df)
    print("***********************************************************************")
results_df = pd.DataFrame(results)

In [None]:
# Grouped bars, one facet per metric: Train vs Validation score for each model.
# results_df only holds 'Train' and 'Validation' rows (fold means from
# cross_validate), so the title says Validation rather than Test.
fig = px.bar(
    results_df,
    x='Model',
    y='Score',
    color='Set',
    facet_col='Metric',
    barmode='group',
    title='Model Comparison - SMOTE (Train vs Validation Performance)',
    text='Score'
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    title_x=0.5,
    title_font_size=22,
    legend_title_text='Dataset'
)

fig.show()

## Hyperparameter Randomized Search

In [None]:
# (name, estimator, search space) triples for the randomized search.
# Search-space keys carry the 'Model__' prefix because the estimator is the
# pipeline step named 'Model'.
models = [
    ('Logistic Regression',
     # The grid searches both 'l1' and 'l2'. The default lbfgs solver rejects 'l1',
     # so liblinear (which supports both) is selected explicitly. n_jobs is left
     # out because it has no effect with liblinear.
     LogisticRegression(class_weight="balanced", solver='liblinear', random_state=42),
     {"Model__C": [0.01, 0.1, 1, 10, 100], "Model__penalty": ['l1', 'l2']}),

    # Decision Tree is excluded from the search. Its former space was:
    #   {"Model__max_depth": [3, 5, 7, 9, 21, None]}

    ('Random Forest',
     RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1),
     {"Model__n_estimators": [2, 3, 5, 10, 50, 100, 300, 500], "Model__max_depth": [2, 5, 10, 15, 21]}),

    ('XGBoost',
     XGBClassifier(scale_pos_weight=XGBoost_scale_pos_weight, random_state=42, n_jobs=-1),
     # reg_lambda is the L2 penalty. reg_alpha (L1) is not searched; values tried
     # earlier: [0.1, 0.2, 0.5, 2, 3].
     {"Model__n_estimators": [2, 3, 5, 10, 50, 100, 500],
      "Model__max_depth": [3, 5, 10, 15, 21],
      "Model__reg_lambda": [1, 2, 5, 0.1, 0.2, 0.5]}),

    # CatBoost is excluded from the search. Its former space was:
    #   {"Model__depth": [4, 6, 8, 10, 15, 21]}
    # with class_weights=[w0, w1] (order must match class indices: 0 = No, 1 = Yes).

    ('LightGBM',
     LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
     {"Model__n_estimators": [2, 3, 5, 10, 50, 100, 200, 400, 600]}),
]

In [None]:
# For every (name, estimator, search space) triple: run a randomized search that
# optimises recall, cross-validate the best pipeline, then print its
# classification report on the test split.
scoring = ['precision', 'recall', 'f1']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []        # rows for every model; turned into results_df after the loop
model_results = []  # initialised here so the cell does not rely on state left by another cell
for name, model, HyperParameters in models:
    # No resampling step is used in this experiment, so the plain scikit-learn
    # Pipeline (as in the other non-SMOTE cells) is sufficient.
    model_pipeline = Pipeline(steps=[('Preprocessing', preprocessing), ('Model', model)])

    search = RandomizedSearchCV(
        estimator=model_pipeline,
        param_distributions=HyperParameters,
        n_iter=5,
        scoring="recall",
        cv=cv,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    search.fit(x_train, y_train)

    print(f"Model: {name}")

    best_model = search.best_estimator_
    print(f"Best Parameters: {search.best_params_}")

    scores = cross_validate(
        best_model, x_train, y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        error_score='raise',
    )

    # Fold-averaged score per metric. The 'test_<metric>' keys returned by
    # cross_validate are the validation-fold scores.
    for metric in scoring:
        train_row = {'Model': name, 'Metric': metric, 'Set': 'Train',
                     'Score': scores[f'train_{metric}'].mean()}
        validation_row = {'Model': name, 'Metric': metric, 'Set': 'Validation',
                          'Score': scores[f'test_{metric}'].mean()}
        results += [train_row, validation_row]
        model_results += [train_row, validation_row]

    print(pd.DataFrame(model_results))
    model_results = []
    print("***********************************************************************")

    # best_estimator_ was already refit on the full training split by the search
    # (refit defaults to True) and cross_validate only fits clones, so it can be
    # used for the test-split report without fitting it again.
    y_pred = best_model.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(report).transpose()
    print(test_report_df)
    print("***********************************************************************")
results_df = pd.DataFrame(results)

In [None]:
# Grouped bars, one facet per metric: Train vs Validation score for each model.
# results_df only holds 'Train' and 'Validation' rows (fold means from
# cross_validate), so the title says Validation rather than Test.
fig = px.bar(
    results_df,
    x='Model',
    y='Score',
    color='Set',
    facet_col='Metric',
    barmode='group',
    title='Model Comparison - Randomized Search (Train vs Validation Performance)',
    text='Score'
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    title_x=0.5,
    title_font_size=22,
    legend_title_text='Dataset'
)

fig.show()

## Feature Selection

### VarianceThreshold
VarianceThreshold removes features that have little or no variability across samples.
If a feature is almost always the same (e.g., 99% of values = 1), it usually adds little predictive power, so it is a candidate for removal. With `threshold=0`, as used below, only features that take the same value in every row are removed.
This step is unsupervised (does not use y) and helps reduce noise and dataset dimensionality.

In [None]:
from sklearn.feature_selection import VarianceThreshold

# threshold=0 (used here) removes only features that have the same value in all rows.
# A larger value such as 0.01 would also remove features whose variance does not
# exceed 0.01 (an absolute variance, not a percentage).
feature_selector = VarianceThreshold(threshold=0)

# Class-weighted candidates, evaluated with the variance filter in the next cell.
# KNN is currently disabled.
models = [
    ('Logistic Regression',
     LogisticRegression(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('Gaussian NB',
     GaussianNB()),
    ('Decision Tree',
     DecisionTreeClassifier(class_weight="balanced", random_state=42)),
    ('Random Forest',
     RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('XGBoost',
     XGBClassifier(scale_pos_weight=XGBoost_scale_pos_weight, random_state=42, n_jobs=-1)),
    # class_weights order must match the class indices (0 = No, 1 = Yes);
    # thread_count=-1 uses all available CPU cores.
    ('CatBoost',
     CatBoostClassifier(allow_writing_files=False, class_weights=[w0, w1], random_state=42, thread_count=-1)),
    ('LightGBM',
     LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1)),
]

In [None]:


# Cross-validate every model with the variance filter placed after preprocessing,
# print its fold-averaged scores, then print its classification report on the test split.
scoring = ['precision', 'recall', 'f1']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []        # rows for every model; turned into results_df after the loop
model_results = []  # rows for the model currently being evaluated (printed, then cleared)
Test_results = []   # assigned but not used in this cell
for name, model in models:
    model_pipeline = Pipeline(steps=[
        ('Preprocessing', preprocessing),
        ('variance_filter', feature_selector),
        ('Model', model),
    ])

    scores = cross_validate(
        model_pipeline, x_train, y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        error_score='raise',
    )
    print(f"Model: {name}")

    # Fold-averaged score per metric. The 'test_<metric>' keys returned by
    # cross_validate are the validation-fold scores.
    for metric in scoring:
        train_row = {'Model': name, 'Metric': metric, 'Set': 'Train',
                     'Score': scores[f'train_{metric}'].mean()}
        validation_row = {'Model': name, 'Metric': metric, 'Set': 'Validation',
                          'Score': scores[f'test_{metric}'].mean()}
        results += [train_row, validation_row]
        model_results += [train_row, validation_row]

    print(pd.DataFrame(model_results))
    model_results = []
    print("***********************************************************************")

    # Fit on the full training split and report on the test split.
    model_pipeline.fit(x_train, y_train)
    y_pred = model_pipeline.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(report).transpose()
    print(test_report_df)
    print("***********************************************************************")
results_df = pd.DataFrame(results)

In [None]:
# Grouped bars, one facet per metric: Train vs Validation score for each model.
# results_df only holds 'Train' and 'Validation' rows (fold means from
# cross_validate), so the title says Validation rather than Test.
fig = px.bar(
    results_df,
    x='Model',
    y='Score',
    color='Set',
    facet_col='Metric',
    barmode='group',
    title='Model Comparison - VarianceThreshold (Train vs Validation Performance)',
    text='Score'
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    title_x=0.5,
    title_font_size=22,
    legend_title_text='Dataset'
)

fig.show()

### SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest,mutual_info_classif


# Only two candidates are evaluated with SelectKBest. KNN, Gaussian NB,
# Decision Tree, Random Forest, XGBoost and CatBoost are disabled for this experiment.
models = [
    ('Logistic Regression',
     LogisticRegression(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('LightGBM',
     LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1)),
]

In [None]:

from functools import partial

# For every model: search the number of features kept by SelectKBest (optimising
# recall), cross-validate the best pipeline, then print its classification report
# on the test split.
scoring = ['precision', 'recall', 'f1']
HyperParameters = {'kbest__k': [10, 20, 30]}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# mutual_info_classif is randomised; fixing its seed keeps the selected
# features reproducible between runs.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

results = []        # rows for every model; turned into results_df after the loop
model_results = []  # rows for the model currently being evaluated (printed, then cleared)
Test_results = []   # assigned but not used in this cell
for name, model in models:
    model_pipeline = Pipeline(steps=[
        ('Preprocessing', preprocessing),
        ('kbest', SelectKBest(score_func=seeded_mutual_info)),
        ('Model', model),
    ])

    search = RandomizedSearchCV(
        estimator=model_pipeline,
        param_distributions=HyperParameters,
        # The space has only len(k values) candidates; asking for more just triggers a warning.
        n_iter=len(HyperParameters['kbest__k']),
        scoring="recall",
        cv=cv,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    search.fit(x_train, y_train)

    print(f"Model: {name}")

    best_model = search.best_estimator_
    print(f"Best Parameters: {search.best_params_}")

    scores = cross_validate(
        best_model, x_train, y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        error_score='raise',
    )

    # Fold-averaged score per metric. The 'test_<metric>' keys returned by
    # cross_validate are the validation-fold scores.
    for metric in scoring:
        train_row = {'Model': name, 'Metric': metric, 'Set': 'Train',
                     'Score': scores[f'train_{metric}'].mean()}
        validation_row = {'Model': name, 'Metric': metric, 'Set': 'Validation',
                          'Score': scores[f'test_{metric}'].mean()}
        results += [train_row, validation_row]
        model_results += [train_row, validation_row]

    print(pd.DataFrame(model_results))
    model_results = []
    print("***********************************************************************")

    # best_estimator_ was already refit on the full training split by the search
    # (refit defaults to True) and cross_validate only fits clones, so it can be
    # used for the test-split report without fitting it again.
    y_pred = best_model.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(report).transpose()
    print(test_report_df)
    print("***********************************************************************")
results_df = pd.DataFrame(results)

In [None]:
# Grouped bars, one facet per metric: Train vs Validation score for each model.
# results_df only holds 'Train' and 'Validation' rows (fold means from
# cross_validate), so the title says Validation rather than Test.
fig = px.bar(
    results_df,
    x='Model',
    y='Score',
    color='Set',
    facet_col='Metric',
    barmode='group',
    title='Model Comparison - SelectKBest (Train vs Validation Performance)',
    text='Score'
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    title_x=0.5,
    title_font_size=22,
    legend_title_text='Dataset'
)

fig.show()

### Embedded methods
- SelectFromModel
- parameters:
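- threshold='mean' → keep features whose importance is at or above the average importance
- threshold='median' → keep features whose importance is at or above the median (roughly half of the features)
- threshold=0.02 → keep all features with importance of at least 0.02
- max_features=10 → keep at most the 10 most important features among those that pass the threshold; set threshold=-np.inf to select by max_features alone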

In [None]:
from sklearn.feature_selection import SelectFromModel


# Candidates for the embedded (SelectFromModel) experiment: each estimator is used
# both to rank the features and as the final classifier in the next cell.
models = [
    ('Logistic Regression',
     LogisticRegression(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('Decision Tree',
     DecisionTreeClassifier(class_weight="balanced", random_state=42)),
    ('Random Forest',
     RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1)),
    ('XGBoost',
     XGBClassifier(scale_pos_weight=XGBoost_scale_pos_weight, random_state=42, n_jobs=-1)),
    # class_weights order must match the class indices (0 = No, 1 = Yes);
    # thread_count=-1 uses all available CPU cores.
    ('CatBoost',
     CatBoostClassifier(allow_writing_files=False, class_weights=[w0, w1], random_state=42, thread_count=-1)),
    ('LightGBM',
     LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1)),
]




In [None]:


# Cross-validate every model with a SelectFromModel step (same estimator, at most
# 20 features) placed after preprocessing, print its fold-averaged scores, then
# print its classification report on the test split.
scoring = ['precision', 'recall', 'f1']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []        # rows for every model; turned into results_df after the loop
model_results = []  # rows for the model currently being evaluated (printed, then cleared)
Test_results = []   # assigned but not used in this cell
for name, model in models:
    model_pipeline = Pipeline(steps=[
        ('Preprocessing', preprocessing),
        ("FeatureSelector", SelectFromModel(model, max_features=20)),
        ('Model', model),
    ])

    scores = cross_validate(
        model_pipeline, x_train, y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        error_score='raise',
    )
    print(f"Model: {name}")

    # Fold-averaged score per metric. The 'test_<metric>' keys returned by
    # cross_validate are the validation-fold scores.
    for metric in scoring:
        train_row = {'Model': name, 'Metric': metric, 'Set': 'Train',
                     'Score': scores[f'train_{metric}'].mean()}
        validation_row = {'Model': name, 'Metric': metric, 'Set': 'Validation',
                          'Score': scores[f'test_{metric}'].mean()}
        results += [train_row, validation_row]
        model_results += [train_row, validation_row]

    print(pd.DataFrame(model_results))
    model_results = []
    print("***********************************************************************")

    # Fit on the full training split and report on the test split.
    model_pipeline.fit(x_train, y_train)
    y_pred = model_pipeline.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(report).transpose()
    print(test_report_df)
    print("***********************************************************************")
results_df = pd.DataFrame(results)

In [None]:
# Grouped bars, one facet per metric: Train vs Validation score for each model.
# results_df only holds 'Train' and 'Validation' rows (fold means from
# cross_validate), so the title says Validation rather than Test.
fig = px.bar(
    results_df,
    x='Model',
    y='Score',
    color='Set',
    facet_col='Metric',
    barmode='group',
    title='Model Comparison - SelectFromModel (Train vs Validation Performance)',
    text='Score'
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    title_x=0.5,
    title_font_size=22,
    legend_title_text='Dataset'
)

fig.show()

# Best Model
LightGBM
- validation Recall 0.78
- test Recall 0.81017


In [None]:
# Final model: class-weighted LightGBM behind the shared preprocessing step.
name, model = 'LightGBM', LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
models = [(name, model)]
model_pipeline = Pipeline(steps=[('Preprocessing', preprocessing), ('Model', model)])

# Fit on the training split and print the per-class report for the test split.
model_pipeline.fit(x_train, y_train)
y_pred = model_pipeline.predict(x_test)
report = classification_report(y_test, y_pred, output_dict=True)
test_report_df = pd.DataFrame(report).transpose()
print(test_report_df)



In [None]:
import joblib
# The Streamlit app reads 'HeartAttack_cleaned_df.csv' to build its input widgets,
# but nothing else in this notebook writes that file. Export the cleaned frame
# here so the app's inputs come from the same data the model was trained on.
# index=False keeps the row index from becoming an extra column on read.
df.to_csv('HeartAttack_cleaned_df.csv', index=False)

# Persist the fitted pipeline (preprocessing + LightGBM) for the Streamlit app.
joblib.dump(model_pipeline, "LightGPM.pkl")


# Streamlit

In [None]:
#!pip install streamlit

In [None]:
import streamlit as st


In [23]:
%%writefile HeartAttack_Streamlet.py
import streamlit as st
import pandas as pd
import joblib
from imblearn.pipeline import  Pipeline
from sklearn.impute import  SimpleImputer, KNNImputer
from sklearn.preprocessing import  RobustScaler, OneHotEncoder, OrdinalEncoder
from category_encoders import  BinaryEncoder
from imblearn.over_sampling import  SMOTE
from sklearn.neighbors import KNeighborsClassifier

st.set_page_config(layout= 'wide', page_title= 'Heart Attack Prediction Project')
html_title = "<h1 style=color:white;text-align:center;> Heart Attack Prediction Project </h1>"
st.markdown(html_title, unsafe_allow_html=True)

df = pd.read_csv('HeartAttack_cleaned_df.csv')
df.dropna(inplace=True)
st.dataframe(df)

Sex = st.sidebar.radio('Sex', df.Sex.unique())
PhysicalHealthDays= st.sidebar.slider('Physical Health Days', int(df.PhysicalHealthDays.min()), int(df.PhysicalHealthDays.max()), int(df.PhysicalHealthDays.mean()))
MentalHealthDays= st.sidebar.slider('Mental Health Days', int(df.MentalHealthDays.min()), int(df.MentalHealthDays.max()), int(df.MentalHealthDays.mean()))
SleepHours= st.sidebar.slider('Sleep Hours', float(df.SleepHours.min()), float(df.SleepHours.max()), float(df.SleepHours.mean()))
HeightInMeters= st.sidebar.slider('Height In Meters', float(df.HeightInMeters.min()), float(df.HeightInMeters.max()), float(df.HeightInMeters.mean()))
BMI= st.sidebar.slider('BMI', float(df.BMI.min()), float(df.BMI.max()), float(df.BMI.mean()))
GeneralHealth = st.sidebar.selectbox('PLease provid General Health ', df.GeneralHealth.unique())
LastCheckupTime = st.sidebar.selectbox('When was your last checkup?', df.LastCheckupTime.unique())
PhysicalActivities = st.sidebar.radio('Do you do physical activities?', df.PhysicalActivities.unique())
RemovedTeeth= st.sidebar.selectbox('Removed Teeth',df.RemovedTeeth.unique())
HadAngina= st.sidebar.radio('Had Angina',df.HadAngina.unique())
HadStroke= st.sidebar.radio('Had Stroke',df.HadStroke.unique())
HadCOPD= st.sidebar.radio('Had COPD',df.HadCOPD.unique())
HadDepressiveDisorder= st.sidebar.radio('Had Depressive Disorder',df.HadDepressiveDisorder.unique())
HadKidneyDisease= st.sidebar.radio('Had Kidney Disease',df.HadKidneyDisease.unique())
HadArthritis= st.sidebar.radio('Had Arthritis',df.HadArthritis.unique())
HadDiabetes= st.sidebar.selectbox('Had Diabetes',df.HadDiabetes.unique())
DeafOrHardOfHearing= st.sidebar.radio('Deaf Or Hard Of Hearing',df.DeafOrHardOfHearing.unique())
BlindOrVisionDifficulty= st.sidebar.radio('Blind Or Vision Difficulty',df.BlindOrVisionDifficulty.unique())
DifficultyConcentrating= st.sidebar.radio('Difficulty Concentrating',df.DifficultyConcentrating.unique())
DifficultyWalking= st.sidebar.radio('Difficulty Walking',df.DifficultyWalking.unique())
DifficultyDressingBathing= st.sidebar.radio('Difficulty Dressing Bathing',df.DifficultyDressingBathing.unique())
DifficultyErrands= st.sidebar.radio('Difficulty Errands',df.DifficultyErrands.unique())
SmokerStatus= st.sidebar.selectbox('Smoker Status',df.SmokerStatus.unique())
ECigaretteUsage= st.sidebar.selectbox('E-Cigarette Usage',df.ECigaretteUsage.unique())
ChestScan= st.sidebar.radio('Chest Scan',df.ChestScan.unique())
AgeCategory= st.sidebar.selectbox('Age Category',df.AgeCategory.unique())
AlcoholDrinkers= st.sidebar.radio('Alcohol Drinkers',df.AlcoholDrinkers.unique())
HIVTesting= st.sidebar.radio('HIV Testing',df.HIVTesting.unique())
FluVaxLast12= st.sidebar.radio('Flu Vax Last 12 Months',df.FluVaxLast12.unique())
PneumoVaxEver= st.sidebar.radio('Pneumo Vax Ever',df.PneumoVaxEver.unique())
TetanusLast10Tdap= st.sidebar.selectbox('Tetanus Last 10 Tdap',df.TetanusLast10Tdap.unique())
HighRiskLastYear= st.sidebar.radio('High Risk Last Year',df.HighRiskLastYear.unique())
CovidPos= st.sidebar.selectbox('Covid Positive',df.CovidPos.unique())

# Import Model
Model = joblib.load('LightGPM.pkl')

input_cols = df.columns.drop('HadHeartAttack')
input_data = pd.DataFrame(columns=input_cols,data= [ [Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime
,PhysicalActivities,SleepHours,RemovedTeeth,HadAngina,HadStroke,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis
,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing
,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,AgeCategory,HeightInMeters,BMI,AlcoholDrinkers
,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos] ])

if st.button('Predict'):

    result = Model.predict(input_data)[0]

    if result == 0:
        st.write('Heart Attack : NO')

    else:
        st.write('Heart Attack : YES')




Overwriting HeartAttack_Streamlet.py


In [None]:
# Launch the Streamlit app written by the %%writefile cell.
# NOTE(review): `streamlit run` starts a web server, so this cell presumably keeps
# running until it is interrupted - confirm before using Run All.
! streamlit run HeartAttack_Streamlet.py

In [None]:
#!pip install pipreqs

In [26]:
import pipreqs

# Generate requirements.txt for the current directory. Per the tool's own output,
# notebooks are not scanned, so the resulting list should be checked manually.
! pipreqs .

INFO: Not scanning for jupyter notebooks.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
INFO: Successfully saved requirements file in .\requirements.txt
