<a href="https://colab.research.google.com/github/AbhishekNishad02/OIBSIB/blob/main/My_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***1. Import libraries***

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import pickle

***2. Load data***

In [None]:
# Read the dataset from the notebook's working directory and preview it.
DATA_FILE = 'heartfailure.csv'
df = pd.read_csv(DATA_FILE)
df.head()  # last expression of the cell, so the preview is rendered as output

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Print column names, non-null counts and dtypes to check the schema and spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [None]:
# Class balance of the target: absolute number of rows per HeartDisease label.
# (For percentages, call value_counts(normalize=True) and multiply by 100.)
target_counts = df['HeartDisease'].value_counts()
print(target_counts)

HeartDisease
1    508
0    410
Name: count, dtype: int64


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
missing_mask = df.isna()
missing_mask.sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


***3. Define feature types***

In [None]:
# Columns reported with object dtype by df.info(); the preprocessing step
# sends these through the one-hot encoder.
categorical_features = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]


In [None]:
# Columns reported with int64/float64 dtype by df.info() (target excluded);
# the preprocessing step sends these through the scaler.
numeric_features = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'MaxHR',
    'Oldpeak',
    'FastingBS',
]

***4. PREPROCESSING PIPELINES***

In [None]:
# Numeric columns go through a single-step pipeline that standardises them.
numeric_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:

# Categorical columns are one-hot encoded. handle_unknown='ignore' lets the
# encoder accept categories at transform time that were not seen during fit
# instead of raising an error.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

In [None]:
# Route each column group to its transformer: (name, transformer, columns).
# The 'num' and 'cat' names are what later cells use to look the fitted
# transformers up again.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

***5. Model pipeline***

In [None]:
# End-to-end model: preprocessing followed by a random forest.
# class_weight='balanced' and a fixed random_state are set on the classifier.
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
clf = Pipeline(steps=[('preprocess', preprocessor), ('classifier', forest)])

***6. Train/Test split***

In [None]:
# Separate the target column from the predictors.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nðŸ“Œ Train/Test Split Complete\n")


ðŸ“Œ Train/Test Split Complete



***7. Hyperparameter tuning***

In [None]:
# Random-forest hyperparameters to search, keyed by their plain names.
rf_search_space = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Prefix every key with the pipeline step name ('classifier') so the grid
# search can address parameters of that step.
param_grid = {
    f'classifier__{param_name}': candidates
    for param_name, candidates in rf_search_space.items()
}

In [None]:
# Search every combination in param_grid with 5-fold cross-validation,
# scoring candidates by ROC-AUC.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination selected by the grid search.
best_params = grid.best_params_
print("Best params:", best_params)

Best params: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__n_estimators': 100}


***8. Evaluate***

In [None]:
# Predict on the held-out test set: hard labels and the second
# predict_proba column, which is passed to roc_auc_score as the score.
y_pred = grid.predict(X_test)
y_proba = grid.predict_proba(X_test)[:, 1]

# Compute all metrics first, then print them in a fixed order.
report_text = classification_report(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_proba)
conf_mat = confusion_matrix(y_test, y_pred)

print("\nðŸ“Œ Classification Report:")
print(report_text)

print("\nðŸ“Œ ROC-AUC Score:", auc_value)

print("\nðŸ“Œ Confusion Matrix:")
print(conf_mat)


ðŸ“Œ Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        82
           1       0.88      0.92      0.90       102

    accuracy                           0.89       184
   macro avg       0.89      0.88      0.88       184
weighted avg       0.89      0.89      0.89       184


ðŸ“Œ ROC-AUC Score: 0.9336441893830703

ðŸ“Œ Confusion Matrix:
[[69 13]
 [ 8 94]]


***9. Feature importance***

In [None]:
# Pull the fitted pieces out of the best pipeline found by the grid search.
best_pipeline = grid.best_estimator_
best_rf = best_pipeline.named_steps['classifier']
fitted_onehot = (
    best_pipeline.named_steps['preprocess']
    .named_transformers_['cat']
    .named_steps['onehot']
)

# Get final feature names from ColumnTransformer.
# The names are rebuilt by hand in the same order the transformers are listed
# in the preprocessor ('num' first, then 'cat'): the numeric columns keep their
# names, followed by the one-hot encoder's expanded column names.
final_feature_names = numeric_features + list(
    fitted_onehot.get_feature_names_out(categorical_features)
)

importances = best_rf.feature_importances_

# zip() silently stops at the shorter sequence, so a mismatch between the
# hand-built name list and the model's importances would mislabel or drop
# features without any error. Fail loudly instead.
if len(final_feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(final_feature_names)}) does not match "
        f"importance count ({len(importances)}); check the preprocessor's "
        "transformer order and column lists."
    )

# Print features from most to least important.
print("\nðŸ“Œ Feature Importances:")
for name, imp in sorted(zip(final_feature_names, importances), key=lambda x: x[1], reverse=True):
    print(f"{name}: {imp:.4f}")


ðŸ“Œ Feature Importances:
ST_Slope_Up: 0.1960
ST_Slope_Flat: 0.1069
ChestPainType_ASY: 0.0888
Oldpeak: 0.0836
Cholesterol: 0.0830
MaxHR: 0.0775
ExerciseAngina_N: 0.0752
ExerciseAngina_Y: 0.0590
Age: 0.0537
RestingBP: 0.0501
Sex_F: 0.0218
FastingBS: 0.0212
Sex_M: 0.0185
ChestPainType_ATA: 0.0173
ChestPainType_NAP: 0.0111
RestingECG_Normal: 0.0087
RestingECG_LVH: 0.0087
ST_Slope_Down: 0.0085
RestingECG_ST: 0.0061
ChestPainType_TA: 0.0044


***10. Save model***

In [None]:
# Persist the best pipeline found by the grid search (preprocessing + classifier).
# The path is kept in one place so the file written and the message printed
# cannot drift apart.
MODEL_PATH = '/content/heart_failure_model.pkl'

# Use a context manager so the file is flushed and closed even if pickling
# raises; passing a bare open(...) to pickle.dump leaves the handle unclosed.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(grid.best_estimator_, model_file)

# NOTE: pickle files can execute arbitrary code on load; only unpickle this
# model from a trusted location.
print(f"\nðŸŽ‰ Model saved as: {MODEL_PATH}")


ðŸŽ‰ Model saved as: /content/heart_failure_model.pkl
