<a href="https://colab.research.google.com/github/Amiya-Kalita/Machine-Learning/blob/main/Heart_decess_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler , LabelEncoder , OneHotEncoder
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score , r2_score , confusion_matrix , roc_auc_score

In [3]:
df = pd.read_csv('/content/drive/MyDrive/DataSets/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
df['RestingECG'].value_counts()

Unnamed: 0_level_0,count
RestingECG,Unnamed: 1_level_1
Normal,552
LVH,188
ST,178


In [7]:
df['ChestPainType'].value_counts()

Unnamed: 0_level_0,count
ChestPainType,Unnamed: 1_level_1
ASY,496
NAP,203
ATA,173
TA,46


In [8]:
df['Sex'].value_counts()

Unnamed: 0_level_0,count
Sex,Unnamed: 1_level_1
M,725
F,193


In [9]:
df['ST_Slope'].value_counts()

Unnamed: 0_level_0,count
ST_Slope,Unnamed: 1_level_1
Flat,460
Up,395
Down,63


In [13]:
df.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [15]:
df.dtypes

Unnamed: 0,0
Age,int64
Sex,object
ChestPainType,object
RestingBP,int64
Cholesterol,int64
FastingBS,int64
RestingECG,object
MaxHR,int64
ExerciseAngina,object
Oldpeak,float64


In [16]:
categorical_cols = df.select_dtypes(include=['object']).columns
print("\nCategorical Columns Detected:", list(categorical_cols))


Categorical Columns Detected: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [18]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [19]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [20]:
x = df.drop('HeartDisease', axis=1)
x.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,0,0.0,2
1,49,0,2,160,180,0,1,156,0,1.0,1
2,37,1,1,130,283,0,2,98,0,0.0,2
3,48,0,0,138,214,0,1,108,1,1.5,1
4,54,1,2,150,195,0,1,122,0,0.0,2


In [21]:
y = df['HeartDisease']
y.head()

Unnamed: 0,HeartDisease
0,0
1,1
2,0
3,1
4,0


In [22]:
x_train , x_test , y_train , y_test = train_test_split(x, y , random_state= 42 , test_size= 0.2 )

In [23]:
x_train.shape

(734, 11)

In [24]:
x_test.shape

(184, 11)

In [45]:
x_test[0]

array([ 0.99902384, -1.84978019,  0.25349836,  0.37641517, -0.04331227,
       -0.5427086 ,  0.01734236,  1.69174843, -0.83846064, -0.83628643,
        1.07985506])

In [25]:
y_train.shape

(734,)

In [26]:
y_test.shape

(184,)

In [27]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [29]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='rbf', probability=True),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

In [31]:
results = {}

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"\n {name} Accuracy: {acc*100:.2f}%")
    print(classification_report(y_test, y_pred))

results_df = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
results_df = results_df.sort_values(by="Accuracy", ascending=False)

print("\nModel Comparison Results:\n")
print(results_df)


 Logistic Regression Accuracy: 84.78%
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        77
           1       0.91      0.82      0.86       107

    accuracy                           0.85       184
   macro avg       0.84      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184


 K-Nearest Neighbors Accuracy: 84.78%
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        77
           1       0.91      0.82      0.86       107

    accuracy                           0.85       184
   macro avg       0.84      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184


 Support Vector Machine Accuracy: 86.41%
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        77
           1       0.89      0.87      0.88       107

    accuracy                           0.86       184
   macro

In [32]:
import joblib
joblib.dump(models["Gradient Boosting"], "best_model.pkl")

['best_model.pkl']

In [33]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [35]:
rf = RandomForestClassifier(random_state=42)

In [36]:
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy',
                           verbose=2)

In [38]:
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [39]:
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 200}

In [40]:
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(x_test, y_test)
print("\nTest Set Accuracy with Best Parameters: {:.2f}%".format(test_accuracy * 100))


Test Set Accuracy with Best Parameters: 88.59%


In [41]:
my_model =  RandomForestClassifier(min_samples_split=5 , n_estimators= 200 , random_state= 42)
my_model.fit(x_train , y_train)

In [42]:
predict =  my_model.predict(x_test)
predict

array([0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1])

In [43]:
classification_report(y_test , predict)

'              precision    recall  f1-score   support\n\n           0       0.85      0.88      0.87        77\n           1       0.91      0.89      0.90       107\n\n    accuracy                           0.89       184\n   macro avg       0.88      0.89      0.88       184\nweighted avg       0.89      0.89      0.89       184\n'

In [48]:
data_pred = my_model.predict(np.array([[ 0.99902384, -1.84978019,  0.25349836,  0.37641517, -0.04331227,
       -0.5427086 ,  0.01734236,  1.69174843, -0.83846064, -0.83628643,
        1.07985506]]))

data_pred

array([0])

In [49]:
accuracy_score(y_test , predict)

0.8858695652173914

In [50]:
confusion_matrix(y_test , predict)

array([[68,  9],
       [12, 95]])

In [51]:
r2_score(y_test , predict)

0.5310110450297367

In [52]:
save_model = joblib.dump(my_model,"MY_MODEl.pkl")
print("Model saved sucessfully")

Model saved sucessfully
