In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the heart-disease dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, and the file name
# ("Prdiction", ".cvs") looks misspelled -- confirm it matches the file on disk.
DATA_PATH = r"/Users/sarathyv/Downloads/Heart_Disease_Prdiction.cvs"
df = pd.read_csv(DATA_PATH)


In [2]:

# Separate the label column from the predictor columns.
y = df['Heart Disease']
X = df.drop(columns=['Heart Disease'])

In [3]:

# Univariate feature selection: score every column with f_classif and keep
# the 10 best-scoring ones.
# NOTE(review): the selector is fitted on every row of X/y, i.e. before any
# train/test split happens in this notebook -- confirm this is acceptable, since
# held-out rows influence which features are kept.
selector = SelectKBest(score_func=f_classif, k=10).fit(X, y)
X_selected = selector.transform(X)

# Boolean mask over X's columns marking the retained features.
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

# Rebuild X as a DataFrame (keeps column names) restricted to the selection.
X = df[selected_features]


In [None]:

import pickle


# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling: fit the scaler on the training split only, then apply the
# same transformation to the test split.
# NOTE(review): the grid search in this notebook is fitted on the unscaled
# X_train, so confirm whether these scaled arrays / the saved scaler are needed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for production use. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [5]:
# Hyperparameter search space for the random forest; every combination of the
# values below is one candidate in the grid search.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [6]:
# Exhaustive 5-fold cross-validated search over param_grid, ranked by accuracy
# and run on all available cores (n_jobs=-1).
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
# The search is fitted on the unscaled training frame.
grid_search.fit(X_train, y_train)



In [7]:

# Estimator for the best-scoring parameter combination found by the search.
best_model = grid_search.best_estimator_

# Predict labels for the held-out (unscaled) test rows.
y_pred = best_model.predict(X=X_test)



In [8]:

# Evaluate the held-out predictions: confusion matrix, per-class report and
# overall accuracy. The three metrics are independent of one another.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show which grid point won the cross-validated search.
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [9]:
# Print each evaluation result under its heading, in a fixed order.
for _label, _value in (
    ("Accuracy:", accuracy),
    ("Classification Report:\n", classification_rep),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(_label, _value)

Accuracy: 0.8518518518518519
Classification Report:
               precision    recall  f1-score   support

     Absence       0.89      0.83      0.86        30
    Presence       0.81      0.88      0.84        24

    accuracy                           0.85        54
   macro avg       0.85      0.85      0.85        54
weighted avg       0.86      0.85      0.85        54

Confusion Matrix:
 [[25  5]
 [ 3 21]]


In [10]:

# Single-patient example. Values must follow the column order of
# `selected_features`; in the recorded run that order is:
#   Age, Sex, Chest pain type, EKG results, Max HR, Exercise angina,
#   ST depression, Slope of ST, Number of vessels fluro, Thallium
# The previous row was not in this order (e.g. it put 145 in the "Sex" slot).
# NOTE(review): the categorical codes below are illustrative -- confirm them
# against the dataset's encoding.
sample_input = np.array([[63, 1, 1, 2, 150, 0, 2.3, 3, 0, 6]])

# Attach the feature names so the columns line up with what the model saw in fit;
# a length mismatch with selected_features raises here instead of mis-predicting.
sample_df = pd.DataFrame(sample_input, columns=selected_features)

# best_model was fitted on the unscaled training frame, so it must receive
# unscaled inputs; feeding scaler-transformed values would put every feature on
# a different scale from the one the trees were split on.
prediction = best_model.predict(sample_df)

# The target classes are the strings 'Absence' / 'Presence', not 0/1.
print("Predicted Heart Disease (Absence/Presence):", prediction[0])


Predicted Heart Disease (0=No, 1=Yes): Presence


In [11]:
print("Selected Features:", selected_features.tolist())

Selected Features: ['Age', 'Sex', 'Chest pain type', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium']


In [12]:
# Predict for one hand-entered patient row. The row is wrapped in a DataFrame
# carrying `selected_features` as column names because the model was fitted on a
# named DataFrame: a bare nested list triggers scikit-learn's "X does not have
# valid feature names" warning and gives no protection against a wrong width.
yy = best_model.predict(
    pd.DataFrame([[70, 1, 4, 2, 109, 0, 2.4, 2, 3, 3]], columns=selected_features)
)

In [13]:
# Display the predicted label array for the hand-entered row.
yy

array(['Presence'], dtype=object)

In [14]:
import pickle

In [15]:
# Persist the tuned model. The context manager guarantees the file is flushed
# and closed even if pickling raises, instead of leaving the handle to the
# garbage collector.
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)