In [21]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
path = r'dataset/heart.csv'
data = pd.read_csv(path)

In [3]:
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [4]:
(data == 0).sum()

Age                 0
Sex                 0
ChestPainType       0
RestingBP           1
Cholesterol       172
FastingBS         704
RestingECG          0
MaxHR               0
ExerciseAngina      0
Oldpeak           368
ST_Slope            0
HeartDisease      410
dtype: int64

In [5]:
data['Cholesterol'] = data['Cholesterol'].replace(0, np.nan)

In [6]:
to_KNN = KNNImputer(n_neighbors=5)
data['Cholesterol'] = to_KNN.fit_transform(data[['Cholesterol']])

In [7]:
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264.0,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193.0,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131.0,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236.0,0,LVH,174,N,0.0,Flat,1


In [8]:
(data == 0).sum()

Age                 0
Sex                 0
ChestPainType       0
RestingBP           1
Cholesterol         0
FastingBS         704
RestingECG          0
MaxHR               0
ExerciseAngina      0
Oldpeak           368
ST_Slope            0
HeartDisease      410
dtype: int64

In [9]:
to_OHE = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ExerciseAngina', 'ST_Slope', ]
data_encoded = pd.concat([data.drop(columns=to_OHE), pd.get_dummies(data[to_OHE])], axis=1)

In [10]:
data_encoded

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ExerciseAngina_N.1,ExerciseAngina_Y.1,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289.0,0,172,0.0,0,False,True,False,...,False,True,False,True,False,True,False,False,False,True
1,49,160,180.0,0,156,1.0,1,True,False,False,...,False,True,False,True,False,True,False,False,True,False
2,37,130,283.0,0,98,0.0,0,False,True,False,...,False,False,True,True,False,True,False,False,False,True
3,48,138,214.0,0,108,1.5,1,True,False,True,...,False,True,False,False,True,False,True,False,True,False
4,54,150,195.0,0,122,0.0,0,False,True,False,...,False,True,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264.0,0,132,1.2,1,False,True,False,...,False,True,False,True,False,True,False,False,True,False
914,68,144,193.0,1,141,3.4,1,False,True,True,...,False,True,False,True,False,True,False,False,True,False
915,57,130,131.0,0,115,1.2,1,False,True,True,...,False,True,False,False,True,False,True,False,True,False
916,57,130,236.0,0,174,0.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False


In [11]:
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          918 non-null    int64  
 2   Cholesterol        918 non-null    float64
 3   FastingBS          918 non-null    int64  
 4   MaxHR              918 non-null    int64  
 5   Oldpeak            918 non-null    float64
 6   HeartDisease       918 non-null    int64  
 7   Sex_F              918 non-null    bool   
 8   Sex_M              918 non-null    bool   
 9   ChestPainType_ASY  918 non-null    bool   
 10  ChestPainType_ATA  918 non-null    bool   
 11  ChestPainType_NAP  918 non-null    bool   
 12  ChestPainType_TA   918 non-null    bool   
 13  RestingECG_LVH     918 non-null    bool   
 14  RestingECG_Normal  918 non-null    bool   
 15  RestingECG_ST      918 non-null    bool   
 16  ExerciseAngina_N   918 non

In [12]:
columns_name = data_encoded.columns.to_list()
to_MMS = MinMaxScaler()
data_encoded[columns_name] = to_MMS.fit_transform(data_encoded[columns_name])

In [13]:
data_encoded

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ExerciseAngina_N.1,ExerciseAngina_Y.1,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.244898,0.70,0.393822,0.0,0.788732,0.295455,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.428571,0.80,0.183398,0.0,0.676056,0.409091,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.183673,0.65,0.382239,0.0,0.267606,0.295455,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.408163,0.69,0.249035,0.0,0.338028,0.465909,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.530612,0.75,0.212355,0.0,0.436620,0.295455,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.346939,0.55,0.345560,0.0,0.507042,0.431818,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,0.816327,0.72,0.208494,1.0,0.570423,0.681818,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,0.591837,0.65,0.088803,0.0,0.387324,0.431818,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
916,0.591837,0.65,0.291506,0.0,0.802817,0.295455,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [14]:
file_path = r'dataset/processed_data-heart.csv'
data_encoded.to_csv(file_path, index=False)

print(f"داده‌های پردازش شده با موفقیت ذخیره شدند در: {file_path}")

داده‌های پردازش شده با موفقیت ذخیره شدند در: dataset/processed_data-heart.csv


In [15]:
x = data_encoded.drop('HeartDisease', axis=1)
y = data_encoded['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [16]:
Model_svc = SVC(kernel='linear', C=1.0, gamma=0.1)
Model_svc.fit(x_train, y_train)

In [17]:
y_pred = Model_svc.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.74      0.78        77
         1.0       0.82      0.88      0.85       107

    accuracy                           0.82       184
   macro avg       0.82      0.81      0.81       184
weighted avg       0.82      0.82      0.82       184



In [18]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 2, 10, 100, 1000, 10000],
    'kernel': ['linear']
}

model_svc = SVC()

grid_search = GridSearchCV(estimator=model_svc, param_grid=param_grid, scoring='f1_macro', cv=10, n_jobs=-1)

# انجام جستجو بر روی داده‌های آموزشی
grid_search.fit(x_train, y_train)

# نمایش بهترین مقادیر پارامترها
print("بهترین مقادیر پارامترها:", grid_search.best_params_)

# استفاده از مدل با بهترین پارامترها بر روی داده‌های تست
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

print("دقت مدل با بهترین پارامترها:")
print(accuracy_score(y_test, y_pred))

# نمایش گزارش بازخوانی
print("گزارش بازخوانی:")
print(classification_report(y_test, y_pred))


بهترین مقادیر پارامترها: {'C': 10000, 'gamma': 0.0001, 'kernel': 'linear'}
دقت مدل با بهترین پارامترها:
0.8206521739130435
گزارش بازخوانی:
              precision    recall  f1-score   support

         0.0       0.81      0.74      0.78        77
         1.0       0.82      0.88      0.85       107

    accuracy                           0.82       184
   macro avg       0.82      0.81      0.81       184
weighted avg       0.82      0.82      0.82       184



In [19]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(x_train, y_train)

y_pred = rf_classifier.predict(x_test)

print("دقت مدل:")
print(accuracy_score(y_test, y_pred))
print("گزارش دقت:")
print(classification_report(y_test, y_pred))

دقت مدل:
0.8586956521739131
گزارش دقت:
              precision    recall  f1-score   support

         0.0       0.86      0.79      0.82        77
         1.0       0.86      0.91      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.85      0.85       184
weighted avg       0.86      0.86      0.86       184



In [20]:

rf_classifier = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200,300,500],
    'max_depth': [None, 10, 20, 30,50,80,100],
    'min_samples_split': [2, 5, 10,20,30],
    'min_samples_leaf': [1, 2, 4,6,8,10,20],
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring='f1_macro', cv=10, n_jobs=-1)

grid_search.fit(x_train, y_train)

print("بهترین مقادیر پارامترها:", grid_search.best_params_)

best_rf_classifier = grid_search.best_estimator_
y_pred = best_rf_classifier.predict(x_test)

print("دقت مدل با بهترین پارامترها:")
print(accuracy_score(y_test, y_pred))

print("گزارش دقت مدل با بهترین پارامترها:")
print(classification_report(y_test, y_pred))


بهترین مقادیر پارامترها: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
دقت مدل با بهترین پارامترها:
0.8586956521739131
گزارش دقت مدل با بهترین پارامترها:
              precision    recall  f1-score   support

         0.0       0.86      0.79      0.82        77
         1.0       0.86      0.91      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.85      0.85       184
weighted avg       0.86      0.86      0.86       184

