Below we import necessary libraries to use for training

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
#for visualization
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

Here, we load the dataset and show the first 4 lines 

In [4]:
disease= pd.read_csv('heart.csv')

disease.head(4)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1


We check the columns to see if there are any missing values

In [5]:
disease.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [6]:
print(disease.dtypes)

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object


In [7]:
disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Below we import the encoder to assign strings to numbers to easily be read 

In [8]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()

In [9]:
disease['Sex']=labelencoder.fit_transform(disease['Sex'])
disease['ChestPainType']=labelencoder.fit_transform(disease['ChestPainType'])
disease['RestingECG']=labelencoder.fit_transform(disease['RestingECG'])
disease['ExerciseAngina']=labelencoder.fit_transform(disease['ExerciseAngina'])
disease['ST_Slope']=labelencoder.fit_transform(disease['ST_Slope'])

In [10]:
X = disease[['Age','Sex', 'ChestPainType','RestingBP','Cholesterol','FastingBS','RestingECG','MaxHR','ExerciseAngina','Oldpeak','ST_Slope']] #predictor
y = disease['HeartDisease']

In [11]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
disease.head(3)
# disease.info()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0


In [13]:
disease.dtypes

Age                 int64
Sex                 int64
ChestPainType       int64
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG          int64
MaxHR               int64
ExerciseAngina      int64
Oldpeak           float64
ST_Slope            int64
HeartDisease        int64
dtype: object

In [14]:
X_test

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
668,63,0,1,140,195,0,1,179,0,0.0,2
30,53,1,2,145,518,0,1,130,0,0.0,1
377,65,1,0,160,0,1,2,122,0,1.2,1
535,56,1,0,130,0,0,0,122,1,1.0,1
807,54,1,1,108,309,0,1,156,0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...
211,50,0,2,140,288,0,1,140,1,0.0,1
745,63,0,0,108,269,0,1,169,1,1.8,1
584,64,1,0,141,244,1,2,116,1,1.5,1
878,49,1,1,130,266,0,1,171,0,0.6,2


In [15]:
model=LogisticRegression()
model.fit(X_train, y_train)

In [23]:
y_pred=model.predict(X_test)

In [22]:
# modeell={'logistic_regression' : {
#         'model': LogisticRegression(solver='liblinear',multi_class='auto'),
#         'params': {
#             'C': [1,5,10]
#         }
# }
# }

In [19]:
from sklearn.model_selection import GridSearchCV

In [31]:


clf = GridSearchCV(LogisticRegression(), {
    'C': [1,5,10],
    # 'kernel': ['rbf','linear']
}, cv=5, return_train_score=False)
clf.fit(X, y)
clf.cv_results_

{'mean_fit_time': array([0.04068809, 0.04187927, 0.0390521 ]),
 'std_fit_time': array([0.00672528, 0.00844456, 0.02304109]),
 'mean_score_time': array([0.00324636, 0.        , 0.01255646]),
 'std_score_time': array([0.00182953, 0.        , 0.00599435]),
 'param_C': masked_array(data=[1, 5, 10],
              mask=[False, False, False],
        fill_value=999999),
 'params': [{'C': 1}, {'C': 5}, {'C': 10}],
 'split0_test_score': array([0.81521739, 0.81521739, 0.81521739]),
 'split1_test_score': array([0.80978261, 0.81521739, 0.81521739]),
 'split2_test_score': array([0.83152174, 0.80978261, 0.82065217]),
 'split3_test_score': array([0.81967213, 0.81967213, 0.81967213]),
 'split4_test_score': array([0.77595628, 0.76502732, 0.76502732]),
 'mean_test_score': array([0.81043003, 0.80498337, 0.80715728]),
 'std_test_score': array([0.01866576, 0.0202225 , 0.021183  ]),
 'rank_test_score': array([1, 3, 2], dtype=int32)}

In [29]:
df=pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.039858,0.011147,0.005444,0.003617,1,{'C': 1},0.815217,0.809783,0.831522,0.819672,0.775956,0.81043,0.018666,1
1,0.032889,0.010835,0.011306,0.007774,5,{'C': 5},0.815217,0.815217,0.809783,0.819672,0.765027,0.804983,0.020223,3
2,0.026263,0.008933,0.009006,0.006668,10,{'C': 10},0.815217,0.815217,0.820652,0.819672,0.765027,0.807157,0.021183,2


In [30]:
accuracy=accuracy_score(y_test,y_pred)
classification_rep= classification_report(y_test, y_pred)
print(accuracy * 100)
print(classification_rep)

84.23913043478261
              precision    recall  f1-score   support

           0       0.78      0.87      0.82        77
           1       0.90      0.82      0.86       107

    accuracy                           0.84       184
   macro avg       0.84      0.85      0.84       184
weighted avg       0.85      0.84      0.84       184

