# Be careful!
Running whole notebook can be very time consuming. For me it was 30-40 minutes

In [218]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
import random
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.svm import SVC

Dataset description: https://www.kaggle.com/fedesoriano/heart-failure-prediction

In [145]:
df = pd.read_csv('heart.csv')

In [146]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [148]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### Binarizing binary and categorical features

Sex, ExerciseAngina - are already binary features, but we should replace their values with 0,1.
For features ChestPainType, RestingECG and ST_Slope I applied one-hot encoding with pd.get_dummies

In [149]:
def preprocess(df):
    #Changing females to 0, males to 1
    df['Sex'] = df['Sex'].apply(lambda x: 0 if x == 'F' else 1)
    #Changing 'no' to 0, 'yes' to 1
    df['ExerciseAngina'] = df['ExerciseAngina'].apply(lambda x: 0 if x == 'N' else 1)
    
    # binarizing ChestPainType and deleting previous column
    one_hot = pd.get_dummies(df['ChestPainType'])
    df = df.join(one_hot)
    df.drop('ChestPainType', axis=1, inplace=True)
    # binarizing RestingECG and deleting previous column
    one_hot = pd.get_dummies(df['RestingECG'])
    df = df.join(one_hot)
    df.drop('RestingECG', axis=1, inplace=True)
    # binarizing ST_Slope and deleting previous column
    one_hot = pd.get_dummies(df['ST_Slope'])
    df = df.join(one_hot)
    df.drop('ST_Slope', axis=1, inplace=True)
    return df

In [150]:
df = preprocess(df)

In [151]:
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,ATA,NAP,TA,LVH,Normal,ST,Down,Flat,Up
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,0,0,1,0,1,0,0,1,0
914,68,1,144,193,1,141,0,3.4,1,1,0,0,0,0,1,0,0,1,0
915,57,1,130,131,0,115,1,1.2,1,1,0,0,0,0,1,0,0,1,0
916,57,0,130,236,0,174,0,0.0,1,0,1,0,0,1,0,0,0,1,0


In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int64  
 2   RestingBP       918 non-null    int64  
 3   Cholesterol     918 non-null    int64  
 4   FastingBS       918 non-null    int64  
 5   MaxHR           918 non-null    int64  
 6   ExerciseAngina  918 non-null    int64  
 7   Oldpeak         918 non-null    float64
 8   HeartDisease    918 non-null    int64  
 9   ASY             918 non-null    uint8  
 10  ATA             918 non-null    uint8  
 11  NAP             918 non-null    uint8  
 12  TA              918 non-null    uint8  
 13  LVH             918 non-null    uint8  
 14  Normal          918 non-null    uint8  
 15  ST              918 non-null    uint8  
 16  Down            918 non-null    uint8  
 17  Flat            918 non-null    uin

## Binarizing quantitative features

In this section I binarized Age, RestingBP, Cholesterol, MaxHR and Oldpeak, by firstly categorizing them based on their histogram and then using one-hot encoding

In [153]:
fig = px.histogram(df['Age'], x="Age", nbins=30)
fig.show()

In [154]:
def categorize_age(x):
    if x < 38:
        return 0
    elif x < 48:
        return 1
    elif x < 54:
        return 2
    elif x < 50:
        return 3
    elif x < 66:
        return 4
    else:
        return 5

In [155]:
df['new_age'] = df['Age'].apply(lambda x: categorize_age(x))

In [156]:
df['new_age'].value_counts()

4    416
1    190
2    181
5     82
0     49
Name: new_age, dtype: int64

In [157]:
one_hot = pd.get_dummies(df['new_age'])
one_hot = one_hot.rename(columns={0: 'age_0', 1: 'age_1', 2: 'age_2', 3: 'age_3', 4: 'age_4', 5: 'age_5'})
df = df.join(one_hot)
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,...,ST,Down,Flat,Up,new_age,age_0,age_1,age_2,age_4,age_5
0,40,1,140,289,0,172,0,0.0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,49,0,160,180,0,156,0,1.0,1,0,...,0,0,1,0,2,0,0,1,0,0
2,37,1,130,283,0,98,0,0.0,0,0,...,1,0,0,1,0,1,0,0,0,0
3,48,0,138,214,0,108,1,1.5,1,1,...,0,0,1,0,2,0,0,1,0,0
4,54,1,150,195,0,122,0,0.0,0,0,...,0,0,0,1,4,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,...,0,0,1,0,1,0,1,0,0,0
914,68,1,144,193,1,141,0,3.4,1,1,...,0,0,1,0,5,0,0,0,0,1
915,57,1,130,131,0,115,1,1.2,1,1,...,0,0,1,0,4,0,0,0,1,0
916,57,0,130,236,0,174,0,0.0,1,0,...,0,0,1,0,4,0,0,0,1,0


In [158]:
fig = px.histogram(df['RestingBP'], x="RestingBP", nbins=30)
fig.show()

In [159]:
def categorize_restingbp(x):
    if x < 120:
        return 0
    elif x < 140:
        return 1
    else:
        return 2

In [160]:
df['new_restingBP'] = df['RestingBP'].apply(lambda x: categorize_restingbp(x))

In [161]:
df['new_restingBP'].value_counts()

1    430
2    327
0    161
Name: new_restingBP, dtype: int64

In [162]:
one_hot = pd.get_dummies(df['new_restingBP'])
one_hot = one_hot.rename(columns={0: 'restingBP_0', 1: 'restingBP_1', 2: 'restingBP_2'})
df = df.join(one_hot)
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,...,new_age,age_0,age_1,age_2,age_4,age_5,new_restingBP,restingBP_0,restingBP_1,restingBP_2
0,40,1,140,289,0,172,0,0.0,0,0,...,1,0,1,0,0,0,2,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,...,2,0,0,1,0,0,2,0,0,1
2,37,1,130,283,0,98,0,0.0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,48,0,138,214,0,108,1,1.5,1,1,...,2,0,0,1,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,...,4,0,0,0,1,0,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,...,1,0,1,0,0,0,0,1,0,0
914,68,1,144,193,1,141,0,3.4,1,1,...,5,0,0,0,0,1,2,0,0,1
915,57,1,130,131,0,115,1,1.2,1,1,...,4,0,0,0,1,0,1,0,1,0
916,57,0,130,236,0,174,0,0.0,1,0,...,4,0,0,0,1,0,1,0,1,0


In [163]:
fig = px.histogram(df['Cholesterol'], x="Cholesterol", nbins=30)
fig.show()

In [164]:
def categorize_cholesterol(x):
    if x < 100:
        return 0
    elif x < 200:
        return 1
    elif x < 250:
        return 2
    else:
        return 3

In [165]:
df['new_cholesterol'] = df['Cholesterol'].apply(lambda x: categorize_cholesterol(x))

In [166]:
df['new_cholesterol'].value_counts()

3    310
2    290
0    173
1    145
Name: new_cholesterol, dtype: int64

In [167]:
one_hot = pd.get_dummies(df['new_cholesterol'])
one_hot = one_hot.rename(columns={0: 'cholesterol_0', 1: 'cholesterol_1', 2: 'cholesterol_2', 3: 'cholesterol_3'})
df = df.join(one_hot)
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,...,age_5,new_restingBP,restingBP_0,restingBP_1,restingBP_2,new_cholesterol,cholesterol_0,cholesterol_1,cholesterol_2,cholesterol_3
0,40,1,140,289,0,172,0,0.0,0,0,...,0,2,0,0,1,3,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,...,0,2,0,0,1,1,0,1,0,0
2,37,1,130,283,0,98,0,0.0,0,0,...,0,1,0,1,0,3,0,0,0,1
3,48,0,138,214,0,108,1,1.5,1,1,...,0,1,0,1,0,2,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,...,0,2,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,...,0,0,1,0,0,3,0,0,0,1
914,68,1,144,193,1,141,0,3.4,1,1,...,1,2,0,0,1,1,0,1,0,0
915,57,1,130,131,0,115,1,1.2,1,1,...,0,1,0,1,0,1,0,1,0,0
916,57,0,130,236,0,174,0,0.0,1,0,...,0,1,0,1,0,2,0,0,1,0


In [168]:
fig = px.histogram(df['MaxHR'], x="MaxHR", nbins=30)
fig.show()

In [169]:
def categorize_maxhr(x):
    if x < 95:
        return 0
    elif x < 110:
        return 1
    elif x < 120:
        return 2
    elif x < 140:
        return 3
    elif x < 150:
        return 4
    elif x < 160:
        return 5
    elif x < 170:
        return 6
    else:
        return 7

In [170]:
df['new_maxhr'] = df['MaxHR'].apply(lambda x: categorize_maxhr(x))

In [171]:
df['new_maxhr'].value_counts()

3    246
4    126
5    120
7    110
2    100
6     91
1     82
0     43
Name: new_maxhr, dtype: int64

In [172]:
one_hot = pd.get_dummies(df['new_maxhr'])
one_hot = one_hot.rename(columns={0: 'maxhr_0', 1: 'maxhr_1', 2: 'maxhr_2', 3: 'maxhr_3', 4: 'maxhr_4', 5: 'maxhr_5', 6: 'maxhr_6', 7: 'maxhr_7'})
df = df.join(one_hot)
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,...,cholesterol_3,new_maxhr,maxhr_0,maxhr_1,maxhr_2,maxhr_3,maxhr_4,maxhr_5,maxhr_6,maxhr_7
0,40,1,140,289,0,172,0,0.0,0,0,...,1,7,0,0,0,0,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,...,0,5,0,0,0,0,0,1,0,0
2,37,1,130,283,0,98,0,0.0,0,0,...,1,1,0,1,0,0,0,0,0,0
3,48,0,138,214,0,108,1,1.5,1,1,...,0,1,0,1,0,0,0,0,0,0
4,54,1,150,195,0,122,0,0.0,0,0,...,0,3,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,...,1,3,0,0,0,1,0,0,0,0
914,68,1,144,193,1,141,0,3.4,1,1,...,0,4,0,0,0,0,1,0,0,0
915,57,1,130,131,0,115,1,1.2,1,1,...,0,2,0,0,1,0,0,0,0,0
916,57,0,130,236,0,174,0,0.0,1,0,...,0,7,0,0,0,0,0,0,0,1


In [173]:
fig = px.histogram(df['Oldpeak'], x="Oldpeak", nbins=30)
fig.show()

In [174]:
def categorize_oldpeak(x):
    if x < 0.8:
        return 0
    elif x < 1.8:
        return 1
    else:
        return 2

In [175]:
df['new_oldpeak'] = df['Oldpeak'].apply(lambda x: categorize_oldpeak(x))

In [176]:
df['new_oldpeak'].value_counts()

0    479
1    239
2    200
Name: new_oldpeak, dtype: int64

In [177]:
one_hot = pd.get_dummies(df['new_oldpeak'])
one_hot = one_hot.rename(columns={0: 'oldpeak_0', 1: 'oldpeak_1', 2: 'oldpeak_2'})
df = df.join(one_hot)
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,...,maxhr_2,maxhr_3,maxhr_4,maxhr_5,maxhr_6,maxhr_7,new_oldpeak,oldpeak_0,oldpeak_1,oldpeak_2
0,40,1,140,289,0,172,0,0.0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,49,0,160,180,0,156,0,1.0,1,0,...,0,0,0,1,0,0,1,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,48,0,138,214,0,108,1,1.5,1,1,...,0,0,0,0,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,...,0,1,0,0,0,0,1,0,1,0
914,68,1,144,193,1,141,0,3.4,1,1,...,0,0,1,0,0,0,2,0,0,1
915,57,1,130,131,0,115,1,1.2,1,1,...,1,0,0,0,0,0,1,0,1,0
916,57,0,130,236,0,174,0,0.0,1,0,...,0,0,0,0,0,1,0,1,0,0


### Deleting residual quantitative features

In [178]:
df_binary = df.drop(columns=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'new_age', 'new_restingBP', 'new_cholesterol', 'new_maxhr', 'new_oldpeak'])

In [179]:
df_binary

Unnamed: 0,Sex,FastingBS,ExerciseAngina,HeartDisease,ASY,ATA,NAP,TA,LVH,Normal,...,maxhr_1,maxhr_2,maxhr_3,maxhr_4,maxhr_5,maxhr_6,maxhr_7,oldpeak_0,oldpeak_1,oldpeak_2
0,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
1,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
2,1,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,0,1,1,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,1,0,0,1,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0
914,1,1,0,1,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
915,1,0,1,1,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
916,0,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0


In [180]:
y_binary = df_binary['HeartDisease']
x_binary = df_binary.drop('HeartDisease', axis=1)

### Renaming features with alphabet for debug convenience 

In [182]:
x_binary.rename(columns={'Sex': 'A_Sex', 'FastingBS': 'B_FastingBS', 'ExerciseAngina': 'C_ExerciseAngina', 'ASY': 'D_ASY'}, inplace=True)

In [183]:
x_binary.rename(columns={'ATA': 'E_ATA', 'NAP': 'F_NAP', 'TA': 'G_TA', 'LVH': 'H_LVH', 'Normal': 'I_Normal', 'ST': 'J_ST'}, inplace=True)

In [184]:
x_binary.rename(columns={'Down': 'K_Down', 'Flat': 'L_Flat', 'Up': 'M_Up'}, inplace=True)

### Spliting on train, test

In [185]:
X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(x_binary, y_binary, test_size=0.3)

### Prime function for features

In [207]:
def feature_prime(df, features):
    # For every feature removing objects from dataset which is 0 on this feature
    new_df = df[df[features[0]] == 1]
    for i in range(1, len(features)):
        new_df = new_df[new_df[features[i]] == 1]
    return new_df

### Prime function for objects

In [190]:
def object_prime(df, obj_indeces):
    new_df = df.copy()
    # For every feature: remove it, if it has 0 on given objects
    for feature in list(new_df.columns):
        stop = False
        for j in obj_indeces:
            if new_df.loc[j][feature] != 1:
                stop = True
                break
        if stop:
            new_df.drop(feature, axis=1, inplace=True)
    return new_df

In [191]:
def features_double_prime(df, features):
    return object_prime(df, list(feature_prime(df, features).index))

In [192]:
def columns_to_sorted_list(df):
    return sorted(list(df.columns))

In [193]:
def subtract_lists(a,b):
    return [item for item in a if item not in b]

In [194]:
def CbO(df):
    # column names
    columns = list(df.columns)
    # I keep only concept intents as lists
    all_concepts = []
    feature_num = 0
    # This list is for not using features that already has their main branches.
    # For example not adding feature 'a' to 'b' branch, when 'a' branch is fulle traversed
    used_features = []
    
    # loop for main branches
    for feature in columns:
        used_features.append(feature)
#         print('feature', feature)
        # In order to build tree depth first I keep stack of concept that will be processed next
        vertices_stack = []
        
        concept = features_double_prime(df, [feature])
        concept_list = columns_to_sorted_list(concept)
        # complement - sorted list of features that will be added next
        # Here I substract all features that are already in concept and features that already had their main branch
        complement = subtract_lists(list(df.columns), concept_list)
        complement = subtract_lists(complement, used_features)
        # Closed is features after square bracket
        closed = subtract_lists(concept_list, [feature])

        if closed:
            if closed[0] > feature:
                complement = subtract_lists(complement, closed)
            else:
                continue

        # stack elements consist of created concept and it's complement
        vertices_stack.append((concept_list, complement))
        all_concepts.append([concept_list])
        
        while vertices_stack:
            # Fixed list of features from parent concept
            fixed = vertices_stack[-1][0]
            # complement of parent concept
            complement = vertices_stack[-1][1]
            #If there are left features to add, then we create to_be_closed = first element of complement and also delete it
            if complement:
                to_be_closed = complement.pop(0)
            else:
            # if there is nothing in complment, then branch is processed and we delete grandparent concept by deleting parent concept
                vertices_stack.pop()
                continue
                
            new_fixed = fixed + [to_be_closed]
            concept = features_double_prime(df, new_fixed)
            #closed = features in concept except features that were in new_fixed
            closed = subtract_lists(columns_to_sorted_list(concept), new_fixed)
            # change parent concept's complement because we used that feature in this subbranch
            vertices_stack[-1] = (fixed, complement)
            
            if closed:
                # If canonical
                if closed[0] > to_be_closed:
                    concept2Add = sorted(new_fixed + closed)
                    new_complement = subtract_lists(complement, closed)
        
                    vertices_stack.append((concept2Add, new_complement))
                    all_concepts[feature_num].append(concept2Add)
                #If not canonical then we do nothing
            else:
                concept2Add = sorted(new_fixed)
                new_complement = subtract_lists(complement, closed)
#                 print('concept2Add', concept2Add, 'new_complement', new_complement)
        
                vertices_stack.append((concept2Add, new_complement))
                all_concepts[feature_num].append(concept2Add)
        feature_num += 1
    return all_concepts

### Splitting dataset

In [195]:
X_binary_train_pos = X_binary_train[y_binary_train==1]
X_binary_train_neg = X_binary_train[y_binary_train==0]

In [197]:
concepts_pos = CbO(X_binary_train_pos)

In [198]:
concepts_pos

[[['A_Sex'],
  ['A_Sex', 'B_FastingBS'],
  ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina'],
  ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina', 'D_ASY'],
  ['A_Sex',
   'B_FastingBS',
   'C_ExerciseAngina',
   'D_ASY',
   'E_ATA',
   'F_NAP',
   'G_TA',
   'H_LVH',
   'I_Normal',
   'J_ST',
   'K_Down',
   'L_Flat',
   'M_Up',
   'age_0',
   'age_1',
   'age_2',
   'age_4',
   'age_5',
   'cholesterol_0',
   'cholesterol_1',
   'cholesterol_2',
   'cholesterol_3',
   'maxhr_0',
   'maxhr_1',
   'maxhr_2',
   'maxhr_3',
   'maxhr_4',
   'maxhr_5',
   'maxhr_6',
   'maxhr_7',
   'oldpeak_0',
   'oldpeak_1',
   'oldpeak_2',
   'restingBP_0',
   'restingBP_1',
   'restingBP_2'],
  ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina', 'D_ASY', 'H_LVH'],
  ['A_Sex',
   'B_FastingBS',
   'C_ExerciseAngina',
   'D_ASY',
   'H_LVH',
   'K_Down',
   'age_4',
   'cholesterol_3',
   'maxhr_5',
   'oldpeak_2',
   'restingBP_2'],
  ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina', 'D_ASY', 'H_LVH', 'L_Flat'],
  ['A_Sex',

In [199]:
concepts_neg = CbO(X_binary_train_neg)

In [200]:
concepts_neg

[[['A_Sex'],
  ['A_Sex', 'B_FastingBS'],
  ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina', 'I_Normal'],
  ['A_Sex',
   'B_FastingBS',
   'C_ExerciseAngina',
   'D_ASY',
   'I_Normal',
   'cholesterol_3',
   'restingBP_2'],
  ['A_Sex',
   'B_FastingBS',
   'C_ExerciseAngina',
   'D_ASY',
   'E_ATA',
   'F_NAP',
   'G_TA',
   'H_LVH',
   'I_Normal',
   'J_ST',
   'K_Down',
   'L_Flat',
   'M_Up',
   'age_0',
   'age_1',
   'age_2',
   'age_4',
   'age_5',
   'cholesterol_0',
   'cholesterol_1',
   'cholesterol_2',
   'cholesterol_3',
   'maxhr_0',
   'maxhr_1',
   'maxhr_2',
   'maxhr_3',
   'maxhr_4',
   'maxhr_5',
   'maxhr_6',
   'maxhr_7',
   'oldpeak_0',
   'oldpeak_1',
   'oldpeak_2',
   'restingBP_0',
   'restingBP_1',
   'restingBP_2'],
  ['A_Sex',
   'B_FastingBS',
   'C_ExerciseAngina',
   'D_ASY',
   'I_Normal',
   'K_Down',
   'age_5',
   'cholesterol_3',
   'maxhr_2',
   'oldpeak_2',
   'restingBP_2'],
  ['A_Sex',
   'B_FastingBS',
   'C_ExerciseAngina',
   'D_ASY',
   'I_Norma

In [201]:
def concepts_to_list(concepts):
    concept_list = []
    for i in range(len(concepts)):
        for j in range(len(concepts[i])):
            concept_list.append(concepts[i][j])
    return concept_list

In [202]:
pos_concept_list = concepts_to_list(concepts_pos)

In [203]:
neg_concept_list = concepts_to_list(concepts_neg)

In [204]:
def get_hypotheses(df, concept_list):
    hypotheses = []
    for i in concept_list:
        hypo_falsified = 0
        for j in df.index:
            array = df.loc[j][i]
            #if sum of elements equals to size of array, then hypothesis is falsified
            if array.sum() == array.shape[0]:
                hypo_falsified = 1
                break
        if not hypo_falsified:
            hypotheses.append(i)
    return hypotheses

In [205]:
pos_hypotheses = get_hypotheses(X_binary_train_neg, pos_concept_list)

In [206]:
pos_hypotheses

[['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'D_ASY',
  'E_ATA',
  'F_NAP',
  'G_TA',
  'H_LVH',
  'I_Normal',
  'J_ST',
  'K_Down',
  'L_Flat',
  'M_Up',
  'age_0',
  'age_1',
  'age_2',
  'age_4',
  'age_5',
  'cholesterol_0',
  'cholesterol_1',
  'cholesterol_2',
  'cholesterol_3',
  'maxhr_0',
  'maxhr_1',
  'maxhr_2',
  'maxhr_3',
  'maxhr_4',
  'maxhr_5',
  'maxhr_6',
  'maxhr_7',
  'oldpeak_0',
  'oldpeak_1',
  'oldpeak_2',
  'restingBP_0',
  'restingBP_1',
  'restingBP_2'],
 ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina', 'D_ASY', 'H_LVH'],
 ['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'D_ASY',
  'H_LVH',
  'K_Down',
  'age_4',
  'cholesterol_3',
  'maxhr_5',
  'oldpeak_2',
  'restingBP_2'],
 ['A_Sex', 'B_FastingBS', 'C_ExerciseAngina', 'D_ASY', 'H_LVH', 'L_Flat'],
 ['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'D_ASY',
  'H_LVH',
  'L_Flat',
  'age_2',
  'cholesterol_1',
  'maxhr_3',
  'oldpeak_1',
  'restingBP_1'],
 ['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngin

In [208]:
neg_hypotheses = get_hypotheses(X_binary_train_pos, neg_concept_list)

In [209]:
neg_hypotheses

[['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'D_ASY',
  'E_ATA',
  'F_NAP',
  'G_TA',
  'H_LVH',
  'I_Normal',
  'J_ST',
  'K_Down',
  'L_Flat',
  'M_Up',
  'age_0',
  'age_1',
  'age_2',
  'age_4',
  'age_5',
  'cholesterol_0',
  'cholesterol_1',
  'cholesterol_2',
  'cholesterol_3',
  'maxhr_0',
  'maxhr_1',
  'maxhr_2',
  'maxhr_3',
  'maxhr_4',
  'maxhr_5',
  'maxhr_6',
  'maxhr_7',
  'oldpeak_0',
  'oldpeak_1',
  'oldpeak_2',
  'restingBP_0',
  'restingBP_1',
  'restingBP_2'],
 ['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'D_ASY',
  'I_Normal',
  'K_Down',
  'age_5',
  'cholesterol_3',
  'maxhr_2',
  'oldpeak_2',
  'restingBP_2'],
 ['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'D_ASY',
  'I_Normal',
  'M_Up',
  'age_1',
  'cholesterol_3',
  'maxhr_3',
  'oldpeak_0',
  'restingBP_2'],
 ['A_Sex',
  'B_FastingBS',
  'C_ExerciseAngina',
  'F_NAP',
  'I_Normal',
  'L_Flat',
  'age_4',
  'cholesterol_2',
  'maxhr_3',
  'oldpeak_1',
  'restingBP_2'],
 ['A_Sex',
  'B_Fas

In [210]:
def make_predictions(x_test, pos_hypotheses, neg_hypotheses):
    y_pred = []
    # Number of objects, which were unknown or contradiction
    rand_count = 0
    # indices of objects, which were predicted (NOT unknown or contradiction)
    pred_indices = []
    for i in x_test.index:
        pos_count = 0
        neg_count = 0
        for j in pos_hypotheses:
            array = x_test.loc[i][j]
            if array.sum() == array.shape[0]:
                pos_count += 1
        for j in neg_hypotheses:
            array = x_test.loc[i][j]
            if array.sum() == array.shape[0]:
                neg_count += 1
        if not pos_count and not neg_count:
            y_pred.append(random.randint(0, 1))
            rand_count += 1
        elif pos_count == neg_count:
            y_pred.append(random.randint(0, 1))
            rand_count += 1
        elif pos_count > neg_count:
            y_pred.append(1)
            pred_indices.append(i)
        elif pos_count < neg_count:
            y_pred.append(0)
            pred_indices.append(i)
    print(rand_count)
    return y_pred, pred_indices

Only 5 objects were random

In [211]:
y_pred_jsm, pred_indices = make_predictions(X_binary_test, pos_hypotheses, neg_hypotheses)

5


y_only_pred_jsm is a list which contains only predicted objects (not unknown or contradiction). I used this because first my classifier couldn't beat naive bayes in metrics. But then I as shown above binarized quantitative and everything improved. It is because number of features increased by more than 10, so there were more hypotheses. And also quantitative features were useful. For example age, it is obvious that older people have high risk of heart failure.

Disadvantage of binarizing quantitative fatures is that time for computation increased greatly

In [212]:
y_only_pred_jsm = []
indices = list(y_binary_test.index)
for i in range(len(y_pred_jsm)):
    if indices[i] in pred_indices:
        y_only_pred_jsm.append(y_pred_jsm[i])

## Metrics for with all objects

In [213]:
print(classification_report(y_binary_test, y_pred_jsm))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       118
           1       0.88      0.85      0.87       158

    accuracy                           0.85       276
   macro avg       0.84      0.85      0.85       276
weighted avg       0.85      0.85      0.85       276



In [214]:
accuracy_score(y_binary_test, y_pred_jsm)

0.8478260869565217

## Metrics for without contradiction and unknown objects

In [215]:
print(classification_report(y_binary_test.loc[pred_indices], y_only_pred_jsm))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       116
           1       0.89      0.86      0.88       155

    accuracy                           0.86       271
   macro avg       0.86      0.86      0.86       271
weighted avg       0.86      0.86      0.86       271



In [222]:
accuracy_score(y_binary_test.loc[pred_indices], y_only_pred_jsm)

0.8597785977859779

## Metrics of naive bayes

In [223]:
gnb = GaussianNB()
y_pred_B = gnb.fit(X_binary_train, y_binary_train).predict(X_binary_test)

In [224]:
accuracy_score(y_binary_test, y_pred_B)

0.8115942028985508

In [225]:
print(classification_report(y_binary_test, y_pred_B))

              precision    recall  f1-score   support

           0       0.74      0.86      0.80       118
           1       0.88      0.78      0.83       158

    accuracy                           0.81       276
   macro avg       0.81      0.82      0.81       276
weighted avg       0.82      0.81      0.81       276



## Metrics for SVM

In [226]:
clf = SVC(gamma='auto')
y_pred_svm = clf.fit(X_binary_train, y_binary_train).predict(X_binary_test)

In [227]:
accuracy_score(y_binary_test, y_pred_svm)

0.8623188405797102

In [228]:
print(classification_report(y_binary_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.81      0.88      0.85       118
           1       0.91      0.85      0.88       158

    accuracy                           0.86       276
   macro avg       0.86      0.86      0.86       276
weighted avg       0.87      0.86      0.86       276

