In [1]:
import os
import time

import fcalc
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Binarized data

In [2]:
# Layout of the raw file: the nine board squares in row-major order,
# followed by the game outcome.
column_names = [
    'top-left-square', 'top-middle-square', 'top-right-square',
    'middle-left-square', 'middle-middle-square', 'middle-right-square',
    'bottom-left-square', 'bottom-middle-square', 'bottom-right-square',
    'Class',
]
df = pd.read_csv('data_sets/tic-tac-toe.data', names=column_names)
# Boolean target: True where the recorded outcome is 'positive'.
df['Class'] = df['Class'] == 'positive'
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,Class
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [3]:
# One-hot encode every square so that each (square, mark) pair becomes a
# boolean attribute; the last entry of column_names is the target.
X = pd.get_dummies(
    df.loc[:, column_names[:-1]],
    prefix=column_names[:-1],
).astype(bool)
y = df.loc[:, 'Class']
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.3,
)


In [4]:
# Build the binarized binary classifier from the training split (plain NumPy
# arrays are handed over); "standard-support" is passed as its `method`.
bin_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="standard-support")

In [5]:
# Classify the held-out rows; the scoring cell reads the result back from
# `bin_cls.predictions` rather than from a return value.
bin_cls.predict(X_test.values)

In [6]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy first, then F1, for the binarized classifier on the held-out split.
for metric in (accuracy_score, f1_score):
    print(metric(y_test, bin_cls.predictions))

0.9965277777777778
0.9974160206718347


# Pattern structures

In [2]:
# Layout of the raw file: the nine board squares in row-major order,
# followed by the game outcome.
column_names = [
    'top-left-square', 'top-middle-square', 'top-right-square',
    'middle-left-square', 'middle-middle-square', 'middle-right-square',
    'bottom-left-square', 'bottom-middle-square', 'bottom-right-square',
    'Class',
]
df = pd.read_csv('data_sets/tic-tac-toe.data', names=column_names)
# Boolean target: True where the recorded outcome is 'positive'.
df['Class'] = df['Class'] == 'positive'
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,Class
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [3]:
# Pattern structures work on the raw square symbols, so no one-hot encoding
# here; the last entry of column_names is the target.
X = df.loc[:, column_names[:-1]]
y = df.loc[:, 'Class']
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.3,
)

In [4]:
# Every square is a categorical attribute, so all column positions
# (0 .. n_columns - 1) are passed as `categorical`.
pat_cls = fcalc.classifier.PatternBinaryClassifier(
    X_train.values,
    y_train.to_numpy(),
    categorical=np.arange(len(X_train.columns)),
)

In [5]:
# Classify the held-out rows; the scoring cell reads the result back from
# `pat_cls.predictions` rather than from a return value.
pat_cls.predict(X_test.values)

(288, 433) (288, 237)


In [6]:
# Ad-hoc look at the classifier's `support` attribute: shape of column 0 of its
# first element.
# NOTE(review): looks like leftover debugging; the metrics below do not use it.
pat_cls.support[0][:,0].shape

(2, 433)

In [7]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy first, then F1, for the pattern-structure classifier on the held-out split.
for metric in (accuracy_score, f1_score):
    print(metric(y_test, pat_cls.predictions))

0.9930555555555556
0.9948453608247423


In [8]:
# The iris file has no header row, so the column names are supplied here.
df = pd.read_csv(
    'data_sets/iris.data',
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'],
)
# One-vs-rest target: True for Iris-setosa, False for every other species.
df['species'] = df['species'] == 'Iris-setosa'
df.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
136,6.3,3.4,5.6,2.4,False
114,5.8,2.8,5.1,2.4,False
116,6.5,3.0,5.5,1.8,False
126,6.2,2.8,4.8,1.8,False
97,6.2,2.9,4.3,1.3,False
74,6.4,2.9,4.3,1.3,False
1,4.9,3.0,1.4,0.2,True
113,5.7,2.5,5.0,2.0,False
135,7.7,3.0,6.1,2.3,False
94,5.6,2.7,4.2,1.3,False


In [9]:
# 'species' is the last column of the frame, so dropping it leaves exactly
# the four measurement columns as features.
X = df.drop(columns='species')
y = df.loc[:, 'species']
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.3,
)

In [10]:
# All four iris features are numeric measurements, so no `categorical`
# column positions are passed for this dataset.
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy())

In [11]:
# Classify the held-out iris rows; the scoring cell reads the result back
# from `pat_cls.predictions`.
pat_cls.predict(X_test.values)

In [12]:
from sklearn.metrics import accuracy_score, f1_score

# Same report layout for both metrics, each rounded to four decimals.
for label, metric in (("accuracy:", accuracy_score), ("f1 score:", f1_score)):
    print(label, round(metric(y_test, pat_cls.predictions), 4))

accuracy: 1.0
f1 score: 1.0


In [13]:
# Heart-failure clinical records; no column names are passed, so the names
# come from the file's own header row.
df = pd.read_csv('data_sets/heart_failure_clinical_records_dataset.csv')
# Preview five random rows (no seed is set, so the rows shown vary between runs).
df.sample(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
119,86.0,0,582,0,38,0,263358.03,1.83,134,0,0,95,1
174,65.0,0,198,1,35,1,281000.0,0.9,137,1,1,146,0
246,55.0,0,2017,0,25,0,314000.0,1.1,138,1,0,214,1
146,52.0,0,132,0,30,0,218000.0,0.7,136,1,1,112,0
283,65.0,0,1688,0,38,0,263358.03,1.1,138,1,1,250,0


In [14]:
# 'DEATH_EVENT' is the last column of the frame, so dropping it leaves
# exactly the clinical features.
X = df.drop(columns='DEATH_EVENT')
y = df.loc[:, 'DEATH_EVENT']
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.3,
)

In [15]:
# Columns treated as categorical. They are named explicitly and their positions
# are looked up in X_train, instead of hard-coding the offsets 1, 3, 5, 9, 10;
# a missing column raises KeyError rather than silently flagging the wrong feature.
categorical_columns = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
pat_cls = fcalc.classifier.PatternBinaryClassifier(
    X_train.values,
    y_train.to_numpy(),
    categorical=np.array([X_train.columns.get_loc(name) for name in categorical_columns]),
)

In [16]:
# Classify the held-out heart-failure rows; the scoring cell reads the result
# back from `pat_cls.predictions`.
pat_cls.predict(X_test.values)

In [17]:
from sklearn.metrics import accuracy_score, f1_score

# F1 is macro-averaged here, i.e. both classes contribute equally to the score.
heart_accuracy = accuracy_score(y_test, pat_cls.predictions)
heart_macro_f1 = f1_score(y_test, pat_cls.predictions, average='macro')
print("accuracy:", round(heart_accuracy, 4))
print("f1 score:", round(heart_macro_f1, 4))

accuracy: 0.6778
f1 score: 0.4268


# Density-based approach

In [2]:
def model_test_CV(X, y, cat_cols=None, method="standard", alpha=0.,
                  kde_bandwidth=1.0, kde_kernel='gaussian', kde_leaf_size=40,
                  kde_classwise=False, scale_density=True, n_splits=5, seed=42):
    """Evaluate ``fcalc.classifier.PatternClassifier`` with shuffled k-fold CV.

    For every fold a classifier is built from the training rows, ``predict`` is
    called on the held-out rows, and accuracy and macro-averaged F1 are computed
    from ``pat_cls.predictions``. Only the ``predict`` call is timed; building
    the classifier is not included.

    Parameters
    ----------
    X : array-like
        Feature matrix. Coerced with ``np.asarray`` so the positional fold
        indices select rows for DataFrames and lists as well as arrays.
    y : array-like
        Labels aligned with the rows of ``X``; coerced the same way.
    cat_cols
        Passed to the classifier as ``categorical``.
    method, alpha, kde_bandwidth, kde_kernel, kde_leaf_size, kde_classwise, scale_density
        Forwarded unchanged to ``PatternClassifier``.
    n_splits : int
        Number of folds.
    seed : int
        ``random_state`` of the shuffled ``KFold`` splitter.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Accuracy"``, ``"F1 score"`` and ``"time (s)"``. One row per
        fold (index ``1..n_splits``, values rounded to 4 decimals) followed by
        a ``"mean"`` row that averages those rounded per-fold values.
    """
    # KFold yields positional indices; on a DataFrame/Series `obj[indices]`
    # would be a label lookup instead, so work on plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    fold_rows = []
    for train_index, test_index in tqdm(kf.split(X), total=kf.get_n_splits(), desc="k-fold"):
        pat_cls = fcalc.classifier.PatternClassifier(
            X[train_index], y[train_index],
            categorical=cat_cols, method=method,
            alpha=alpha, kde_bandwidth=kde_bandwidth,
            kde_kernel=kde_kernel, kde_leaf_size=kde_leaf_size,
            kde_classwise=kde_classwise, scale_density=scale_density,
        )
        # perf_counter is a monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        pat_cls.predict(X[test_index])
        elapsed = time.perf_counter() - start

        fold_rows.append((
            round(accuracy_score(y[test_index], pat_cls.predictions), 4),
            round(f1_score(y[test_index], pat_cls.predictions, average='macro'), 4),
            round(elapsed, 4),
        ))

    # Summary row: column-wise mean of the (already rounded) per-fold values.
    mean_row = tuple(np.mean(column) for column in zip(*fold_rows))
    return pd.DataFrame(fold_rows + [mean_row],
                        columns=["Accuracy", "F1 score", "time (s)"],
                        index=list(range(1, kf.get_n_splits() + 1)) + ["mean"])

In [3]:
# Red-wine quality table; the file is semicolon-separated.
winered_df = pd.read_csv("data_sets/winequality-red.csv",sep=";")

In [9]:
# Despite the names, X_train / y_train hold the complete red-wine table:
# model_test_CV does the train/test splitting itself.
X_train = winered_df.drop("quality",axis=1).values
y_train = winered_df["quality"].values
methods = ["standard", "standard-support", "ratio-support", "density-based"]
# One row of mean cross-validation metrics per decision method.
result = pd.DataFrame(
    [model_test_CV(X_train, y_train, method=m, scale_density=False).loc["mean"].values
     for m in methods],
    index=methods, columns=["Accuracy","F1 score", "time (s)"])
result.loc["density-based classwise"] = model_test_CV(X_train,y_train,
                                                      method="density-based",
                                                      kde_classwise=True,
                                                      scale_density=False).loc["mean"]
display(result)
# The CSV destination defaults to the original location but can be redirected
# with the FCALC_RESULTS_DIR environment variable, so the cell is not tied to
# one machine's drive layout.
results_dir = os.environ.get("FCALC_RESULTS_DIR", "D:/University/masters thesis/csv results")
result.to_csv(os.path.join(results_dir, "red_wine_res.csv"))

k-fold: 100%|██████████| 5/5 [03:28<00:00, 41.72s/it]
k-fold: 100%|██████████| 5/5 [03:28<00:00, 41.61s/it]
k-fold: 100%|██████████| 5/5 [03:27<00:00, 41.49s/it]
k-fold: 100%|██████████| 5/5 [03:26<00:00, 41.38s/it]
k-fold: 100%|██████████| 5/5 [03:32<00:00, 42.45s/it]


Unnamed: 0,Accuracy,F1 score,time (s)
standard,0.37026,0.23404,41.71646
standard-support,0.25202,0.21758,41.60468
ratio-support,0.11254,0.12824,41.48826
density-based,0.3308,0.25058,41.37926
density-based classwise,0.25264,0.22812,42.44556


In [4]:
# Reload iris with all three species kept; the header-less file needs explicit names.
df = pd.read_csv(
    'data_sets/iris.data',
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'],
)
# Replace the species names by integer codes; `le` keeps the fitted mapping.
le = LabelEncoder().fit(df["species"].values)
df["species"] = le.transform(df["species"].values)

In [6]:
# Despite the names, X_train / y_train hold the complete iris table:
# model_test_CV does the train/test splitting itself.
X_train = df.drop('species', axis=1).values
y_train = df['species'].values
methods = ["standard", "standard-support", "ratio-support", "density-based"]
# One row of mean cross-validation metrics per decision method.
result = pd.DataFrame(
    [model_test_CV(X_train, y_train, method=m, scale_density=False).loc["mean"].values
     for m in methods],
    index=methods, columns=["Accuracy","F1 score", "time (s)"])
result.loc["density-based classwise"] = model_test_CV(X_train,y_train,
                                                      method="density-based",
                                                      kde_classwise=True,
                                                      scale_density=False).loc["mean"]
display(result)
# The CSV destination defaults to the original location but can be redirected
# with the FCALC_RESULTS_DIR environment variable, so the cell is not tied to
# one machine's drive layout.
results_dir = os.environ.get("FCALC_RESULTS_DIR", "D:/University/masters thesis/csv results")
# NOTE(review): "irid_res.csv" looks like a typo for "iris_res.csv"; the name is
# kept because something downstream may already read this file - confirm and rename.
result.to_csv(os.path.join(results_dir, "irid_res.csv"))

k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.71it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.67it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.45it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.88it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.18it/s]


Unnamed: 0,Accuracy,F1 score,time (s)
standard,0.91998,0.75474,0.112
standard-support,0.86,0.86172,0.113
ratio-support,0.94666,0.94918,0.1152
density-based,0.87332,0.87416,0.10982
density-based classwise,0.87332,0.86908,0.1188


In [6]:
# Despite the names, X_train / y_train hold the complete red-wine table:
# model_test_CV does the train/test splitting itself.
X_train = winered_df.drop("quality",axis=1).values
y_train = winered_df["quality"].values
kernels = ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
# One row of mean cross-validation metrics per KDE kernel (density-based method).
result = pd.DataFrame(
    [model_test_CV(X_train, y_train, method="density-based", kde_kernel=k).loc["mean"].values
     for k in kernels],
    index=kernels, columns=["Accuracy","F1 score", "time (sec.)"])
display(result)
# The CSV destination defaults to the original location but can be redirected
# with the FCALC_RESULTS_DIR environment variable, so the cell is not tied to
# one machine's drive layout.
results_dir = os.environ.get("FCALC_RESULTS_DIR", "D:/University/masters thesis/csv results")
result.to_csv(os.path.join(results_dir, "red_wine_kernels_res.csv"))

# Class-wise variant of the same kernel comparison, currently disabled.
# result = []
# for k in kernels:
#     res = model_test_CV(X_train,y_train,method="density-based",kde_kernel=k, kde_classwise=True)
#     result.append(res.loc["mean"].values)
# result=pd.DataFrame(result, index=kernels,columns=["Accuracy","F1 score", "time (sec.)"])
# display(result)
# result.to_csv("D:/University/masters thesis/csv results/red_wine_kernels_classwise_res.csv")

k-fold: 100%|██████████| 5/5 [03:28<00:00, 41.78s/it]
k-fold: 100%|██████████| 5/5 [03:34<00:00, 42.91s/it]
k-fold: 100%|██████████| 5/5 [03:33<00:00, 42.75s/it]
k-fold: 100%|██████████| 5/5 [03:27<00:00, 41.40s/it]
k-fold: 100%|██████████| 5/5 [03:26<00:00, 41.32s/it]
k-fold: 100%|██████████| 5/5 [03:26<00:00, 41.20s/it]


Unnamed: 0,Accuracy,F1 score,time (sec.)
gaussian,0.25828,0.2254,41.77786
tophat,0.34204,0.2131,42.90964
epanechnikov,0.34956,0.25184,42.74644
exponential,0.25704,0.21696,41.40084
linear,0.37328,0.2434,41.31282
cosine,0.3558,0.25264,41.19982


In [33]:
# Expects `df` to be the label-encoded iris frame (it needs a 'species' column);
# `df` is reassigned by several cells, so run the iris loading cell first.
# Despite the names, X_train / y_train hold the complete table: model_test_CV
# does the train/test splitting itself.
X_train = df.drop('species', axis=1).values
y_train = df['species'].values
kernels = ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
# The CSV destination defaults to the original location but can be redirected
# with the FCALC_RESULTS_DIR environment variable, so the cell is not tied to
# one machine's drive layout.
results_dir = os.environ.get("FCALC_RESULTS_DIR", "D:/University/masters thesis/csv results")

# Same kernel comparison twice: first with the default (non-classwise) KDE,
# then with kde_classwise=True; each run is displayed and saved to its own file.
for classwise, csv_name in ((False, "iris_kernels_res.csv"),
                            (True, "iris_kernels_classwise_res.csv")):
    result = pd.DataFrame(
        [model_test_CV(X_train, y_train, method="density-based", kde_kernel=k,
                       kde_classwise=classwise).loc["mean"].values
         for k in kernels],
        index=kernels, columns=["Accuracy","F1 score", "time (sec.)"])
    display(result)
    result.to_csv(os.path.join(results_dir, csv_name))

k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.50it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.67it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.73it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  7.95it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.58it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  6.63it/s]


Unnamed: 0,Accuracy,F1 score,time (s)
gaussian,0.88666,0.88028,0.1146
tophat,0.86664,0.86344,0.1128
epanechnikov,0.87332,0.86592,0.1114
exponential,0.88666,0.87864,0.123
linear,0.86664,0.86338,0.1136
cosine,0.33332,0.16506,0.1472


k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.42it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.06it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.96it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.94it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.86it/s]
k-fold: 100%|██████████| 5/5 [00:00<00:00,  8.93it/s]


Unnamed: 0,Accuracy,F1 score,time (s)
gaussian,0.88668,0.87854,0.116
tophat,0.88002,0.87108,0.1208
epanechnikov,0.88666,0.88028,0.109
exponential,0.9,0.89374,0.109
linear,0.89334,0.88856,0.11008
cosine,0.33332,0.16506,0.10918
