In [1]:
import fcalc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Pattern structures

## Categorical data

In [2]:
# Load the tic-tac-toe data set. Column names are supplied explicitly through
# `names`: nine board squares followed by the class label.
column_names = [
    'top-left-square', 'top-middle-square', 'top-right-square',
    'middle-left-square', 'middle-middle-square', 'middle-right-square',
    'bottom-left-square', 'bottom-middle-square', 'bottom-right-square',
    'Class',
]
df = pd.read_csv('data_sets/tic-tac-toe.data', names=column_names)
# Binarize the target with a vectorized comparison instead of a Python-level
# loop: True where the label equals 'positive', False otherwise.
df['Class'] = df['Class'] == 'positive'
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,Class
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [3]:
# Separate the feature columns (everything except the trailing 'Class' entry)
# from the target, then hold out 30% of the rows as a test set.
feature_columns = column_names[:-1]
X = df[feature_columns].values
y = df['Class'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [4]:
# Pattern classifier on the categorical data, randomization switched off.
pat_cls = fcalc.classifier.PatternClassifier(
    context=X_train,
    labels=y_train,
    # indices of the categorical feature columns -- here every column of X_train
    categorical=range(X_train.shape[1]),
    method="standard",  # other methods: standard-support, ratio-support
    randomize=False,
)
# Compute support for the test objects
pat_cls.compute_support(test=X_test)
# Predict classes for the test objects; the result is read from
# `pat_cls.predictions` below
pat_cls.predict(test=X_test)
# Report accuracy and macro-averaged F1, rounded to four decimals
y_pred = pat_cls.predictions
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1-score:", round(f1_score(y_test, y_pred, average='macro'), 4))

Accuracy: 0.9931
F1-score: 0.9921


In [18]:
# Pattern classifier on the categorical data, randomization switched on.
# NOTE(review): no random seed is set in this cell, so the metrics printed
# below may differ between runs -- confirm how fcalc draws its subsamples.
pat_cls = fcalc.classifier.PatternClassifier(
    context=X_train,
    labels=y_train,
    # indices of the categorical feature columns -- here every column of X_train
    categorical=range(X_train.shape[1]),
    method="standard",  # other methods: standard-support, ratio-support
    randomize=True,
    num_iters=50,
    # subsample size should not exceed the size of the smallest class in training set
    subsample_size=1,
)
# Compute support for the test objects
pat_cls.compute_support(test=X_test)
# Predict classes for the test objects; the result is read from
# `pat_cls.predictions` below
pat_cls.predict(test=X_test)
# Report accuracy and macro-averaged F1, rounded to four decimals
y_pred = pat_cls.predictions
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1-score:", round(f1_score(y_test, y_pred, average='macro'), 4))

Accuracy: 0.9132
F1-score: 0.6121


## Numerical data

In [23]:
# Load the iris data set; column names are supplied explicitly through `names`
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv('data_sets/iris.data', names=iris_columns)
# Replace the species labels with the integer codes produced by LabelEncoder
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])
# Preview five randomly chosen rows
df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
26,5.0,3.4,1.6,0.4,0
53,5.5,2.3,4.0,1.3,1
128,6.4,2.8,5.6,2.1,2
78,6.0,2.9,4.5,1.5,1
58,6.6,2.9,4.6,1.3,1


In [24]:
# Features are every column except the encoded 'species' target;
# 30% of the rows are held out for testing.
X = df.drop(columns='species').values
y = df['species'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [25]:
# Pattern classifier on the numerical data, randomization switched off.
# No `categorical` argument is passed in this cell
# (presumably every column is then handled as numerical -- confirm against fcalc).
pat_cls = fcalc.classifier.PatternClassifier(
    context=X_train,
    labels=y_train,
    method="standard-support",  # other methods: standard, ratio-support
    randomize=False,
)
# Compute support for the test objects
pat_cls.compute_support(test=X_test)
# Predict classes for the test objects; the result is read from
# `pat_cls.predictions` below
pat_cls.predict(test=X_test)
# Report accuracy and macro-averaged F1, rounded to four decimals
y_pred = pat_cls.predictions
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1-score:", round(f1_score(y_test, y_pred, average='macro'), 4))

Accuracy: 0.9778
F1-score: 0.9786


In [29]:
# Pattern classifier on the numerical data, randomization switched on.
# NOTE(review): no random seed is set in this cell, so the metrics printed
# below may differ between runs -- confirm how fcalc draws its subsamples.
pat_cls = fcalc.classifier.PatternClassifier(
    context=X_train,
    labels=y_train,
    method="standard-support",
    randomize=True,
    num_iters=5,
    subsample_size=4,
)
# Compute support for the test objects
pat_cls.compute_support(test=X_test)
# Predict classes for the test objects; the result is read from
# `pat_cls.predictions` below
pat_cls.predict(test=X_test)
# Report accuracy and macro-averaged F1, rounded to four decimals
y_pred = pat_cls.predictions
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1-score:", round(f1_score(y_test, y_pred, average='macro'), 4))

Accuracy: 0.9556
F1-score: 0.9556


## Mixed data

In [2]:
# Load the heart-failure clinical records; no `names` are passed, so the
# column names come from the CSV file itself.
heart_failure_csv = 'data_sets/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(heart_failure_csv)
# Preview five randomly chosen rows
df.sample(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
111,55.0,0,60,0,35,0,228000.0,1.2,135,1,1,90,0
287,45.0,0,582,1,55,0,543000.0,1.0,132,0,0,250,0
66,42.0,1,250,1,15,0,213000.0,1.3,136,0,0,65,1
169,70.0,0,835,0,35,1,305000.0,0.8,133,0,0,145,0
97,70.0,1,59,0,60,0,255000.0,1.1,136,0,0,85,0


In [3]:
# Features are every column except the 'DEATH_EVENT' target;
# 30% of the rows are held out for testing.
X = df.drop(columns='DEATH_EVENT').values
y = df['DEATH_EVENT'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [6]:
# Positions (within the feature matrix) of the categorical columns:
# anaemia, diabetes, high_blood_pressure, sex, smoking.
# The remaining feature columns are not listed as categorical.
cat_cols = np.array([1, 3, 5, 9, 10])
# Pattern classifier on the mixed data, randomization switched off.
pat_cls = fcalc.classifier.PatternClassifier(
    context=X_train,
    labels=y_train,
    categorical=cat_cols,  # only a subset of the columns is categorical here
    method="standard",  # other methods: standard-support, ratio-support
    randomize=False,
)
# Compute support for the test objects
pat_cls.compute_support(test=X_test)
# Predict classes for the test objects; the result is read from
# `pat_cls.predictions` below
pat_cls.predict(test=X_test)
# Report accuracy and macro-averaged F1, rounded to four decimals
print("Accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("F1-score:", round(f1_score(y_test, pat_cls.predictions, average='macro'), 4))

Accuracy: 0.6778
F1-score: 0.4268


In [14]:
# Positions (within the feature matrix) of the categorical columns:
# anaemia, diabetes, high_blood_pressure, sex, smoking.
# The remaining feature columns are not listed as categorical.
cat_cols = np.array([1, 3, 5, 9, 10])
# Pattern classifier on the mixed data, randomization switched on.
# NOTE(review): no random seed is set in this cell, so the metrics printed
# below may differ between runs -- confirm how fcalc draws its subsamples.
pat_cls = fcalc.classifier.PatternClassifier(
    context=X_train,
    labels=y_train,
    categorical=cat_cols,  # only a subset of the columns is categorical here
    method="standard",  # other methods: standard-support, ratio-support
    randomize=True,
    num_iters=30,
    subsample_size=2,
)
# Compute support for the test objects
pat_cls.compute_support(test=X_test)
# Predict classes for the test objects; the result is read from
# `pat_cls.predictions` below
pat_cls.predict(test=X_test)
# Report accuracy and macro-averaged F1, rounded to four decimals
print("Accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("F1-score:", round(f1_score(y_test, pat_cls.predictions, average='macro'), 4))

Accuracy: 0.7444
F1-score: 0.6854
