This Jupyter Notebook implements a multi-layer perceptron (MLP) classifier using Scikit-learn to classify data linked to A8, D8, and E8 algebras. It imports and organizes invariant coefficient vectors, removes duplicates to create unique datasets, and combines these unique datasets for machine learning. The notebook performs k-fold cross-validation, trains an MLP classifier, and assesses its performance. Three classifiers are trained to differentiate between the invariants specific to one of the three algebras and those associated with the other two, as well as distinguishing them from the fake invariants.

In [1]:
import numpy as np
from itertools import chain
from math import floor
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import LearningCurveDisplay, learning_curve

import matplotlib.pyplot as plt
from functools import reduce
import time
from tqdm import tqdm

In [2]:
# Import data -- in format 
# [[(permutation order of roots in W definition), [list of invariant coefficient vectors], ...all permutations]

# Modified files are used: all 1/2, 3/2, 5/2 are replaced by 0.5, 1.5, 2.5

# Extract invariants' components into a list
def parseString(string):
    '''Return the invariant coefficient vectors contained in one data line.

    The first character and the last two characters of ``string`` are dropped
    and the remainder is split on ", [[". Only the piece after the first
    separator is used; the leading piece is discarded. One more trailing
    character is removed from that piece, it is split on "], [" into vectors,
    and every vector is split on ", " and converted entry by entry to float.

    Returns a list of lists of float (one inner list per vector).
    '''
    vector_text = string[1:-2].split(", [[")[1][:-1]
    vectors = []
    for chunk in vector_text.split("], ["):
        vectors.append([float(token) for token in chunk.split(", ")])
    return vectors

dataSize = 40320

start = time.time()


def _load_invariants(path, n_rows, n_cols=9*256):
    '''Read one invariant data file into a float16 array of shape (n_rows, n_cols).

    Line number i of the file is parsed with parseString and its vectors are
    concatenated into row i of the array. Rows for which the file provides no
    line keep their initial value of zero.
    '''
    table = np.zeros((n_rows, n_cols), dtype=np.float16)
    with open(path, 'r') as file:
        for lnIdx, line in enumerate(file):
            table[lnIdx] = np.concatenate(parseString(line.rstrip()))
    return table


# Read data from files, concatenate each invariant's components.
# Forward slashes are used in the paths: '\A', '\D' and '\E' are not valid
# string escape sequences, and a backslash separator only works on Windows.
data_A8 = _load_invariants('ADE_Data/A8inv_Data_mod.txt', dataSize)
data_D8 = _load_invariants('ADE_Data/D8inv_Data_mod.txt', dataSize)
data_E8 = _load_invariants('ADE_Data/E8inv_Data_mod.txt', dataSize)

end = time.time()
print(end - start)

43.337714433670044


In [3]:
# Timestamp taken before the fake data import, used to report its duration
start = time.time()

# Number of rows taken from each fake dataset when the ML datasets are built
fakeDataSize = 40000


def parseStringFake(string):
    '''Strip the first and last character of ``string`` and split the rest on ", ".

    Returns a list of str; the pieces are not converted to numbers.
    '''
    inner = string[1:-1]
    return inner.split(", ")

# Import fake data (comma-separated numeric text files, one datapoint per row).
# Forward slashes are used in the paths: '\A', '\D' and '\E' are not valid
# string escape sequences, and a backslash separator only works on Windows.
fakeData_A8 = np.loadtxt('ADE_Data/A8inv_Data_Fake.txt', delimiter=',')
fakeData_D8 = np.loadtxt('ADE_Data/D8inv_Data_Fake.txt', delimiter=',')
fakeData_E8 = np.loadtxt('ADE_Data/E8inv_Data_Fake.txt', delimiter=',')

# Report how long the import took (start is set at the top of this cell)
end = time.time()
print(end - start)

print(np.shape(fakeData_A8))

26.029237747192383
(40000, 2304)


In [4]:
# Remove duplicates from data to create "unique" A8/D8/E8 datasets

# Number of distinct rows kept per algebra
dataSize_unique = 128


def _first_unique_rows(rows, max_unique):
    '''Collect the first ``max_unique`` distinct rows of ``rows``, in order of first appearance.

    Returns a float16 array of shape (max_unique, rows.shape[1]). If ``rows``
    holds fewer than ``max_unique`` distinct rows, the remaining output rows
    stay at zero and a message is printed, because those zero rows would
    otherwise be used downstream as if they were genuine datapoints.
    '''
    unique = np.zeros((max_unique, rows.shape[1]), dtype=np.float16)
    n_found = 0
    for row in rows:
        if n_found == max_unique:
            break
        # Compare only against the rows stored so far. Comparing against the
        # whole zero-initialised buffer would make an all-zero input row look
        # like a duplicate of the not-yet-filled slots.
        if not (row == unique[:n_found]).all(1).any():
            unique[n_found] = row
            n_found += 1
    if n_found < max_unique:
        print('Warning: only', n_found, 'unique rows found, expected', max_unique)
    return unique


# A8
data_A8_unique = _first_unique_rows(data_A8, dataSize_unique)

# E8
data_E8_unique = _first_unique_rows(data_E8, dataSize_unique)

# D8
data_D8_unique = _first_unique_rows(data_D8, dataSize_unique)

In [5]:
# Check for duplicates


def _flag_duplicate_rows(rows, n_rows):
    '''Return an int array of length n_rows with 1 at each row that equals an earlier unflagged row.

    Rows are compared pairwise over the first n_rows entries of ``rows``; a row
    already flagged is neither used as a reference nor compared again.
    '''
    flags = np.zeros(n_rows, dtype=int)
    for first in tqdm(range(n_rows)):
        if flags[first] == 1:
            continue
        for later in range(first + 1, n_rows):
            if flags[later] != 1 and (rows[first] == rows[later]).all():
                flags[later] = 1
    return flags


# data_A8_unique
duplicates = _flag_duplicate_rows(data_A8_unique, dataSize_unique)
print(sum(duplicates))

# data_D8_unique
duplicates = _flag_duplicate_rows(data_D8_unique, dataSize_unique)
print(sum(duplicates))

# data_E8_unique
duplicates = _flag_duplicate_rows(data_E8_unique, dataSize_unique)
print(sum(duplicates))

100%|██████████| 128/128 [00:00<00:00, 2033.12it/s]


0


100%|██████████| 128/128 [00:00<00:00, 2730.53it/s]


0


100%|██████████| 128/128 [00:00<00:00, 2012.99it/s]


0


In [6]:
# Pair every invariant vector with its class label: [vector, label]

# Dataset for E8 classification: E8 rows are labelled 1, all other rows 0
preML_data_A8, preML_data_D8, preML_data_E8 = [], [], []
for i in range(dataSize_unique):
    preML_data_A8.append([data_A8_unique[i], 0])
    preML_data_D8.append([data_D8_unique[i], 0])
    preML_data_E8.append([data_E8_unique[i], 1])

# All fake rows belong to the negative class
preML_data_FakeA8, preML_data_FakeD8, preML_data_FakeE8 = [], [], []
for i in range(fakeDataSize):
    preML_data_FakeA8.append([fakeData_A8[i], 0])
    preML_data_FakeD8.append([fakeData_D8[i], 0])
    preML_data_FakeE8.append([fakeData_E8[i], 0])

# One combined list holding the fake datapoints of all three algebras
preML_data_Fake = [*preML_data_FakeA8, *preML_data_FakeD8, *preML_data_FakeE8]

In [7]:
def TrainTestSplit(k, *data, seed=None):
    '''Build k cross-validation train/test splits from several labelled datasets.

    Each dataset is shuffled and cut into k validation slices of
    floor(len(dataset)/k) datapoints. For fold i, slice i of every dataset goes
    to the test set and the rest of that dataset (including datapoints left
    over after k full slices) goes to the train set. Splitting every dataset
    separately ensures that small datasets are represented in both the train
    and the test part of each fold. The combined train and test sets of each
    fold are shuffled again before being returned.

    The datasets passed in are copied before shuffling, so the caller's
    sequences keep their order.

    Input: k - number of k-fold cross-validations to perform (integer >= 1),
           arbitrary number of sequences containing pairs (invariant components, label),
           seed - optional keyword; an integer makes the shuffling reproducible,
                  None (default) uses NumPy's global random state.
    Output: input dataset for ML training, corresponding output dataset,
            input dataset for ML test, corresponding output dataset.
            Each of the four is a list with one entry per fold.
            Train and test datasets have sizes as (k-1) to 1.
    Raises: ValueError if k is not an integer >= 1.
    '''
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
        raise ValueError('k must be an integer >= 1, got %r' % (k,))

    # np.random and RandomState both provide shuffle(); pick one of them
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Shuffle data ordering on private copies of the datasets
    shuffled = []
    for dataset in data:
        rows = list(dataset)
        rng.shuffle(rows)
        shuffled.append(rows)

    # Define data lists, each with k sublists with the relevant data for training and cross-validation
    Train_inputs, Train_outputs, Test_inputs, Test_outputs = [], [], [], []

    for i in range(k):
        fold_train, fold_test = [], []

        # Split each dataset into train and test parts
        for rows in shuffled:
            s = len(rows) // k  #...number of datapoints in validation split
            fold_train.extend(rows[:i*s])
            fold_train.extend(rows[(i+1)*s:])
            fold_test.extend(rows[i*s:(i+1)*s])

        # Shuffle data ordering so the datasets are mixed within the fold
        rng.shuffle(fold_train)
        rng.shuffle(fold_test)

        # Separate the (invariant components, label) pairs into inputs and outputs
        Train_inputs.append([datapoint[0] for datapoint in fold_train])
        Train_outputs.append([datapoint[1] for datapoint in fold_train])
        Test_inputs.append([datapoint[0] for datapoint in fold_test])
        Test_outputs.append([datapoint[1] for datapoint in fold_test])

    return Train_inputs, Train_outputs, Test_inputs, Test_outputs

In [8]:
# Number of k-fold cross-validations to perform (k = 4 => 75(train) : 25(test) splits approx.)
k = 4
Train_inputs, Train_outputs, Test_inputs, Test_outputs = TrainTestSplit(
    k,
    preML_data_A8,
    preML_data_D8,
    preML_data_E8,
    preML_data_Fake,
)

In [11]:
# Run NN train & test
# Define measure lists
F1s, ACCs, PRECs, RECs = [], [], [], []    #...per-fold F1, accuracy, precision and recall
seed = 1                                   #...select a random seeding (any integer) for classifier initialisation

# Loop through each cross-validation run
for i in range(k):
    # Define & train the NN classifier directly on this fold's training data
    nn_clf = MLPClassifier((256,), activation='relu', solver='adam', alpha=0.001, n_iter_no_change=5, random_state=seed)  #...can edit the NN structure here
    nn_clf.fit(Train_inputs[i], Train_outputs[i])

    # Compute NN predictions on this fold's test data, and calculate learning measures
    Test_pred = nn_clf.predict(Test_inputs[i])
    F1s.append(f1_score(Test_outputs[i], Test_pred))
    ACCs.append(accuracy_score(Test_outputs[i], Test_pred))
    PRECs.append(precision_score(Test_outputs[i], Test_pred))
    RECs.append(recall_score(Test_outputs[i], Test_pred))

    # Uncomment the next line to plot this fold's training loss curve
    #plt.plot(nn_clf.loss_curve_)
    plt.show()

# Averaged output learning measures: mean over the k folds, followed by the
# standard deviation over folds (np.std default, i.e. population form) divided by sqrt(k).
# r'\pm' is a raw string because '\p' is not a valid escape sequence.
print('####################################')
print('Measures for E8 classification:')
print('Accuracy: ',sum(ACCs)/k,r'\pm',np.std(ACCs)/np.sqrt(k))
print('Recall: ',sum(RECs)/k,r'\pm',np.std(RECs)/np.sqrt(k))
print('Precision: ',sum(PRECs)/k,r'\pm',np.std(PRECs)/np.sqrt(k))
print('F1: ',sum(F1s)/k,r'\pm',np.std(F1s)/np.sqrt(k))

####################################
Measures for E8 classification:
Accuracy:  0.9996428096757044 \pm 3.596929009604529e-05
Recall:  0.671875 \pm 0.0390625
Precision:  0.9903846153846154 \pm 0.008327167344081135
F1:  0.7973341526301838 \pm 0.025618069821102398


In [12]:
# Free the E8 classifier, its fold data and its per-fold scores before the next classifier
del nn_clf, Train_inputs, Train_outputs, Test_inputs, Test_outputs
del Test_pred, F1s, ACCs, RECs, PRECs

In [13]:
# Pair every invariant vector with its class label: [vector, label]

# Dataset for D8 classification: D8 rows are labelled 1, all other rows 0
preML_data_A8, preML_data_D8, preML_data_E8 = [], [], []
for i in range(dataSize_unique):
    preML_data_A8.append([data_A8_unique[i], 0])
    preML_data_D8.append([data_D8_unique[i], 1])
    preML_data_E8.append([data_E8_unique[i], 0])

# All fake rows belong to the negative class
preML_data_FakeA8, preML_data_FakeD8, preML_data_FakeE8 = [], [], []
for i in range(fakeDataSize):
    preML_data_FakeA8.append([fakeData_A8[i], 0])
    preML_data_FakeD8.append([fakeData_D8[i], 0])
    preML_data_FakeE8.append([fakeData_E8[i], 0])

# One combined list holding the fake datapoints of all three algebras
preML_data_Fake = [*preML_data_FakeA8, *preML_data_FakeD8, *preML_data_FakeE8]

In [14]:
# Number of k-fold cross-validations to perform (k = 5 => 80(train) : 20(test) splits approx.)
k = 5
Train_inputs, Train_outputs, Test_inputs, Test_outputs = TrainTestSplit(
    k,
    preML_data_A8,
    preML_data_D8,
    preML_data_E8,
    preML_data_Fake,
)

In [15]:
# Run NN train & test
# Define measure lists
F1s, ACCs, PRECs, RECs = [], [], [], []    #...per-fold F1, accuracy, precision and recall
seed = 1                          #...select a random seeding (any integer) for classifier initialisation

# Loop through each cross-validation run
for i in range(k):
    # Define & train the NN classifier directly on this fold's training data
    nn_clf = MLPClassifier((256,64), activation='relu', solver='adam', alpha=0.001, n_iter_no_change=5, random_state=seed)  #...can edit the NN structure here
    nn_clf.fit(Train_inputs[i], Train_outputs[i])

    # Compute NN predictions on this fold's test data, and calculate learning measures
    Test_pred = nn_clf.predict(Test_inputs[i])
    F1s.append(f1_score(Test_outputs[i], Test_pred))
    ACCs.append(accuracy_score(Test_outputs[i], Test_pred))
    PRECs.append(precision_score(Test_outputs[i], Test_pred))
    RECs.append(recall_score(Test_outputs[i], Test_pred))

# Averaged output learning measures: mean over the k folds, followed by the
# standard deviation over folds (np.std default, i.e. population form) divided by sqrt(k).
# r'\pm' is a raw string because '\p' is not a valid escape sequence.
print('####################################')
print('Measures for D8 classification:')
print('Accuracy: ',sum(ACCs)/k,r'\pm',np.std(ACCs)/np.sqrt(k))
print('Recall: ',sum(RECs)/k,r'\pm',np.std(RECs)/np.sqrt(k))
print('Precision: ',sum(PRECs)/k,r'\pm',np.std(PRECs)/np.sqrt(k))
print('F1: ',sum(F1s)/k,r'\pm',np.std(F1s)/np.sqrt(k))

####################################
Measures for D8 classification:
Accuracy:  0.9996593977154726 \pm 5.0395018527446654e-05
Recall:  0.712 \pm 0.028621670111997306
Precision:  0.95 \pm 0.03464101615137754
F1:  0.8124283103296148 \pm 0.027401256341682823


In [16]:
# Free the D8 classifier, its fold data and its per-fold scores before the next classifier
del nn_clf, Train_inputs, Train_outputs, Test_inputs, Test_outputs
del Test_pred, F1s, ACCs, RECs, PRECs

In [17]:
# Pair every invariant vector with its class label: [vector, label]

# Dataset for A8 classification: A8 rows are labelled 1, all other rows 0
preML_data_A8, preML_data_D8, preML_data_E8 = [], [], []
for i in range(dataSize_unique):
    preML_data_A8.append([data_A8_unique[i], 1])
    preML_data_D8.append([data_D8_unique[i], 0])
    preML_data_E8.append([data_E8_unique[i], 0])

# All fake rows belong to the negative class
preML_data_FakeA8, preML_data_FakeD8, preML_data_FakeE8 = [], [], []
for i in range(fakeDataSize):
    preML_data_FakeA8.append([fakeData_A8[i], 0])
    preML_data_FakeD8.append([fakeData_D8[i], 0])
    preML_data_FakeE8.append([fakeData_E8[i], 0])

# One combined list holding the fake datapoints of all three algebras
preML_data_Fake = [*preML_data_FakeA8, *preML_data_FakeD8, *preML_data_FakeE8]

In [18]:
# Number of k-fold cross-validations to perform (k = 5 => 80(train) : 20(test) splits approx.)
k = 5
Train_inputs, Train_outputs, Test_inputs, Test_outputs = TrainTestSplit(
    k,
    preML_data_A8,
    preML_data_D8,
    preML_data_E8,
    preML_data_Fake,
)

In [19]:
# Run NN train & test
# Define measure lists
F1s, ACCs, PRECs, RECs = [], [], [], []    #...per-fold F1, accuracy, precision and recall
seed = 1                          #...select a random seeding (any integer) for classifier initialisation

# Loop through each cross-validation run
for i in range(k):
    # Define & train the NN classifier directly on this fold's training data
    nn_clf = MLPClassifier((256,64), activation='relu', solver='adam', alpha=0.001, n_iter_no_change=5, random_state=seed)  #...can edit the NN structure here
    nn_clf.fit(Train_inputs[i], Train_outputs[i])

    # Compute NN predictions on this fold's test data, and calculate learning measures
    Test_pred = nn_clf.predict(Test_inputs[i])
    F1s.append(f1_score(Test_outputs[i], Test_pred))
    ACCs.append(accuracy_score(Test_outputs[i], Test_pred))
    PRECs.append(precision_score(Test_outputs[i], Test_pred))
    RECs.append(recall_score(Test_outputs[i], Test_pred))

# Averaged output learning measures: mean over the k folds, followed by the
# standard deviation over folds (np.std default, i.e. population form) divided by sqrt(k).
# r'\pm' is a raw string because '\p' is not a valid escape sequence.
print('####################################')
print('Measures for A8 classification:')
print('Accuracy: ',sum(ACCs)/k,r'\pm',np.std(ACCs)/np.sqrt(k))
print('Recall: ',sum(RECs)/k,r'\pm',np.std(RECs)/np.sqrt(k))
print('Precision: ',sum(PRECs)/k,r'\pm',np.std(PRECs)/np.sqrt(k))
print('F1: ',sum(F1s)/k,r'\pm',np.std(F1s)/np.sqrt(k))

####################################
Measures for A8 classification:
Accuracy:  0.9994434060228452 \pm 4.170268045030241e-05
Recall:  0.52 \pm 0.046647615158762396
Precision:  0.910448717948718 \pm 0.026773638992573796
F1:  0.6542861989075083 \pm 0.033215009131912686


In [20]:
# Free the A8 classifier, its fold data and its per-fold scores
del nn_clf, Train_inputs, Train_outputs, Test_inputs, Test_outputs
del Test_pred, F1s, ACCs, RECs, PRECs