In [1]:
import torch
import torch.nn as nn
import pandas as pd
import sklearn
import numpy as np
import torch.utils
from torch.utils.data import Dataset, DataLoader
import torch.utils.data
from tqdm.notebook import trange, tqdm
import pickle
import sys
import os
import math
from copy import deepcopy
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier as KNN


ModuleNotFoundError: No module named 'pandas'

In [165]:
class train_dataset(Dataset):
    def __init__(self, csv_file_feature, csv_meta, training_target):
        self.base_frame = pd.read_csv(csv_file_feature)

        self.meta_frame = pd.read_csv(csv_meta, dtype={'sample_name': 'int32',
                                                       'apoe4': 'string',
                                                       'bmi': 'float32',
                                                       'diagnosis': 'string',
                                                       'amlyoid_positive': 'float32',
                                                       'visit': 'int32'})
        self.target_frame = self.meta_frame[training_target]
        self.base_frame.drop(columns=['sample_name'])

    def __len__(self):
        return len(self.target_frame)

    def __getitem__(self, index):
        label = 0
        if self.target_frame[index] == 'positive':
            label = 1 # [1, 0]
        else:
            label = 0 # [0, 1]

        feature = self.base_frame.iloc[index]
        feature_tensor = torch.tensor(feature.values).float()

        label_tensor = torch.tensor(label).float()

        # return feature_tensor, label_tensor
        return feature, label


In [217]:
def binarize(base_matrix):
	return np.where(base_matrix < 0, -1, 1)

# def encoding_rp(X_data, base_matrix, signed=False):
#     enc_hvs = []
#     # print(len(X_data))
#     for i in range(len(X_data)):
#         if i % int(len(X_data)/20) == 0:
#             sys.stdout.write(str(int(i/len(X_data)*100)) + '% ')
#             sys.stdout.flush()
#         hv = np.matmul(base_matrix, X_data[i])
#         if signed:
#             hv = binarize(hv)
#         enc_hvs.append(hv)
#     return enc_hvs

def encoding_rp(X_data, base_matrix, signed=False):
    enc_hvs = []
    step = max(1, int(len(X_data) / 20))
    for i in range(len(X_data)):
        if i % step == 0:
            sys.stdout.write(str(int(i / len(X_data) * 100)) + '% ')
            sys.stdout.flush()
        # print(f"\nDebug: base_matrix shape: {base_matrix.shape}, X_data[{i}] shape: {X_data[i].shape}")
        hv = np.matmul(base_matrix, X_data[i])
        if signed:
            hv = binarize(hv)
        enc_hvs.append(hv)
    return enc_hvs


def encoding_idlv(X_data, lvl_hvs, id_hvs, D, bin_len, x_min, L=64):
    enc_hvs = []
    step = max(1, int(len(X_data) / 20))
    for i in range(len(X_data)):
        if i == int(len(X_data)/1):
            break
            
        if i % step == 0:
            sys.stdout.write(str(int(i/len(X_data)*100)) + '% ')
            sys.stdout.flush()
        sum_ = np.array([0] * D)
        for j in range(len(X_data[i])):
            # bin_ = min( np.round((X_data[i][j] - x_min)/bin_len), L-1)
            bin_ = min( np.floor((X_data[i][j] - x_min)/bin_len), L-1)
            bin_ = int(bin_)
            sum_ += lvl_hvs[bin_]*id_hvs[j]
        enc_hvs.append(sum_)
    return enc_hvs

def encoding_perm(X_data, lvl_hvs, D, bin_len, x_min, L=64):
    enc_hvs = []
    step = max(1, int(len(X_data) / 20))
    for i in range(len(X_data)):
        if i % step == 0:
            sys.stdout.write(str(int(i/len(X_data)*100)) + '% ')
            sys.stdout.flush()
        sum_ = np.array([0] * D)
        for j in range(len(X_data[i])):
            # bin_ = min( np.round((X_data[i][j] - x_min)/bin_len), L-1)
            bin_ = min( np.floor((X_data[i][j] - x_min)/bin_len), L-1)
            bin_ = int(bin_)
            sum_ += np.roll(lvl_hvs[bin_], j)
        enc_hvs.append(sum_)
    return enc_hvs

def max_match(class_hvs, enc_hv, class_norms):
		max_score = -np.inf
		max_index = -1
		for i in range(len(class_hvs)):
			score = np.matmul(class_hvs[i], enc_hv) / class_norms[i]
			#score = np.matmul(class_hvs[i], enc_hv)
			if score > max_score:
				max_score = score
				max_index = i
		return max_index

In [226]:
def train_model(training_data, testing_data, device, D=500, alg='idlv', epoch=20, lr=1.0, L=64):
    training_features = [data[0] for data in training_data]
    training_labels = [data[1]for data in training_data]
    # print("labels: ", training_labels)    
    # training_labels = training_labels.tolist()
    # print("labels: ", training_labels)  
    
    testing_features = [data[0] for data in testing_data]
    testing_labels = [data[1] for data in testing_data]
    len_train_f = len(training_features)
    cnt_vld = int(0.2 * len_train_f)
    
    validation_features = training_features[0:cnt_vld]
    validation_labels = training_labels[0:cnt_vld]
    training_features = training_features[cnt_vld:len_train_f-1]
    training_labels = training_labels[cnt_vld:len_train_f-1]
    # i = 0
    # for eacel in training_features:
    #     print("feature ", i, " : ", eacel)
    #     i+=1
    if alg in ['rp', 'rp-sign']:
        #create base matrix
        base_matrix = np.random.rand(D, len(training_features[0]))
        base_matrix = np.where(base_matrix > 0.5, 1, -1)
        base_matrix = np.array(base_matrix, np.int8)
        print('\nEncoding ' + str(len_train_f) + ' train data features')
        train_enc_hvs = encoding_rp(training_features, base_matrix, signed=(alg == 'rp-sign'))
        print('\n\nEncoding ' + str(len(validation_features)) + ' features validation data')
        validation_enc_hvs = encoding_rp(validation_features, base_matrix, signed=(alg == 'rp-sign'))
    elif alg in ['idlv', 'perm']:
        #create level matrix
        lvl_hvs = []
        temp = [-1]*int(D/2) + [1]*int(D/2)
        np.random.shuffle(temp)
        lvl_hvs.append(temp)
        change_list = np.arange(0, D)
        np.random.shuffle(change_list)
        cnt_toChange = int(D/2 / (L-1))
        for i in range(1, L):
            temp = np.array(lvl_hvs[i-1])
            temp[change_list[(i-1)*cnt_toChange : i*cnt_toChange]] = -temp[change_list[(i-1)*cnt_toChange : i*cnt_toChange]]
            lvl_hvs.append(list(temp))
        lvl_hvs = np.array(lvl_hvs, dtype=np.int8)
        x_min = min( np.min(training_features), np.min(validation_features) )
        x_max = max( np.max(training_features), np.max(validation_features) )
        bin_len = (x_max - x_min)/float(L)
        
        #need to create id hypervectors if encoding is level-id
        if alg == 'idlv':
            cnt_id = len(training_features[0])
            id_hvs = []
            for i in range(cnt_id):
                temp = [-1]*int(D/2) + [1]*int(D/2)
                np.random.shuffle(temp)
                id_hvs.append(temp)
            id_hvs = np.array(id_hvs, dtype=np.int8)
            print('\nEncoding ' + str(len_train_f) + ' train data features')
            train_enc_hvs = encoding_idlv(training_features, lvl_hvs, id_hvs, D, bin_len, x_min, L)
            print('\n\nEncoding ' + str(len(validation_features)) + ' features validation data')
            validation_enc_hvs = encoding_idlv(validation_features, lvl_hvs, id_hvs, D, bin_len, x_min, L)
        elif alg == 'perm':
            print('\nEncoding ' + str(len_train_f) + ' train data features')
            train_enc_hvs = encoding_perm(training_features, lvl_hvs, D, bin_len, x_min, L)
            print('\n\nEncoding ' + str(len(validation_features)) + ' features validation data')
            validation_enc_hvs = encoding_perm(validation_features, lvl_hvs, D, bin_len, x_min, L)
	
    #training, initial model
    class_hvs = [[0.] * D] * (max(training_labels) + 1)
    for i in range(len(train_enc_hvs)):
        class_hvs[training_labels[i]] += train_enc_hvs[i]
    class_norms = [np.linalg.norm(hv) for hv in class_hvs]
    class_hvs_best = deepcopy(class_hvs)
    class_norms_best = deepcopy(class_norms)
    #retraining
    if epoch > 0:
        acc_max = -np.inf
        print('\n\n' + str(epoch) + ' retraining epochs')
        for i in range(epoch):
            sys.stdout.write('epoch ' + str(i) + ': ')
            sys.stdout.flush()
            #shuffle data during retraining
            pickList = np.arange(0, len(train_enc_hvs))
            np.random.shuffle(pickList)
            for j in pickList:
                predict = max_match(class_hvs, train_enc_hvs[j], class_norms)
                if predict != training_labels[j]:
                    class_hvs[predict] -= np.multiply(lr, train_enc_hvs[j])
                    class_hvs[training_labels[j]] += np.multiply(lr, train_enc_hvs[j])
            class_norms = [np.linalg.norm(hv) for hv in class_hvs]
            correct = 0
            for j in range(len(validation_enc_hvs)):
                predict = max_match(class_hvs, validation_enc_hvs[j], class_norms)
                if predict == validation_labels[j]:
                    correct += 1
            acc = float(correct)/len(validation_enc_hvs)
            sys.stdout.write("%.4f \n" %acc) 
            sys.stdout.flush()
            if i > 0 and i%5 == 0:
                print('')
            if acc > acc_max:
                acc_max = acc
                class_hvs_best = deepcopy(class_hvs)
                class_norms_best = deepcopy(class_norms)
   
    del training_features
    del validation_features
    del train_enc_hvs
    del validation_enc_hvs 
    
    print('\n\nEncoding ' + str(len(testing_features)) + ' test data features')
    if alg == 'rp' or alg == 'rp-sign':
        test_enc_hvs = encoding_rp(testing_features, base_matrix, signed=(alg == 'rp-sign'))
    elif alg == 'idlv':
        test_enc_hvs = encoding_idlv(testing_features, lvl_hvs, id_hvs, D, bin_len, x_min, L)
    elif alg == 'perm':
            test_enc_hvs = encoding_perm(testing_features, lvl_hvs, D, bin_len, x_min, L)
    correct = 0
    for i in range(len(test_enc_hvs)):
        predict = max_match(class_hvs_best, test_enc_hvs[i], class_norms_best)
        if predict == testing_labels[i]:
            correct += 1
    acc = float(correct)/len(test_enc_hvs)
    return acc
    

In [227]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(pd.__version__)
print(sklearn.__version__)

train_data = train_dataset('train_data.csv', 'Meta.csv', 'apoe4')


train_size = int(0.70 * len(train_data))
test_size = len(train_data) - train_size

training_data, testing_data = torch.utils.data.random_split(train_data, [train_size, test_size])
training_dataloader = DataLoader(training_data, batch_size=8, shuffle=True)
testing_dataloader = DataLoader(testing_data, batch_size=8, shuffle=True)
# for el in training_dataloader:
#     print(el)

train_model(training_data, testing_data, device)


True
NVIDIA A100-SXM4-80GB
2.2.2
1.3.2

Encoding 81 train data features
0% 4% 9% 14% 18% 23% 28% 32% 

  bin_ = min( np.floor((X_data[i][j] - x_min)/bin_len), L-1)


37% 42% 46% 51% 56% 60% 65% 70% 75% 79% 84% 89% 93% 98% 

Encoding 16 features validation data
0% 6% 12% 18% 25% 31% 37% 43% 50% 56% 62% 68% 75% 81% 87% 93% 

20 retraining epochs
epoch 0: 0.5625 
epoch 1: 0.5625 
epoch 2: 0.6250 
epoch 3: 0.6250 
epoch 4: 0.6250 
epoch 5: 0.6250 

epoch 6: 0.6875 
epoch 7: 0.6875 
epoch 8: 0.6875 
epoch 9: 0.6875 
epoch 10: 0.6875 

epoch 11: 0.6875 
epoch 12: 0.6875 
epoch 13: 0.6875 
epoch 14: 0.6875 
epoch 15: 0.6875 

epoch 16: 0.6875 
epoch 17: 0.6875 
epoch 18: 0.6875 
epoch 19: 0.6875 


Encoding 35 test data features
0% 2% 5% 8% 11% 14% 17% 20% 22% 25% 28% 31% 34% 37% 40% 42% 45% 48% 51% 54% 57% 60% 62% 65% 68% 71% 74% 77% 80% 82% 85% 88% 91% 94% 97% 

0.5714285714285714

In [1]:

# re:
# training: 0.62~0.75
# testing: 0.6~0.628
# idlv:
# training: 0.5...~0.68
# testing: 0.6
# perm: 
# training eventually stuck at 0.6875~0.7500
# testing: always 0.54~0.71

# --------
# /tmp/ipykernel_260/1580132966.py:61: 
# FutureWarning: 
# Series.__getitem__ treating keys as positions is deprecated. 
# In a future version, integer keys will always be treated as labels 
# (consistent with DataFrame behavior). 
# To access a value by position, use `ser.iloc[pos]`

#   bin_ = min( np.floor((X_data[i][j] - x_min)/bin_len), L-1)


In [2]:
# for ele in training_dataloader:
#     print(ele)