In [2]:
import torch
import os
from torchvision import transforms

In [3]:
# Relative locations of the per-domain training and evaluation files.
train_dir, eval_dir = (
    os.path.join('dataset', 'part_one_dataset', split_name)
    for split_name in ('train_data', 'eval_data')
)

In [9]:
# Load the ten training domains into a list of dicts, one per domain.
# Each dict holds 'labels' (None when the file has no 'targets' entry)
# and 'features' (the file's 'data' entry).
domains = []

for file_no in range(1, 11):
    # Files are numbered from 1: 1_train_data.tar.pth ... 10_train_data.tar.pth
    train_path = os.path.join(train_dir, f'{file_no}_train_data.tar.pth')

    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects, so this must only be pointed at trusted files.
    payload = torch.load(train_path, weights_only=False)

    domains.append({
        'labels': payload.get('targets'),
        'features': payload['data'],
    })

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances

class LWP:
    """Prototype-based classifier with a configurable distance function.

    Every class is summarised by a single prototype: the running mean of all
    samples that have been assigned to that class over every ``fit`` call.
    ``predict`` labels a sample with the class whose prototype is nearest
    under the configured distance.
    """

    # Built-in metrics.  Each entry maps two arrays to a scalar distance;
    # 'minkowski' additionally accepts the order ``p`` (default 2).
    DISTANCE_FUNCTIONS = {
        'euclidean': lambda x, y: np.linalg.norm(x - y),
        'cosine': lambda x, y: cosine_distances(x.reshape(1, -1), y.reshape(1, -1))[0][0],
        'manhattan': lambda x, y: manhattan_distances(x.reshape(1, -1), y.reshape(1, -1))[0][0],
        'minkowski': lambda x, y, p=2: np.power(np.sum(np.power(np.abs(x - y), p)), 1/p)
    }

    def __init__(self, distance_metric='euclidean', **distance_params):
        """Create an unfitted model.

        Args:
            distance_metric (str | callable): Either a key of
                ``DISTANCE_FUNCTIONS`` or a callable ``f(x, y)`` returning a
                scalar distance between a sample and a prototype.
            distance_params (dict): Additional parameters for the distance
                function.  Only ``p`` is consumed, and only by the
                'minkowski' metric; any other entry is ignored.

        Raises:
            ValueError: If ``distance_metric`` is an unknown name, or if the
                Minkowski order ``p`` is not strictly positive.
        """
        # label -> running mean of the samples seen for that label
        self.prototypes = {}
        # label -> number of samples folded into the prototype so far;
        # pre-seeded with zero for labels 0..9
        self.class_counts = {i: 0 for i in range(10)}

        if callable(distance_metric):
            self.distance_fn = distance_metric
        elif distance_metric in self.DISTANCE_FUNCTIONS:
            base_fn = self.DISTANCE_FUNCTIONS[distance_metric]
            if distance_metric == 'minkowski':
                p = distance_params.get('p', 2)
                # Reject bad orders here instead of failing later inside
                # predict() (p == 0 would divide by zero in 1/p).
                if not p > 0:
                    raise ValueError(f"Minkowski order p must be positive, got {p!r}")
                self.distance_fn = lambda x, y: base_fn(x, y, p)
            else:
                self.distance_fn = base_fn
        else:
            raise ValueError(f"Unknown distance metric: {distance_metric}. " 
                           f"Available metrics: {list(self.DISTANCE_FUNCTIONS.keys())}")

    def fit(self, features, labels):
        """Fold a batch of labelled samples into the class prototypes.

        Can be called repeatedly; each call updates the running means rather
        than replacing them.

        Args:
            features: Array-like whose first axis indexes samples.
            labels: Array-like with one label per sample; it is flattened, so
                a column vector of shape (N, 1) is accepted as well.
        """
        features = np.asarray(features)
        labels = np.asarray(labels).ravel()

        for label in np.unique(labels):
            samples = features[labels == label]
            num_samples = len(samples)
            batch_mean = samples.mean(axis=0)

            if label not in self.prototypes:
                # First time this class is seen: the batch mean is the prototype.
                self.prototypes[label] = batch_mean
                self.class_counts[label] = num_samples
            else:
                # Count-weighted average of the old prototype and the batch mean.
                seen = self.class_counts[label]
                total = seen + num_samples
                self.class_counts[label] = total
                self.prototypes[label] = (
                    seen / total * self.prototypes[label] +
                    num_samples / total * batch_mean
                )

    def predict(self, features):
        """Assign each sample the label of its nearest prototype.

        Ties go to the label whose prototype was created first.

        Args:
            features: Iterable of samples (e.g. an array whose first axis
                indexes samples).

        Returns:
            numpy.ndarray: One predicted label per sample.

        Raises:
            ValueError: If there are samples to classify but ``fit`` has not
                been called yet.
        """
        if not self.prototypes and len(features) > 0:
            raise ValueError("LWP has no prototypes yet; call fit() before predict().")

        preds = []
        for feature in features:
            distances = {
                label: self.distance_fn(feature, proto)
                for label, proto in self.prototypes.items()
            }
            preds.append(min(distances, key=distances.get))
        return np.array(preds)

## Get Eval Data
Evaluating on the training set for now (note: the cell below reads its files from `eval_dir`).

In [25]:
# Load the ten evaluation domains into a list of dicts, one per domain.
# Each dict holds 'labels' (None when the file has no 'targets' entry)
# and 'features' (the file's 'data' entry).
eval_domains = []

for file_no in range(1, 11):
    # Files are numbered from 1: 1_eval_data.tar.pth ... 10_eval_data.tar.pth
    eval_path = os.path.join(eval_dir, f'{file_no}_eval_data.tar.pth')

    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects, so this must only be pointed at trusted files.
    payload = torch.load(eval_path, weights_only=False)

    eval_domains.append({
        'labels': payload.get('targets'),
        'features': payload['data'],
    })

In [28]:
from sklearn.metrics import accuracy_score
import pandas as pd

# Train on the domains one after another and, after each step, score the model
# on the evaluation sets of every domain seen so far.
model = LWP(distance_metric='cosine')

df = pd.DataFrame()

for step, train_domain in enumerate(domains, start=1):

    train_features = train_domain['features']
    if train_domain['labels'] is not None:
        train_labels = train_domain['labels']
    else:
        # No labels for this domain: use the current model's own predictions.
        train_labels = model.predict(train_features)

    model.fit(train_features, train_labels)
    print(model.class_counts)

    # Accuracy on the eval split of each domain up to and including this one.
    scores = [
        accuracy_score(seen['labels'], model.predict(seen['features']))
        for seen in eval_domains[:step]
    ]

    # Pad with NaN so every column has one row per evaluation domain.
    df[f'Domain {step}'] = scores + [np.nan] * (len(eval_domains) - len(scores))

{0: 253, 1: 243, 2: 255, 3: 244, 4: 262, 5: 236, 6: 250, 7: 253, 8: 254, 9: 250}
{0: 201, 1: 142, 2: 166, 3: 65, 4: 234, 5: 521, 6: 208, 7: 213, 8: 290, 9: 460}
{0: 219, 1: 186, 2: 200, 3: 137, 4: 286, 5: 383, 6: 234, 7: 218, 8: 296, 9: 341}
{0: 189, 1: 210, 2: 217, 3: 214, 4: 240, 5: 345, 6: 221, 7: 251, 8: 234, 9: 379}
{0: 186, 1: 213, 2: 267, 3: 214, 4: 263, 5: 278, 6: 241, 7: 273, 8: 207, 9: 358}
{0: 200, 1: 200, 2: 295, 3: 189, 4: 246, 5: 307, 6: 255, 7: 271, 8: 239, 9: 298}
{0: 185, 1: 231, 2: 284, 3: 198, 4: 243, 5: 301, 6: 253, 7: 257, 8: 224, 9: 324}
{0: 170, 1: 226, 2: 330, 3: 188, 4: 216, 5: 297, 6: 260, 7: 270, 8: 208, 9: 335}
{0: 162, 1: 204, 2: 309, 3: 168, 4: 271, 5: 247, 6: 284, 7: 269, 8: 242, 9: 344}
{0: 166, 1: 216, 2: 355, 3: 166, 4: 263, 5: 257, 6: 246, 7: 277, 8: 242, 9: 312}


In [33]:
# Show the accuracy table: column "Domain k" holds the scores recorded after
# training on the k-th domain, and row i is the i-th evaluation domain
# (NaN where that evaluation domain was not scored at that step).
print(df)

   Domain 1  Domain 2  Domain 3  Domain 4  Domain 5  Domain 6  Domain 7  \
0    0.2916    0.2784    0.2668    0.2608    0.2516    0.2488    0.2468   
1       NaN    0.2752    0.2656    0.2576    0.2536    0.2492    0.2428   
2       NaN       NaN    0.2612    0.2532    0.2516    0.2476    0.2480   
3       NaN       NaN       NaN    0.2652    0.2580    0.2516    0.2512   
4       NaN       NaN       NaN       NaN    0.2812    0.2740    0.2724   
5       NaN       NaN       NaN       NaN       NaN    0.2560    0.2488   
6       NaN       NaN       NaN       NaN       NaN       NaN    0.2520   
7       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
8       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
9       NaN       NaN       NaN       NaN       NaN       NaN       NaN   

   Domain 8  Domain 9  Domain 10  
0    0.2396    0.2380     0.2380  
1    0.2356    0.2376     0.2372  
2    0.2352    0.2348     0.2292  
3    0.2432    0.2444     0.2416  