In [1]:
import torch
import os

In [2]:
train_dir = os.path.join('dataset', 'part_one_dataset', 'train_data')
eval_dir = os.path.join('dataset', 'part_one_dataset', 'eval_data')
one_embeds_dir = os.path.join('part_1_vit_embeds')
two_embeds_dir = os.path.join('part_2_vit_embeds')

In [3]:
domains = [{} for _ in range(20)]

for j in range(10):
    
    train_path = os.path.join(train_dir, f'{j+1}_train_data.tar.pth')
    t = torch.load(train_path, weights_only = False)
    
    domains[j]['labels'] = t['targets'] if 'targets' in t else None
    domains[j]['features'] = torch.load(os.path.join(one_embeds_dir,f'train_embeds_{j+1}.pt'), weights_only = False)
    
eval_domains = [{} for _ in range(20)]

for j in range(10):
    
    eval_path = os.path.join(eval_dir, f'{j+1}_eval_data.tar.pth')
    t = torch.load(eval_path, weights_only = False)

    data = t['data'] # both numpy.ndarray
    
    eval_domains[j]['labels'] = t['targets'] if 'targets' in t else None
    eval_domains[j]['features'] = torch.load(os.path.join(one_embeds_dir,f'eval_embeds_{j+1}.pt'), weights_only = False)
    
for j in range(10):
    
    train_path = os.path.join(train_dir, f'{j+1}_train_data.tar.pth')
    t = torch.load(train_path, weights_only = False)
    
    domains[j+10]['labels'] = t['targets'] if 'targets' in t else None
    domains[j+10]['features'] = torch.load(os.path.join(two_embeds_dir,f'train_embeds_{j+1}.pt'), weights_only = False)
    
for j in range(10):
    
    train_path = os.path.join(eval_dir, f'{j+1}_eval_data.tar.pth')
    t = torch.load(train_path, weights_only = False)
    
    eval_domains[j+10]['labels'] = t['targets'] if 'targets' in t else None
    eval_domains[j+10]['features'] = torch.load(os.path.join(two_embeds_dir,f'train_embeds_{j+1}.pt'), weights_only = False)
    
    

In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances

class LWP:
    """Learning Vector Prototypes with configurable distance function"""
    
    DISTANCE_FUNCTIONS = {
        'euclidean': lambda x, y: np.linalg.norm(x - y),
        'cosine': lambda x, y: cosine_distances(x.reshape(1, -1), y.reshape(1, -1))[0][0],
        'manhattan': lambda x, y: manhattan_distances(x.reshape(1, -1), y.reshape(1, -1))[0][0],
        'minkowski': lambda x, y, p=2: np.power(np.sum(np.power(np.abs(x - y), p)), 1/p)
    }
    
    def __init__(self, distance_metric='euclidean', **distance_params):
        """
            distance_params (dict): Additional parameters for the distance function
        """
        self.prototypes = {}
        self.class_counts = {i: 0 for i in range(10)}
        
        if callable(distance_metric):
            self.distance_fn = distance_metric
        elif distance_metric in self.DISTANCE_FUNCTIONS:
            if distance_metric == 'minkowski':
                p = distance_params.get('p', 2)
                self.distance_fn = lambda x, y: self.DISTANCE_FUNCTIONS[distance_metric](x, y, p)
            else:
                self.distance_fn = self.DISTANCE_FUNCTIONS[distance_metric]
        else:
            raise ValueError(f"Unknown distance metric: {distance_metric}. " 
                           f"Available metrics: {list(self.DISTANCE_FUNCTIONS.keys())}")

    def fit(self, features, labels):
        unique_labels = np.unique(labels)
        for label in unique_labels:
            samples = features[labels == label]
            num_samples = len(samples)
            
            if label not in self.prototypes:  # Original condition was: if label not in self.prototypes
                self.prototypes[label] = samples.mean(axis=0)
                self.class_counts[label] += len(samples)
            else:
                self.class_counts[label] += len(samples)
                self.prototypes[label] = (
                    (self.class_counts[label] - num_samples) / self.class_counts[label] * self.prototypes[label] +
                    num_samples / self.class_counts[label] * samples.mean(axis=0)
                )

    def predict(self, features):
        preds = []
        for feature in features:
            distances = {
                label: self.distance_fn(feature, proto)
                for label, proto in self.prototypes.items()
            }
            preds.append(min(distances, key=distances.get))
        return np.array(preds)

In [5]:
def sample_from_gmms(gmms, n_samples, class_counts, num_classes = 10):
    pseudo_features = []
    pseudo_labels = []
    sampling_probabilities = class_counts / np.sum(class_counts)
    print('class counts are' , class_counts)
    
    for i in range(num_classes):
        # Determine the number of samples for this class based on its probability
        num_class_samples = int(n_samples * sampling_probabilities[i])
        
        # Sample from the ith GMM
        class_samples, _ = gmms[i].sample(num_class_samples)
        
        # Append the samples and corresponding class labels
        pseudo_features.append(class_samples)
        pseudo_labels.extend([i] * num_class_samples)
    
    # Concatenate the features and labels
    pseudo_features = np.concatenate(pseudo_features, axis=0)
    pseudo_labels = np.array(pseudo_labels)
    
    return pseudo_features, pseudo_labels

In [6]:
from sklearn.mixture import GaussianMixture

num_classes = 10
models = []

source_dataset = domains[0]
gmms = [None] * num_classes

model = LWP(distance_metric='cosine')
model.fit(source_dataset['features'], source_dataset['labels'])
models.append(model)

class_frequencies = [np.sum(source_dataset['labels'] == i) for i in range(num_classes)]
total_samples = np.sum(class_frequencies)
sampling_probabilities = np.array(class_frequencies) / total_samples

# Update GMM Models
for i in range(num_classes):
    gmms[i] = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
    gmms[i].fit(source_dataset['features'][source_dataset['labels'] == i])

In [7]:
pseudo_size = 2500
num_iters = 10

for i in range(1, 20) :
    curr_dataset = domains[i]['features']
    curr_dataset_labels = model.predict(curr_dataset)
    
    pseudo_dataset = {'features': [], 'labels': []}
    
    pseudo_dataset['features'], pseudo_dataset['labels'] = sample_from_gmms(gmms, pseudo_size, list(model.class_counts.values()), num_classes = 10)
    
    # for j in range(num_iters):
    #     batch_size_pseudo = len(pseudo_dataset['features']) // num_iters
    #     batch_pseudo = pseudo_dataset['features'][j*batch_size_pseudo:(j+1)*batch_size_pseudo]
    #     batch_pseudo_labels = pseudo_dataset['labels'][j*batch_size_pseudo:(j+1)*batch_size_pseudo]
        
    #     batch_size_curr = len(curr_dataset) // num_iters
    #     batch_curr = curr_dataset[j*batch_size_curr:(j+1)*batch_size_curr]
    #     batch_curr_labels = curr_dataset_labels[j*batch_size_curr:(j+1)*batch_size_curr]
        
    #     # Combine the current dataset with the pseudo dataset
    #     batch = np.concatenate([batch_curr, batch_pseudo], axis=0)
    #     batch_labels = np.concatenate([batch_curr_labels, batch_pseudo_labels], axis=0)
    
    new_dataset = {}
    new_dataset['features'] = np.concatenate([pseudo_dataset['features'], curr_dataset])
    new_dataset['labels'] = np.concatenate([pseudo_dataset['labels'], curr_dataset_labels])
    model.fit(new_dataset['features'], new_dataset['labels'])
    
    models.append(model)
    
    # Update GMM Models
    for i in range(num_classes):
        gmms[i] = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
        gmms[i].fit(new_dataset['features'][new_dataset['labels'] == i])

class counts are [253, 243, 255, 244, 262, 236, 250, 253, 254, 250]
class counts are [743, 753, 727, 764, 780, 723, 753, 761, 747, 748]
class counts are [1255, 1252, 1173, 1277, 1316, 1243, 1262, 1234, 1249, 1235]
class counts are [1755, 1768, 1604, 1809, 1854, 1740, 1763, 1705, 1738, 1755]
class counts are [2257, 2289, 2034, 2322, 2395, 2264, 2250, 2175, 2224, 2274]
class counts are [2751, 2774, 2479, 2842, 2953, 2807, 2740, 2649, 2701, 2783]
class counts are [3250, 3268, 2927, 3349, 3511, 3331, 3222, 3111, 3219, 3287]
class counts are [3753, 3770, 3391, 3860, 4059, 3878, 3699, 3558, 3702, 3801]
class counts are [4248, 4287, 3815, 4375, 4627, 4389, 4175, 4024, 4204, 4321]
class counts are [4738, 4780, 4242, 4900, 5181, 4953, 4654, 4480, 4697, 4835]
class counts are [5239, 5294, 4674, 5509, 5735, 5495, 5144, 4875, 5177, 5312]
class counts are [5931, 5652, 4971, 6056, 6276, 6062, 5739, 5243, 5737, 5782]
class counts are [6438, 6143, 5351, 6609, 6838, 6582, 6274, 5663, 6256, 6290]
class 

In [8]:
from sklearn.metrics import accuracy_score
import pandas as pd

df = pd.DataFrame()

for idx,model in enumerate(models) :
    
    scores = []
    for eval_domain in eval_domains[:idx+1]:
        
        features = eval_domain['features']
        labels = eval_domain['labels']
        
        preds = model.predict(features)
        acc = accuracy_score(labels, preds)
        
        scores.append(acc)
    
    df[f'Domain {idx+1}'] = scores + [np.nan] * (len(eval_domains) - len(scores))

In [9]:
print(df)

    Domain 1  Domain 2  Domain 3  Domain 4  Domain 5  Domain 6  Domain 7  \
0     0.8812    0.8812    0.8812    0.8812    0.8812    0.8812    0.8812   
1        NaN    0.8892    0.8892    0.8892    0.8892    0.8892    0.8892   
2        NaN       NaN    0.8936    0.8936    0.8936    0.8936    0.8936   
3        NaN       NaN       NaN    0.9064    0.9064    0.9064    0.9064   
4        NaN       NaN       NaN       NaN    0.8936    0.8936    0.8936   
5        NaN       NaN       NaN       NaN       NaN    0.9024    0.9024   
6        NaN       NaN       NaN       NaN       NaN       NaN    0.8948   
7        NaN       NaN       NaN       NaN       NaN       NaN       NaN   
8        NaN       NaN       NaN       NaN       NaN       NaN       NaN   
9        NaN       NaN       NaN       NaN       NaN       NaN       NaN   
10       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
11       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
12       NaN