In [4]:
import torch
import os

In [5]:
train_dir = os.path.join('dataset', 'part_one_dataset', 'train_data')
eval_dir = os.path.join('dataset', 'part_one_dataset', 'eval_data')
save_dir = os.path.join('vit_embeds')

In [None]:
domains = [{} for _ in range(20)]

for j in range(10):
    
    train_path = os.path.join(train_dir, f'{j+1}_train_data.tar.pth')
    t = torch.load(train_path, weights_only = False)
    
    domains[j]['labels'] = t['targets'] if 'targets' in t else None
    domains[j]['features'] = torch.load(os.path.join(save_dir,f'train_embeds_{j+1}.pt'))

  domains[j]['features'] = torch.load(os.path.join(save_dir,f'train_embeds_{j+1}.pt'))


In [None]:
eval_domains = [{} for _ in range(20)]

for j in range(10):
    
    eval_path = os.path.join(eval_dir, f'{j+1}_eval_data.tar.pth')
    t = torch.load(eval_path, weights_only = False)

    data = t['data'] # both numpy.ndarray
    
    eval_domains[j]['labels'] = t['targets'] if 'targets' in t else None
    eval_domains[j]['features'] = torch.load(os.path.join(save_dir,f'eval_embeds_{j+1}.pt'))

  eval_domains[j]['features'] = torch.load(os.path.join(save_dir,f'eval_embeds_{j+1}.pt'))


In [None]:
train_dir = os.path.join('dataset', 'part_two_dataset', 'train_data')
eval_dir = os.path.join('dataset', 'part_two_dataset', 'eval_data')
save_dir = os.path.join('part_2_vit_embeds')

In [None]:
for j in range(10):
    
    train_path = os.path.join(train_dir, f'{j+1}_train_data.tar.pth')
    t = torch.load(train_path, weights_only = False)
    
    domains[j+10]['labels'] = t['targets'] if 'targets' in t else None
    domains[j+10]['features'] = torch.load(os.path.join(save_dir,f'train_embeds_{j+1}.pt'))

  domains[j]['features'] = torch.load(os.path.join(save_dir,f'train_embeds_{j+1}.pt'))


In [None]:
for j in range(10):
    
    train_path = os.path.join(eval_dir, f'{j+1}_eval_data.tar.pth')
    t = torch.load(train_path, weights_only = False)
    
    eval_domains[j+10]['labels'] = t['targets'] if 'targets' in t else None
    domains[j+10]['features'] = torch.load(os.path.join(save_dir,f'train_embeds_{j+1}.pt'))

  domains[j]['features'] = torch.load(os.path.join(save_dir,f'train_embeds_{j+1}.pt'))


In [7]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

class LWP:
    def __init__(self):
        self.prototypes = {}
        self.class_counts = {i:0 for i in range(10)}
    
    def fit(self, features, labels):
        unique_labels = np.unique(labels)
        
        for label in unique_labels:
            
            samples = features[labels == label]
            num_samples = len(samples)
            
            # if label not in self.prototypes:
            if True :
                self.prototypes[label] = samples.mean(axis=0)
                self.class_counts[label] = len(samples)
            else:
                self.class_counts[label] += len(samples)
                self.prototypes[label] = (self.class_counts[label] - num_samples) / self.class_counts[label] * self.prototypes[label] + num_samples/ self.class_counts[label] * samples.mean(axis=0)
                
            
    def predict(self, features):
        preds = []
        for feature in features:
            distances = {label: np.linalg.norm(feature - proto) for label, proto in self.prototypes.items()}
            preds.append(min(distances, key=distances.get))
        return np.array(preds)

In [8]:
def sample_from_gmms(gmms, n_samples, sampling_probabilities, num_classes = 10):
    pseudo_features = []
    pseudo_labels = []
    
    for i in range(num_classes):
        # Determine the number of samples for this class based on its probability
        num_class_samples = int(n_samples * sampling_probabilities[i])
        
        # Sample from the ith GMM
        class_samples, _ = gmms[i].sample(num_class_samples)
        
        # Append the samples and corresponding class labels
        pseudo_features.append(class_samples)
        pseudo_labels.extend([i] * num_class_samples)
    
    # Concatenate the features and labels
    pseudo_features = np.concatenate(pseudo_features, axis=0)
    pseudo_labels = np.array(pseudo_labels)
    
    return pseudo_features, pseudo_labels

In [14]:
from sklearn.mixture import GaussianMixture

num_classes = 10
buffer_size_per_class = 250
models = []

buffer_dataset = {'features': [], 'labels': []}
source_dataset = domains[0]
gmms = [None] * num_classes

model = LWP()
model.fit(source_dataset['features'], source_dataset['labels'])
models.append(model)

class_frequencies = [np.sum(source_dataset['labels'] == i) for i in range(num_classes)]
total_samples = np.sum(class_frequencies)
sampling_probabilities = np.array(class_frequencies) / total_samples

# Update GMM Models
for i in range(num_classes):
    gmms[i] = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
    gmms[i].fit(source_dataset['features'][source_dataset['labels'] == i])
    
for i in range(num_classes):
    # Get all the samples of class 'i' from the current dataset
    class_samples = source_dataset['features'][source_dataset['labels'] == i]
    
    # Get the mean (centroid) of the class from the GMM
    class_mean = gmms[i].means_.mean(axis=0)  # Use the mean of the GMM components
    
    # Compute the distance of each sample to the class mean
    distances = np.linalg.norm(class_samples - class_mean, axis=1)
    
    # Select the 'buffer_size_per_class' least distant samples
    least_distant_indices = np.argsort(distances)[:buffer_size_per_class]
    
    # Add these least distant samples to the buffer
    buffer_dataset['features'].append(class_samples[least_distant_indices])
    buffer_dataset['labels'].append([i] * buffer_size_per_class)

# Convert buffer_dataset to numpy arrays
buffer_dataset['features'] = np.concatenate(buffer_dataset['features'], axis=0)
buffer_dataset['labels'] = np.concatenate(buffer_dataset['labels'], axis=0)

In [None]:
pseudo_size = 2500
num_iters = 10

for i in range(1, 20) :
    curr_dataset = domains[i]['features']
    curr_dataset_labels = model.predict(curr_dataset)
    
    pseudo_dataset = {'features': [], 'labels': []}
    
    pseudo_dataset['features'], pseudo_dataset['labels'] = sample_from_gmms(gmms, pseudo_size, sampling_probabilities, num_classes = 10)
    
    for j in range(num_iters):
        batch_size_pseudo = len(pseudo_dataset['features']) // num_iters
        batch_pseudo = pseudo_dataset['features'][j*batch_size_pseudo:(j+1)*batch_size_pseudo]
        batch_pseudo_labels = pseudo_dataset['labels'][j*batch_size_pseudo:(j+1)*batch_size_pseudo]
        
        batch_size_curr = len(curr_dataset) // num_iters
        batch_curr = curr_dataset[j*batch_size_curr:(j+1)*batch_size_curr]
        batch_curr_labels = curr_dataset_labels[j*batch_size_curr:(j+1)*batch_size_curr]
        
        # Combine the current dataset with the pseudo dataset
        batch = np.concatenate([batch_curr, batch_pseudo], axis=0)
        batch_labels = np.concatenate([batch_curr_labels, batch_pseudo_labels], axis=0)
        
        model.fit(batch, batch_labels)
        
    models.append(model)
    
    # Update GMM Models
    for i in range(num_classes):
        gmms[i] = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
        gmms[i].fit(pseudo_dataset['features'][pseudo_dataset['labels'] == i])
    
    new_buffer = []
    new_buffer_labels = []
    
    for i in range(num_classes):
        # Get all the samples of class 'i' from the current dataset
        class_samples = pseudo_dataset['features'][pseudo_dataset['labels'] == i]
        
        # Get the mean (centroid) of the class from the GMM
        class_mean = gmms[i].means_.mean(axis=0)  # Use the mean of the GMM components
        
        # Compute the distance of each sample to the class mean
        distances = np.linalg.norm(class_samples - class_mean, axis=1)
        
        # Select the 'buffer_size_per_class' least distant samples
        least_distant_indices = np.argsort(distances)[:buffer_size_per_class]
        
        # Add these least distant samples to the buffer
        new_buffer.append(class_samples[least_distant_indices])
        new_buffer_labels.append([i] * buffer_size_per_class)

    new_buffer = np.concatenate(new_buffer, axis=0)
    new_buffer_labels = np.concatenate(new_buffer_labels, axis=0)

    # Convert buffer_dataset to numpy arrays
    buffer_dataset['features'] = np.concatenate([buffer_dataset['features'], new_buffer], axis=0)
    buffer_dataset['labels'] = np.concatenate([buffer_dataset['labels'], new_buffer_labels], axis=0)

In [17]:
from sklearn.metrics import accuracy_score
import pandas as pd

model = LWP()
model.fit(domains[0]['features'], domains[0]['labels'])

df = pd.DataFrame()

for idx,model in enumerate(models) :
    
    scores = []
    for eval_domain in eval_domains[:idx+1]:
        
        features = eval_domain['features']
        labels = eval_domain['labels']
        
        preds = model.predict(features)
        acc = accuracy_score(labels, preds)
        
        scores.append(acc)
    
    df[f'Domain {idx+1}'] = scores + [np.nan] * (len(eval_domains) - len(scores))

In [18]:
print(df)

   Domain 1  Domain 2  Domain 3  Domain 4  Domain 5  Domain 6  Domain 7  \
0    0.8384    0.8384    0.8384    0.8384    0.8384    0.8384    0.8384   
1       NaN    0.8416    0.8416    0.8416    0.8416    0.8416    0.8416   
2       NaN       NaN    0.8444    0.8444    0.8444    0.8444    0.8444   
3       NaN       NaN       NaN    0.8676    0.8676    0.8676    0.8676   
4       NaN       NaN       NaN       NaN    0.8552    0.8552    0.8552   
5       NaN       NaN       NaN       NaN       NaN    0.8676    0.8676   
6       NaN       NaN       NaN       NaN       NaN       NaN    0.8532   
7       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
8       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
9       NaN       NaN       NaN       NaN       NaN       NaN       NaN   

   Domain 8  Domain 9  Domain 10  
0    0.8384    0.8384     0.8384  
1    0.8416    0.8416     0.8416  
2    0.8444    0.8444     0.8444  
3    0.8676    0.8676     0.8676  