In [1]:
import torch
import os
from torchvision import datasets, transforms, models

In [2]:
train_dir = os.path.join('dataset', 'part_one_dataset', 'train_data')
eval_dir = os.path.join('dataset', 'part_one_dataset', 'eval_data')

In [3]:
train_path = os.path.join(train_dir, '1_train_data.tar.pth')
eval_path = os.path.join(eval_dir, '1_eval_data.tar.pth')

t = torch.load(train_path, weights_only = False)

In [4]:
from torchvision import models  
import torch

# Load a pre-trained ResNet model
resnet =  models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove the last layer
resnet.eval()  # Set to evaluation mode

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet = resnet.to(device)

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224 (ResNet input size)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet normalization
])


  return torch._C._cuda_getDeviceCount() > 0


In [81]:
domains = [{} for _ in range(10)]

for j in range(10):
    
    train_path = os.path.join(train_dir, f'{j+1}_train_data.tar.pth')
    t = torch.load(train_path, weights_only = False)

    data = t['data'] # both numpy.ndarray
    
    domains[j]['labels'] = t['targets'] if 'targets' in t else None
    
    try:
        domains[j]['features']  = torch.load(f'stuff/train_embeds_{j+1}.pt', map_location = device)
        # embeds = torch.stack(embeds).to(device)
    except: 
        embeds = []
        # Convert to PyTorch tensor
        X_tensor = torch.tensor(data, dtype=torch.float32)  # Convert to tensor
        X_tensor = X_tensor.permute(0, 3, 1, 2)  # Change shape to (2500, 3, 32, 32)

        tensor = X_tensor.float()

        transformed_images = []
        for image in tensor:
            # Convert each image tensor (C, H, W) to PIL Image for transformation
            transformed_image = transform(image)  # Apply the transformations
            transformed_images.append(transformed_image)

        preprocessed_tensor = torch.stack(transformed_images)  # Shape: (2500, 3, 224, 224)

        for i in range(10) : 
            
            preprocessed_batch = preprocessed_tensor[i*250:(i+1)*250]
            preprocessed_batch = preprocessed_batch.to(device)

            # 4. Get the embeddings (feature maps)
            with torch.no_grad():  # Disable gradients for inference
                feature_maps = resnet(preprocessed_batch)  # Shape will be (batch_size, 512, 1, 1)

            # 5. Flatten the feature maps (optional)
            embeddings = feature_maps.view(feature_maps.size(0), -1)  # Flatten to shape (batch_size, embedding_size)

            embeds.append(embeddings)
        
        embeds = torch.vstack(embeds)
        eval_eval_eval_domains[j]['features'] = embeds
        
        torch.save(embeds, f'stuff/train_embeds_{j+1}.pt')

  domains[j]['features']  = torch.load(f'stuff/train_embeds_{j+1}.pt', map_location = device)


In [50]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

class LWP:
    def __init__(self):
        self.prototypes = {}
        self.class_counts = {i:0 for i in range(10)}
    
    def fit(self, features, labels):
        unique_labels = np.unique(labels)
        
        for label in unique_labels:
            
            samples = features[labels == label]
            num_samples = len(samples)
            
            if label not in self.prototypes:
                self.prototypes[label] = samples.mean(axis=0)
                self.class_counts[label] = len(samples)
            else:
                self.class_counts[label] += len(samples)
                self.prototypes[label] = (self.class_counts[label] - num_samples) / self.class_counts[label] * self.prototypes[label] + num_samples/ self.class_counts[label] * samples.mean(axis=0)
                
            
    def predict(self, features):
        preds = []
        for feature in features:
            distances = {label: np.linalg.norm(feature - proto) for label, proto in self.prototypes.items()}
            preds.append(min(distances, key=distances.get))
        return np.array(preds)

In [51]:
def sample_from_gmms(gmms, n_samples, sampling_probabilities, num_classes = 10):
    pseudo_features = []
    pseudo_labels = []
    
    for i in range(num_classes):
        # Determine the number of samples for this class based on its probability
        num_class_samples = int(n_samples * sampling_probabilities[i])
        
        # Sample from the ith GMM
        class_samples, _ = gmms[i].sample(num_class_samples)
        
        # Append the samples and corresponding class labels
        pseudo_features.append(class_samples)
        pseudo_labels.extend([i] * num_class_samples)
    
    # Concatenate the features and labels
    pseudo_features = np.concatenate(pseudo_features, axis=0)
    pseudo_labels = np.array(pseudo_labels)
    
    return pseudo_features, pseudo_labels

In [None]:
# from sklearn.mixture import GaussianMixture

# num_classes = 10
# buffer_size_per_class = 250
# models = []

# buffer_dataset = {'features': [], 'labels': []}
# source_dataset = domains[0]
# gmms = [None] * num_classes

# model = LWP()
# model.fit(source_dataset['features'], source_dataset['labels'])
# models.append(model)

# class_frequencies = [np.sum(source_dataset['labels'] == i) for i in range(num_classes)]
# total_samples = np.sum(class_frequencies)
# sampling_probabilities = np.array(class_frequencies) / total_samples

# # Update GMM Models
# for i in range(num_classes):
#     gmms[i] = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
#     gmms[i].fit(source_dataset['features'][source_dataset['labels'] == i].to('cpu').numpy())
    
# for i in range(num_classes):
#     # Get all the samples of class 'i' from the current dataset
#     class_samples = source_dataset['features'][source_dataset['labels'] == i].to('cpu').numpy()
    
#     # Get the mean (centroid) of the class from the GMM
#     class_mean = gmms[i].means_.mean(axis=0)  # Use the mean of the GMM components
    
#     # Compute the distance of each sample to the class mean
#     distances = np.linalg.norm(class_samples - class_mean, axis=1)
    
#     # Select the 'buffer_size_per_class' least distant samples
#     least_distant_indices = np.argsort(distances)[:buffer_size_per_class]
    
#     # Add these least distant samples to the buffer
#     buffer_dataset['features'].append(class_samples[least_distant_indices])
#     buffer_dataset['labels'].append([i] * buffer_size_per_class)

# # Convert buffer_dataset to numpy arrays
# buffer_dataset['features'] = np.concatenate(buffer_dataset['features'], axis=0)
# buffer_dataset['labels'] = np.concatenate(buffer_dataset['labels'], axis=0)

In [None]:
# pseudo_size = 2500
# num_iters = 10

# for i in range(1, 10) :
#     curr_dataset = domains[i]['features']
#     curr_dataset_labels = model.predict(curr_dataset)
    
#     pseudo_dataset = {'features': [], 'labels': []}
    
#     pseudo_dataset['features'], pseudo_dataset['labels'] = sample_from_gmms(gmms, pseudo_size, sampling_probabilities, num_classes = 10)
    
#     for j in range(num_iters):
#         batch_size_pseudo = len(pseudo_dataset['features']) // num_iters
#         batch_pseudo = pseudo_dataset['features'][j*batch_size_pseudo:(j+1)*batch_size_pseudo]
#         batch_pseudo_labels = pseudo_dataset['labels'][j*batch_size_pseudo:(j+1)*batch_size_pseudo]
        
#         batch_size_curr = len(curr_dataset) // num_iters
#         batch_curr = curr_dataset[j*batch_size_curr:(j+1)*batch_size_curr]
#         batch_curr_labels = curr_dataset_labels[j*batch_size_curr:(j+1)*batch_size_curr]
        
#         # Combine the current dataset with the pseudo dataset
#         batch = np.concatenate([batch_curr, batch_pseudo], axis=0)
#         batch_labels = np.concatenate([batch_curr_labels, batch_pseudo_labels], axis=0)
        
#         model.fit(batch, batch_labels)
        
#     models.append(model)
    
#     # Update GMM Models
#     for i in range(num_classes):
#         gmms[i] = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
#         gmms[i].fit(pseudo_dataset['features'][pseudo_dataset['labels'] == i])
    
#     new_buffer = []
#     new_buffer_labels = []
    
#     for i in range(num_classes):
#         # Get all the samples of class 'i' from the current dataset
#         class_samples = pseudo_dataset['features'][pseudo_dataset['labels'] == i]
        
#         # Get the mean (centroid) of the class from the GMM
#         class_mean = gmms[i].means_.mean(axis=0)  # Use the mean of the GMM components
        
#         # Compute the distance of each sample to the class mean
#         distances = np.linalg.norm(class_samples - class_mean, axis=1)
        
#         # Select the 'buffer_size_per_class' least distant samples
#         least_distant_indices = np.argsort(distances)[:buffer_size_per_class]
        
#         # Add these least distant samples to the buffer
#         new_buffer.append(class_samples[least_distant_indices])
#         new_buffer_labels.append([i] * buffer_size_per_class)

#     new_buffer = np.concatenate(new_buffer, axis=0)
#     new_buffer_labels = np.concatenate(new_buffer_labels, axis=0)

#     # Convert buffer_dataset to numpy arrays
#     buffer_dataset['features'] = np.concatenate([buffer_dataset['features'], new_buffer], axis=0)
#     buffer_dataset['labels'] = np.concatenate([buffer_dataset['labels'], new_buffer_labels], axis=0)

## Get Eval Data
Evaluating on trainset for now

In [53]:
eval_domains = [{} for _ in range(10)]

for j in range(10):
    
    eval_path = os.path.join(eval_dir, f'{j+1}_eval_data.tar.pth')
    t = torch.load(eval_path, weights_only = False)

    data = t['data'] # both numpy.ndarray
    
    eval_domains[j]['labels'] = t['targets'] if 'targets' in t else None
    
    try:
        eval_domains[j]['features']  = torch.load(f'stuff/eval_embeds_{j+1}.pt', map_location = device)
    except: 
        embeds = []
        # Convert to PyTorch tensor
        X_tensor = torch.tensor(data, dtype=torch.float32)  # Convert to tensor
        X_tensor = X_tensor.permute(0, 3, 1, 2)  # Change shape to (2500, 3, 32, 32)

        tensor = X_tensor.float()

        transformed_images = []
        for image in tensor:
            # Convert each image tensor (C, H, W) to PIL Image for transformation
            transformed_image = transform(image)  # Apply the transformations
            transformed_images.append(transformed_image)

        preprocessed_tensor = torch.stack(transformed_images)  # Shape: (2500, 3, 224, 224)

        for i in range(10) : 
            
            preprocessed_batch = preprocessed_tensor[i*250:(i+1)*250]
            preprocessed_batch = preprocessed_batch.to(device)

            # 4. Get the embeddings (feature maps)
            with torch.no_grad():  # Disable gradients for inference
                feature_maps = resnet(preprocessed_batch)  # Shape will be (batch_size, 512, 1, 1)

            # 5. Flatten the feature maps (optional)
            embeddings = feature_maps.view(feature_maps.size(0), -1)  # Flatten to shape (batch_size, embedding_size)

            embeds.append(embeddings)
        
        embeds = torch.vstack(embeds)
        eval_domains[j]['features'] = embeds
        
        torch.save(embeds, f'stuff/eval_embeds_{j+1}.pt')

  eval_domains[j]['features']  = torch.load(f'stuff/eval_embeds_{j+1}.pt', map_location = device)


In [68]:
from sklearn.metrics import accuracy_score
import pandas as pd

model = LWP()
model.fit(domains[0]['features'], domains[0]['labels'])

df = pd.DataFrame()

for idx,domain in enumerate(domains):
    
    x_test = domain['features']
    y_pred = model.predict(x_test) if domain['labels'] is None else domain['labels']
    
    model.fit(x_test, y_pred)
    
    scores = []
    for eval_domain in eval_domains[:idx+1]:
        
        features = eval_domain['features']
        labels = eval_domain['labels']
        
        preds = model.predict(features)
        acc = accuracy_score(labels, preds)
        
        scores.append(acc)
    
    df[f'Domain {idx+1}'] = scores + [np.nan] * (len(eval_domains) - len(scores))

In [69]:
print(df)

   Domain 1  Domain 2  Domain 3  Domain 4  Domain 5  Domain 6  Domain 7  \
0     0.226    0.2016    0.1988    0.1952    0.1956    0.1932    0.1916   
1       NaN    0.2064    0.2044    0.1996    0.1936    0.1928    0.1884   
2       NaN       NaN    0.1816    0.1800    0.1808    0.1784    0.1796   
3       NaN       NaN       NaN    0.1912    0.1848    0.1840    0.1832   
4       NaN       NaN       NaN       NaN    0.2016    0.1968    0.1952   
5       NaN       NaN       NaN       NaN       NaN    0.1704    0.1684   
6       NaN       NaN       NaN       NaN       NaN       NaN    0.1908   
7       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
8       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
9       NaN       NaN       NaN       NaN       NaN       NaN       NaN   

   Domain 8  Domain 9  Domain 10  
0    0.1896    0.1888     0.1888  
1    0.1852    0.1840     0.1812  
2    0.1756    0.1732     0.1704  
3    0.1812    0.1800     0.1800  