## Imports

In [3]:
!pip install scikit-learn-extra

import numpy as np
import scipy as sp
import pandas as pd
import sklearn
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
import torch
from torch.utils.data import TensorDataset, DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Data loading and processing

In [4]:
# Read the data pool CSV and keep only its values as a NumPy array;
# the trailing bare name displays the array as the cell output.
pool_path = 'IML_project2_data_pool.csv'
data = pd.read_csv(pool_path, sep=',').to_numpy()
data

array([[ -9.7759,  -8.8555,  -8.2527, ...,  65.6239,  39.4626,  30.8416],
       [ -9.5503,  -8.2558,  -9.4273, ..., 123.0335, 109.8054, 127.3275],
       [-15.3899, -11.1862, -16.8448, ...,  61.4511,  64.4757,  91.7237],
       ...,
       [-17.4385, -10.4626, -20.2318, ..., 116.3426, 117.7207,  85.5008],
       [ -1.5759,  -1.7234,  -1.8101, ...,  17.6209,  10.4274,  15.5562],
       [-10.4536,  -9.734 ,  -9.5061, ...,  96.888 ,  81.8133,  50.9756]])

In [5]:
# Rescale the data with scikit-learn's `normalize`, spelling out the
# defaults the bare call relies on: L2 norm, applied per row (axis=1).
data = normalize(data, norm='l2', axis=1)
data

array([[-0.00362784, -0.00328628, -0.00306258, ...,  0.02435308,
         0.0146446 ,  0.01144534],
       [-0.00258363, -0.00223343, -0.00255036, ...,  0.0332841 ,
         0.02970552,  0.03444575],
       [-0.00266154, -0.00193455, -0.00291315, ...,  0.01062738,
         0.01115045,  0.01586274],
       ...,
       [-0.0029237 , -0.00175413, -0.00339201, ...,  0.01950572,
         0.01973677,  0.01433486],
       [-0.0015711 , -0.00171815, -0.00180458, ...,  0.0175672 ,
         0.01039562,  0.0155088 ],
       [-0.00181005, -0.00168545, -0.00164599, ...,  0.01677626,
         0.01416606,  0.00882648]])

## KMedoids

In [6]:
# For each requested batch size, cluster the data into that many groups with
# KMedoids and keep the medoid row indices, shifted by one so that they are
# 1-based.
points = [
    KMedoids(n_clusters=size).fit(data).medoid_indices_ + 1
    for size in [100, 200, 500]
]

## AgglomerativeClustering

In [7]:
def _cluster_medoids(features, labels):
    """Return the 1-based row index of the medoid of every cluster in `labels`.

    `labels` assigns a cluster id to each row of `features`; the medoid of a
    cluster is found by fitting a single-cluster KMedoids on its members.
    """
    medoids = []
    for cluster_id in np.unique(labels):
        members = np.flatnonzero(labels == cluster_id)
        finder = KMedoids(n_clusters=1).fit(features[members])
        # medoid_indices_ is relative to the subset; map it back to a row of
        # `features` and shift to 1-based numbering.
        medoids.append(members[finder.medoid_indices_[0]] + 1)
    return medoids


# One list of representatives per batch size, from single-linkage
# agglomerative clustering.
points = [
    _cluster_medoids(
        data,
        AgglomerativeClustering(n_clusters=size, linkage='single').fit(data).labels_,
    )
    for size in [100, 200, 500]
]

## Autoencoder + KMeans

In [26]:
class AE(torch.nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    The encoder narrows the input through hidden layers of 128, 16 and 4
    units (ReLU between layers) down to `latent_dim` values; the decoder
    widens it back through 4, 16 and 128 units and ends in a sigmoid.

    Args:
        input_dim: Number of features per sample (default 1680).
        latent_dim: Size of the bottleneck representation (default 2).
    """

    def __init__(self, input_dim=1680, latent_dim=2):
        super().__init__()

        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 4),
            torch.nn.ReLU(),
            torch.nn.Linear(4, latent_dim)
        )

        # NOTE(review): the final Sigmoid keeps reconstructions between 0 and
        # 1, while the normalised data displayed earlier in this notebook
        # contains negative values -- confirm this output range is intended.
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(latent_dim, 4),
            torch.nn.ReLU(),
            torch.nn.Linear(4, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, input_dim),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        """Encode `x` to the latent space and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [27]:
# Fix the PyTorch RNG so that weight initialisation (and therefore training
# and every result derived from it) is repeatable on a fresh kernel.
torch.manual_seed(0)

tensor_data = torch.Tensor(data)

# The dataset holds only the inputs: the autoencoder is trained to reproduce
# each sample, so no separate targets are needed.
my_dataset = TensorDataset(tensor_data)
my_dataloader = DataLoader(my_dataset, batch_size=500)

model = AE()
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

In [28]:
epochs = 25

# Reconstruction training: every batch serves as both input and target.
for epoch in range(epochs):
    for i, batch in enumerate(my_dataloader):
        # The loader yields one-element batches (inputs only).
        (sample,) = batch

        # Clear gradients left over from the previous step, then run the
        # forward pass, back-propagate and update the weights.
        optimizer.zero_grad()
        loss = loss_function(model(sample), sample)
        loss.backward()
        optimizer.step()

        print(f'epoch = {epoch}, i = {i}, loss = {loss}')

epoch = 0, i = 0, loss = 0.2442234605550766
epoch = 0, i = 1, loss = 0.24187980592250824
epoch = 0, i = 2, loss = 0.23963475227355957
epoch = 0, i = 3, loss = 0.2373533695936203
epoch = 0, i = 4, loss = 0.23502710461616516
epoch = 0, i = 5, loss = 0.23273402452468872
epoch = 0, i = 6, loss = 0.23035535216331482
epoch = 0, i = 7, loss = 0.22801733016967773
epoch = 0, i = 8, loss = 0.22564460337162018
epoch = 0, i = 9, loss = 0.2231389731168747
epoch = 0, i = 10, loss = 0.22074304521083832
epoch = 0, i = 11, loss = 0.21820680797100067
epoch = 0, i = 12, loss = 0.21570582687854767
epoch = 0, i = 13, loss = 0.21316103637218475
epoch = 0, i = 14, loss = 0.21051333844661713
epoch = 0, i = 15, loss = 0.20796875655651093
epoch = 0, i = 16, loss = 0.20520365238189697
epoch = 0, i = 17, loss = 0.2025308609008789
epoch = 0, i = 18, loss = 0.1998332440853119
epoch = 0, i = 19, loss = 0.1969798058271408
epoch = 0, i = 20, loss = 0.1941196471452713
epoch = 0, i = 21, loss = 0.19122549891471863
epoch

In [29]:
# Push every sample through the trained encoder and hand the latent codes
# back as a NumPy array, detached from autograd.
data = torch.Tensor(data)
encoded_data = model.encoder(data).detach().numpy()

In [33]:
points = [[] for _ in range(3)]
def get_inds(n, random_state=None):
    """Pick one representative sample per KMeans cluster of `encoded_data`.

    The encoded data is split into `n` clusters with KMeans; for each cluster
    that received at least one sample, the medoid of its members is located
    with a single-cluster KMedoids fit.

    Args:
        n: Number of KMeans clusters to create.
        random_state: Optional seed forwarded to KMeans so that the selection
            can be reproduced. The default (None) keeps KMeans unseeded.

    Returns:
        A list of 1-based row indices into `encoded_data`, one per cluster
        label, in increasing label order.
    """
    clustering = KMeans(n_clusters=n, random_state=random_state)
    clustering.fit(encoded_data)
    labels = clustering.labels_

    p = []
    for label in np.unique(labels):
        indices = np.flatnonzero(labels == label)

        km = KMedoids(n_clusters=1)
        km.fit(encoded_data[indices])
        # medoid_indices_ refers to the subset; translate it back to a row of
        # `encoded_data` and shift to 1-based numbering.
        p.append(indices[km.medoid_indices_[0]] + 1)

    return p

In [34]:
# Fill the three slots of `points` with the representatives selected for
# batch sizes 100, 200 and 500, in that order.
points[0:3] = [get_inds(n_clusters) for n_clusters in (100, 200, 500)]

In [36]:
# Sanity check: show how many indices were selected for each of the three
# batch sizes, one count per line.
print(*(len(points[slot]) for slot in range(3)), sep='\n')

100
200
500


## Saving results

In [37]:
# One output line per entry of `points`: its indices joined by commas and
# terminated by a newline.
res = ''.join(','.join(map(str, ps)) + '\n' for ps in points)

# Write the file once, after the whole text has been assembled, instead of
# reopening and overwriting it for every entry of `points`.
with open("results.txt", "w") as file:
    file.write(res)

## Report


As the first method I used KMedoids. I set the number of clusters to 100, 200 and 500 respectively, and then used the resulting medoids as the initial batch. Because I forgot that submitted indices should start from 1, the first submission was effectively a random one. The second attempt was correct, but its results were worse than those for the random one.

As the second method I used AgglomerativeClustering. I created 100, 200 and 500 clusters respectively, and used the medoids of these clusters as the initial batch. I tried two linkage criteria: ward and single. Unfortunately, the results were still worse than those for a random initial batch.

As the last method I used Autoencoder + KMeans. First I trained an autoencoder with a 2-dimensional latent space and used the trained encoder to reduce the dimensionality of the data; then I used KMeans to create clusters and KMedoids to find the medoids of these clusters, and used them as the initial batch. Finally, the results were better than those for a random initial batch, at least for some models and hyperparameters.