In [23]:
import torch
import torch.nn as nn
import os
from tqdm import tqdm
from dataset import myDataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd

In [24]:
#fix random seed

torch.manual_seed(0)


<torch._C.Generator at 0x10f920870>

In [25]:
# Define device variable for cuda, mps or cpu

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using CUDA')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    print('Using MPS')
else :
    device = torch.device('cpu')
    print('Using CPU')


Using MPS


In [26]:
#We want to use a resnet50 from torchvision to have the embedding of an image, use a pretrained resnet and remove the last layer

class ResNet(nn.Module):
    def __init__(self, num_classes=132):
        super(ResNet, self).__init__()
        self.resnet = torch.hub.load('pytorch/vision:v0.6.0', 'resnet50', pretrained=True)
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])
        self.fc = nn.Linear(2048, num_classes)
        self.resnet.eval()
        for param in self.resnet.parameters():
            param.requires_grad = False
        for param in self.fc.parameters():
            param.requires_grad = False

    def forward(self, x):
        return self.resnet(x)

In [27]:
class ClusteringModel(nn.Module):
    def __init__(self, embedding_size, num_clusters):
        super(ClusteringModel, self).__init__()
        self.num_embeddings = embedding_size
        self.num_clusters = num_clusters
        self.centers = nn.Parameter(torch.randn(num_clusters, embedding_size))

    def forward(self, embeddings):
        embeddings = embeddings.flatten(2).flatten(1)
        # Compute the distance between each embedding and each cluster center
        distances = torch.cdist(embeddings, self.centers)
        # Assign each embedding to the closest cluster
        assignments = torch.argmin(distances, dim=1)
        return assignments

In [28]:
def clustering_loss(model, embeddings):
    assignments = model(embeddings)
    # Compute the loss as the sum of the distances between each embedding and its assigned cluster center
    embeddings = embeddings.flatten(2).flatten(1)
    distances = torch.cdist(embeddings, model.centers)
    loss = torch.sum(torch.gather(distances, 1, assignments.unsqueeze(1)))
    return loss

In [29]:
resnet_model = ResNet()
model_ft_path = 'models/resnet_finetuned.pth'
if os.path.exists(model_ft_path):
    resnet_model.load_state_dict(torch.load(model_ft_path))
    print('ResNet model loaded')
else:
    print('No fintuned model found, Resnet pretrained model will be used')

resnet_model.to(device)

Using cache found in /Users/jskaf/.cache/torch/hub/pytorch_vision_v0.6.0
Exception ignored in: Traceback (most recent call last):
  File "<string>", line 1, in <module>
<function _MultiProcessingDataLoaderIter.__del__ at 0x11f379820>
Traceback (most recent call last):
  File "/Users/jskaf/Documents/Cours ECM 3A/CV/Clothes-similarity/clothes_sim/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
    self._shutdown_workers()
  File "/Users/jskaf/Documents/Cours ECM 3A/CV/Clothes-similarity/clothes_sim/lib/python3

ResNet model loaded


ResNet(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d(64, 25

In [30]:
#Create the dataset

# get_preprocessed_image = True
# train_test_split = 0.9
# my_path_hm = os.path.join(os.getcwd(), 'data/h&mdataset/images/')
# my_path_fash = os.path.join(os.getcwd(), 'data/fashion-dataset/images/')

# dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')

# #Split the dataset into training and testing
# train_size = int(train_test_split * len(dataset))
# test_size = len(dataset) - train_size

# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=10, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=10, pin_memory=True)

In [31]:
get_preprocessed_image = True
train_test_split = 0.9
my_path_hm = os.path.join(os.getcwd(), 'data/h&mdataset/images/')
my_path_fash = os.path.join(os.getcwd(), 'data/fashion-dataset/images/')

dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')

#Get the indices from the file
train_subset_indices = torch.load('data/h&mdataset/train_subset_indices.pt')
val_subset_indices = torch.load('data/h&mdataset/val_subset_indices.pt')
test_subset_indices = torch.load('data/h&mdataset/test_subset_indices.pt')

test_subset_indices = torch.cat((test_subset_indices, val_subset_indices), 0)

train_dataset = torch.utils.data.Subset(dataset, train_subset_indices)
test_dataset = torch.utils.data.Subset(dataset, test_subset_indices)


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=10, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=10, pin_memory=True)

Skipping .DS_Store as it is not a jpg file


In [32]:
print('Longueur du train_loader : ', len(train_dataset))

Longueur du train_loader :  8000


In [33]:
for batch in train_loader:
    images, idx, img_path = batch
    print(idx)
    break



KeyboardInterrupt: 

In [None]:
#Choice of cluster number : we have 143 article type 20 color, so a number between 100 et 300 can be a good choice

In [34]:
embeddings_size = 2048
range_of_n_clusters = range(200,601, 200)

In [None]:
# Initialize a list to hold the loss for each number of clusters
losses = []

# Loop over different numbers of clusters
for num_clusters in range_of_n_clusters:  # Adjust the range as needed
    # Initialize the clustering model
    clustering_model = ClusteringModel(embeddings_size, num_clusters).to(device)  # Assuming that 'model.embedding_size' is the size of your embeddings

    # Initialize the optimizer
    optimizer = optim.Adam(clustering_model.parameters(), lr=0.001)  # Adjust the learning rate as needed

    # Train the clustering model
    num_epochs = 1  # Adjust as needed
    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        for batch in tqdm(train_loader):
            imgs, _, _ = batch
            imgs = imgs.to(device)
            optimizer.zero_grad()
            embeddings = resnet_model(imgs)  # Compute the embeddings for the current batch of images
            loss = clustering_loss(clustering_model, embeddings)
            loss.backward()
            optimizer.step()
    losses.append(loss.item())

Epoch 1/1


100%|██████████| 125/125 [00:54<00:00,  2.29it/s]


Epoch 1/1


100%|██████████| 125/125 [00:55<00:00,  2.27it/s]


Epoch 1/1


100%|██████████| 125/125 [00:55<00:00,  2.25it/s]


In [None]:

# Plot the elbow graph
plt.plot(range_of_n_clusters, losses, 'bx-')
plt.xlabel('Number of Clusters')
plt.ylabel('Loss')
plt.title('Elbow Method For Optimal Number of Clusters')
#save the plot
plt.savefig('models/ElbowGraphs/elbow_plot_img.png')
plt.show()

NameError: name 'losses' is not defined

In [35]:
#Define the final number of clusters

num_clusters = 400

In [36]:
# Initialize the clustering model and train it

clustering_model = ClusteringModel(embeddings_size, num_clusters).to(device)
optimizer = optim.Adam(clustering_model.parameters(), lr=0.001)

num_epochs = 1
for epoch in range(num_epochs):
    for batch in tqdm(train_loader):
        imgs, _, _ = batch
        imgs = imgs.to(device)
        optimizer.zero_grad()
        embeddings = resnet_model(imgs)
        loss = clustering_loss(clustering_model, embeddings)
        loss.backward()
        optimizer.step()



100%|██████████| 125/125 [00:55<00:00,  2.27it/s]


In [37]:
# Save the model
torch.save(clustering_model.state_dict(), 'models/clustering_model_img.pth')

In [41]:
#Save in a dataframe, the name of training images and their cluster prediction

# Initialize a list to hold the predictions
predictions = []

# Loop over the training images
for batch in tqdm(train_loader):
    imgs, idx, img_paths = batch
    imgs = imgs.to(device)
    embeddings = resnet_model(imgs)
    cluster = clustering_model(embeddings)
    for i in range(len(idx)):
        predictions.append({'image': img_paths[i][len(img_paths[i])-18:], 'cluster': cluster[i].item()})

# Convert the list to a DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the DataFrame
predictions_df.to_csv('train_predictions_img.csv', index=False)

100%|██████████| 125/125 [00:55<00:00,  2.25it/s]


In [39]:
# Idem for the test dataset 

# Initialize a list to hold the predictions
predictions = []

# Loop over the test images
for batch in tqdm(test_loader):
    imgs, idx, img_paths = batch
    imgs = imgs.to(device)
    embeddings = resnet_model(imgs)
    cluster = clustering_model(embeddings)
    for i in range(len(idx)):
        predictions.append({'image': img_paths[i][len(img_paths[i])-18:], 'cluster': cluster[i].item()})

# Convert the list to a DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the DataFrame
predictions_df.to_csv('test_predictions_img.csv', index=False)

100%|██████████| 63/63 [00:28<00:00,  2.23it/s]


In [40]:
predictions_df 

Unnamed: 0,image,cluster
0,075/0759974004.jpg,129
1,080/0808882004.jpg,129
2,074/0744306009.jpg,129
3,075/0754018003.jpg,129
4,087/0878972002.jpg,129
...,...,...
1995,053/0537346013.jpg,129
1996,074/0743225001.jpg,129
1997,057/0571319003.jpg,129
1998,070/0706656001.jpg,129
