In [1]:
#Load clustering models for tabular and image data

import os
import numpy as np
import pandas as pd
import pickle
import torch
from tqdm import tqdm
import torch.nn as nn
import networkx as nx
#Svd from scipy for sparse matrix
from scipy.sparse.linalg import svds

In [2]:
#Disable warnings
import warnings
warnings.filterwarnings('ignore')


In [3]:
#Load tabular data clustering model using joblib
def load_tabular_model(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model

#Load image data clustering model
def load_image_clustering_model(model_path):
    model = torch.load(model_path)
    return model

In [4]:
model_path_img = 'models/clustering_model_img_ft_1.pth'
model_path_tab = 'models/kmeans_model_ft_1.pkl'
scaler_path = 'models/scaler_ft_1.pkl'

#Load image clustering model
model_img = load_image_clustering_model(model_path_img)

#Load tabular clustering model
model_tab = load_tabular_model(model_path_tab)

#Load scaler
with open(scaler_path, 'rb') as f:
    scaler = pickle.load(f)

In [5]:
G_tab = nx.Graph()
G_img = nx.Graph()

In [6]:
df = pd.read_csv('train_hm_tab_cluster.csv')
df = df.drop(columns=['prod_name','product_code','department_no', 'index_code', 	
       'product_type_name', 'product_group_name',
       'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc'])

In [7]:
from dataset import myDataset
from torch.utils.data import DataLoader

get_preprocessed_image = True
batch_size = 64
train_test_split = 0.9
my_path_hm = os.path.join(os.getcwd(), 'data/h&mdataset/images/')
my_path_fash = os.path.join(os.getcwd(), 'data/fashion-dataset/images/')

dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=10)

Skipping .DS_Store as it is not a jpg file


In [None]:
get_preprocessed_image = True
train_test_split = 0.9
my_path_hm = os.path.join(os.getcwd(), 'data/h&mdataset/images/')
my_path_fash = os.path.join(os.getcwd(), 'data/fashion-dataset/images/')

dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')

#Get the indices from the file
train_subset_indices = pd.read_csv('data/h&mdataset/train_subset_indices.pt')
val_subset_indices = pd.read_csv('data/h&mdataset/val_subset_indices.pt')
test_subset_indices = pd.read_csv('data/h&mdataset/test_subset_indices.pt')

test_subset_indices = torch.cat((test_subset_indices, val_subset_indices), 0)

train_dataset = torch.utils.data.Subset(dataset, train_subset_indices)
test_dataset = torch.utils.data.Subset(dataset, test_subset_indices)

dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=10)

In [8]:
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled['cluster'] = model_tab.predict(df_scaled)

#Add first the cluster nodes
clusters = df_scaled['cluster'].unique()

for cluster in tqdm(clusters):
    G_tab.add_node(cluster, type='cluster')

100%|██████████| 400/400 [00:00<00:00, 981123.74it/s]


In [9]:
#Add the article nodes
for i in tqdm(range(len(df_scaled))):
    name_article = dataset.get_name_img(i)
    G_tab.add_node(i, type='article', cluster=df_scaled['cluster'][i], name=name_article)

100%|██████████| 105542/105542 [00:00<00:00, 286622.85it/s]


In [10]:
#Add the edges
for i in tqdm(range(len(df_scaled))):
    name_article = dataset.get_name_img(train_subset_indices[i])
    cluster = df_scaled['cluster'].iloc[i]
    G_tab.add_edge(cluster, i)

100%|██████████| 105542/105542 [00:00<00:00, 160518.22it/s]


In [11]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using CUDA')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    print('Using MPS')
else :
    device = torch.device('cpu')
    print('Using CPU')

Using MPS


In [12]:
#We want to use a resnet50 from torchvision to have the embedding of an image, use a pretrained resnet and remove the last layer

class ResNet(nn.Module):
    def __init__(self, num_classes=132):
        super(ResNet, self).__init__()
        self.resnet = torch.hub.load('pytorch/vision:v0.6.0', 'resnet50', pretrained=True)
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])
        self.fc = nn.Linear(2048, num_classes)
        self.resnet.eval()
        for param in self.resnet.parameters():
            param.requires_grad = False
        for param in self.fc.parameters():
            param.requires_grad = False

    def forward(self, x):
        return self.resnet(x)

In [13]:
resnet_model = ResNet()
model_ft_path = 'models/resnet_finetuned_ft_1.pth'
if os.path.exists(model_ft_path):
    resnet_model.load_state_dict(torch.load(model_ft_path))
    print('ResNet model loaded')
else:
    print('No fintuned model found, Resnet pretrained model will be used')

resnet_model.to(device)

Using cache found in /Users/jskaf/.cache/torch/hub/pytorch_vision_v0.6.0


ResNet model loaded


ResNet(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d(64, 25

In [14]:
class ClusteringModel(nn.Module):
    def __init__(self, embedding_size, num_clusters):
        super(ClusteringModel, self).__init__()
        self.num_embeddings = embedding_size
        self.num_clusters = num_clusters
        self.centers = nn.Parameter(torch.randn(num_clusters, embedding_size))

    def forward(self, embeddings):
        embeddings = embeddings.flatten(2).flatten(1)
        # Compute the distance between each embedding and each cluster center
        distances = torch.cdist(embeddings, self.centers)
        # Assign each embedding to the closest cluster
        assignments = torch.argmin(distances, dim=1)
        return assignments

In [15]:
embedding_size = 2048
num_clusters = 200
model_img = ClusteringModel(embedding_size, num_clusters).to(device)

In [16]:
#Add first the cluster nodes
num_clusers = model_img.num_clusters

for cluster in tqdm(range(num_clusers)):
    G_img.add_node(cluster, type='cluster')

100%|██████████| 200/200 [00:00<00:00, 708497.30it/s]


In [17]:
#Create a dataframe with as first column the image name and as second column the cluster assigned

df_img = {'image_name': [], 'cluster': []}
for batch in tqdm(dataloader):
    imgs, idx, _ = batch
    imgs = imgs.to(device)
    embeddings = resnet_model(imgs)
    assignments = model_img(embeddings)
    for j in range(len(assignments)):
        df_img['image_name'].append(dataset.get_name_img(train_subset_indices[idx[j]]))
        df_img['cluster'].append(assignments[j].item())

100%|██████████| 1643/1643 [09:07<00:00,  3.00it/s]


In [18]:
#Save the dataframe
df_img = pd.DataFrame(df_img)
df_img.to_csv('data/image_cluster.csv', index=False)

In [19]:
#Add the article nodes
for i in tqdm(range(len(df_img))):
    name_article = df_img['image_name'].iloc[i]
    G_img.add_node(i, type='article', cluster=df_img['cluster'][i], name=name_article)

100%|██████████| 105099/105099 [00:00<00:00, 135736.16it/s]


In [20]:
#Add the edges
for i in tqdm(range(len(df_img))):
    name_article = df_img['image_name'].iloc[i]
    cluster = df_img['cluster'][i]
    G_img.add_edge(cluster, f"article {name_article}")

100%|██████████| 105099/105099 [00:00<00:00, 107459.69it/s]


In [21]:
#Save the graphs
nx.write_gexf(G_tab, "models/graph_tab.gexf")
nx.write_gexf(G_img, "models/graph_img.gexf")

In [22]:
#Load the graphs
G_tab = nx.read_gexf("models/graph_tab.gexf")
G_img = nx.read_gexf("models/graph_img.gexf")

In [23]:
#Compute the number of edges

print(f"Number of edges in the tabular graph: {G_tab.number_of_edges()}")
print(f"Number of edges in the image graph: {G_img.number_of_edges()}")
print(f"Number of nodes in the tabular graph: {G_tab.number_of_nodes()}")
print(f"Number of nodes in the image graph: {G_img.number_of_nodes()}")

Number of edges in the tabular graph: 105542
Number of edges in the image graph: 105099
Number of nodes in the tabular graph: 105542
Number of nodes in the image graph: 210198


In [24]:
def calcul_spectre_graphe(graph):
    """
    Calcul du spectre d'un graphe
    :param graph: un graphe
    :return: le spectre
    """
    #get the adjancy matrix as scipy sparse matrix
    adj = nx.to_scipy_sparse_array(graph)
    #transform the type of data into float
    adj = adj.astype(float)
    #get the eigenvalues
    _, s, _ = svds(adj)
    return s

In [25]:
def noyau_spectral(graph1, graph2, sigma):
    """
    Noyau spectral entre deux graphes
    :param graph1: le premier graphe
    :param graph2: le deuxième graphe
    :param sigma: l'hyperparamètre du noyau Gaussien (lien avec la variance)
    :return: le produit scalaire entre les deux graphes
    """
    s1 = calcul_spectre_graphe(graph1)
    s2 = calcul_spectre_graphe(graph2)
    diff = s1.shape[0] - s2.shape[0]
    if diff < 0:
        s1 = np.pad(s1, (0, -diff))
    elif diff > 0:
        s2 = np.pad(s2, (0, diff))
    noyau = np.exp(-np.linalg.norm(s1 - s2) ** 2.0 / sigma ** 2.0)
    return noyau

In [26]:
print(f"Noyau spectral du graphe tabulaire sur lui-même: {noyau_spectral(G_tab, G_tab, 1.0)}")

Noyau spectral du graphe tabulaire sur lui-même: 1.0


In [27]:
print(f"Noyau spectral du graphe image sur lui-même: {noyau_spectral(G_img, G_img, 1.0)}")

Noyau spectral du graphe image sur lui-même: 1.0


In [28]:
print(f"Noyau spectral entre les deux graphes: {noyau_spectral(G_tab, G_img, 1.0)}")

Noyau spectral entre les deux graphes: 0.0


In [None]:
#Affichage du graphe tabulaire
import ipysigma 
ipysigma.Sigma(G_tab, node_color='cluster', edge_color='black')