In [1]:
#Load clustering models for tabular and image data

import os
import numpy as np
import pandas as pd
import pickle
import torch
from tqdm import tqdm
import torch.nn as nn
import networkx as nx
#Svd from scipy for sparse matrix
from scipy.sparse.linalg import svds

In [2]:
#Disable warnings
import warnings
warnings.filterwarnings('ignore')


In [3]:
#Load tabular data clustering model using joblib
def load_tabular_model(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model

#Load image data clustering model
def load_image_clustering_model(model_path):
    model = torch.load(model_path)
    return model

In [4]:
model_path_img = 'models/clustering_model_img_ft_1.pth'
model_path_tab = 'models/kmeans_model_ft_1.pkl'
scaler_path = 'models/scaler_ft_1.pkl'

#Load image clustering model
model_img = load_image_clustering_model(model_path_img)

#Load tabular clustering model
model_tab = load_tabular_model(model_path_tab)

#Load scaler
with open(scaler_path, 'rb') as f:
    scaler = pickle.load(f)

In [5]:
G_tab = nx.Graph()
G_img = nx.Graph()

In [6]:
df = pd.read_csv('train_hm_tab_cluster.csv')

In [7]:
df

Unnamed: 0.1,Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,index_group_no,section_no,garment_group_no,cluster,img_name
0,58344,719601001,254,1010010,7,1,12,1,2,1005,396,071/0719601001.jpg
1,14767,564312010,286,1010001,72,2,2,3,26,1017,334,056/0564312010.jpg
2,15689,567739002,262,1010016,73,4,2,4,45,1007,11,056/0567739002.jpg
3,37069,652418002,304,1010016,51,3,4,4,79,1021,332,065/0652418002.jpg
4,19199,582789004,265,1010016,63,4,6,1,18,1003,386,058/0582789004.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,21427,594177005,272,1010016,51,1,4,4,44,1002,31,059/0594177005.jpg
7996,3984,472064011,256,1010001,6,1,12,4,44,1002,378,047/0472064011.jpg
7997,46517,686022001,87,1010016,9,4,5,1,64,1020,188,068/0686022001.jpg
7998,34435,641855001,302,1010016,10,3,9,4,79,1021,315,064/0641855001.jpg


In [8]:
from dataset import myDataset
from torch.utils.data import DataLoader

get_preprocessed_image = True
batch_size = 64
train_test_split = 0.9
my_path_hm = os.path.join(os.getcwd(), 'data/h&mdataset/images/')
my_path_fash = os.path.join(os.getcwd(), 'data/fashion-dataset/images/')

dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=10)

Skipping .DS_Store as it is not a jpg file


In [9]:
get_preprocessed_image = True
train_test_split = 0.9
my_path_hm = os.path.join(os.getcwd(), 'data/h&mdataset/images/')
my_path_fash = os.path.join(os.getcwd(), 'data/fashion-dataset/images/')

dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')

#Get the indices from the file
train_subset_indices = torch.load('data/h&mdataset/train_subset_indices.pt')
val_subset_indices = torch.load('data/h&mdataset/val_subset_indices.pt')
test_subset_indices = torch.load('data/h&mdataset/test_subset_indices.pt')

test_subset_indices = torch.cat((test_subset_indices, val_subset_indices), 0)

train_dataset = torch.utils.data.Subset(dataset, train_subset_indices)
test_dataset = torch.utils.data.Subset(dataset, test_subset_indices)

dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=10)

Skipping .DS_Store as it is not a jpg file


In [10]:
#Add first the cluster nodes
clusters = df['cluster'].unique()

for cluster in tqdm(clusters):
    G_tab.add_node(cluster, type='cluster')

100%|██████████| 400/400 [00:00<00:00, 864359.40it/s]


In [11]:
#Add the article nodes
for i in tqdm(range(len(df))):
    name_article = df['img_name'].iloc[i]
    G_tab.add_node(name_article, type='article', cluster=df['cluster'].iloc[i])

100%|██████████| 8000/8000 [00:00<00:00, 105166.53it/s]


In [12]:
#Add the edges
for i in tqdm(range(len(df))):
    name_article = df['img_name'].iloc[i]
    cluster = df['cluster'].iloc[i]
    G_tab.add_edge(cluster, name_article)

100%|██████████| 8000/8000 [00:00<00:00, 110420.73it/s]


In [13]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using CUDA')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    print('Using MPS')
else :
    device = torch.device('cpu')
    print('Using CPU')

Using MPS


In [14]:
#Add first the cluster nodes
num_clusers = 200

for cluster in tqdm(range(num_clusers)):
    G_img.add_node(cluster, type='cluster')

100%|██████████| 200/200 [00:00<00:00, 876552.56it/s]


In [15]:
df_img = pd.read_csv('train_predictions_img.csv')

In [16]:
df_img

Unnamed: 0,image,cluster
0,054/0549263001.jpg,129
1,051/0510461001.jpg,129
2,053/0537631002.jpg,129
3,085/0854830002.jpg,129
4,060/0600768003.jpg,129
...,...,...
7995,072/0726172001.jpg,129
7996,091/0918642002.jpg,129
7997,063/0637858002.jpg,129
7998,090/0909823002.jpg,129


In [17]:
#Add the article nodes
for i in tqdm(range(len(df_img))):
    name_article = df_img['image'].iloc[i]
    G_img.add_node(i, type='article', cluster=df_img['cluster'][i])

100%|██████████| 8000/8000 [00:00<00:00, 117217.86it/s]


In [18]:
#Add the edges
for i in tqdm(range(len(df_img))):
    name_article = df_img['image'].iloc[i]
    cluster = df_img['cluster'][i]
    G_img.add_edge(cluster, f"article {name_article}")

100%|██████████| 8000/8000 [00:00<00:00, 52378.63it/s]


In [19]:
#Save the graphs
nx.write_gexf(G_tab, "models/graph_tab.gexf")
nx.write_gexf(G_img, "models/graph_img.gexf")

In [20]:
#Load the graphs
G_tab = nx.read_gexf("models/graph_tab.gexf")
G_img = nx.read_gexf("models/graph_img.gexf")

In [21]:
#Compute the number of edges

print(f"Number of edges in the tabular graph: {G_tab.number_of_edges()}")
print(f"Number of edges in the image graph: {G_img.number_of_edges()}")
print(f"Number of nodes in the tabular graph: {G_tab.number_of_nodes()}")
print(f"Number of nodes in the image graph: {G_img.number_of_nodes()}")

Number of edges in the tabular graph: 8000
Number of edges in the image graph: 8000
Number of nodes in the tabular graph: 8400
Number of nodes in the image graph: 16000


In [22]:
def calcul_spectre_graphe(graph):
    """
    Calcul du spectre d'un graphe
    :param graph: un graphe
    :return: le spectre
    """
    #get the adjancy matrix as scipy sparse matrix
    adj = nx.to_scipy_sparse_array(graph)
    #transform the type of data into float
    adj = adj.astype(float)
    #get the eigenvalues
    _, s, _ = svds(adj)
    return s

In [23]:
def noyau_spectral(graph1, graph2, sigma):
    """
    Noyau spectral entre deux graphes
    :param graph1: le premier graphe
    :param graph2: le deuxième graphe
    :param sigma: l'hyperparamètre du noyau Gaussien (lien avec la variance)
    :return: le produit scalaire entre les deux graphes
    """
    s1 = calcul_spectre_graphe(graph1)
    s2 = calcul_spectre_graphe(graph2)
    diff = s1.shape[0] - s2.shape[0]
    if diff < 0:
        s1 = np.pad(s1, (0, -diff))
    elif diff > 0:
        s2 = np.pad(s2, (0, diff))
    noyau = np.exp(-np.linalg.norm(s1 - s2) ** 2.0 / sigma ** 2.0)
    return noyau

In [24]:
print(f"Noyau spectral du graphe tabulaire sur lui-même: {noyau_spectral(G_tab, G_tab, 1.0)}")

Noyau spectral du graphe tabulaire sur lui-même: 1.0


In [25]:
print(f"Noyau spectral du graphe image sur lui-même: {noyau_spectral(G_img, G_img, 1.0)}")

Noyau spectral du graphe image sur lui-même: 1.0


In [26]:
print(f"Noyau spectral entre les deux graphes: {noyau_spectral(G_tab, G_img, 1.0)}")

Noyau spectral entre les deux graphes: 0.0


In [27]:
#Affichage du graphe tabulaire
import ipysigma 
ipysigma.Sigma(G_tab, node_color='cluster', edge_color='black')

Sigma(nx.Graph with 8,400 nodes and 8,000 edges)