In [1]:
#Load clustering models for tabular and image data

import os
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import torch
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

  from pandas.core import (


In [None]:
#Load tabular data clustering model
def load_tabular_model(model_path):
    """Deserialize and return the pickled object stored at ``model_path``.

    Args:
        model_path: Path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE: unpickling can run arbitrary code; only point this at trusted files.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)

#Load image data clustering model
def load_image_clustering_model(model_path, map_location=None):
    """Load an object saved with ``torch.save`` from ``model_path``.

    Args:
        model_path: Path of the checkpoint file.
        map_location: Optional device remapping forwarded to ``torch.load``
            (for example ``'cpu'`` to open a checkpoint that was saved on a
            GPU on a machine without one). ``None`` keeps ``torch.load``'s
            default behaviour, i.e. tensors are restored to the device they
            were saved from.

    Returns:
        The deserialized object.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    return torch.load(model_path, map_location=map_location)

In [None]:
# Locations of the persisted artifacts, relative to the working directory.
model_path_img = 'models/image_clustering_model.pth'
model_path_tab = 'models/kmeans_model.pkl'
scaler_path = 'models/scaler.pkl'

# Restore the image clustering model (torch checkpoint).
model_img = load_image_clustering_model(model_path_img)

# Restore the tabular clustering model and its scaler. Both are plain pickle
# files, so the same pickle-loading helper serves for each of them.
model_tab = load_tabular_model(model_path_tab)
scaler = load_tabular_model(scaler_path)

In [None]:
import networkx as nx

# Two empty undirected graphs: one filled from the tabular clustering,
# the other from the image clustering.
G_tab, G_img = nx.Graph(), nx.Graph()

In [None]:
# Read the article table and discard the columns listed below in one expression.
# The columns that remain (article_id among them) are what the scaler and the
# tabular clustering model receive further down.
df = pd.read_csv('data/h&mdataset/articles.csv').drop(columns=[
    'prod_name',
    'product_code',
    'product_type_no',
    'department_no',
    'index_code',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
    'detail_desc',
])

In [None]:
# Standardise the features with the scaler that was loaded from disk.
# `transform` (not `fit_transform`) is used on purpose: re-fitting here would
# overwrite the statistics stored in the persisted scaler and the data would no
# longer be scaled the way the loaded clustering model expects.
# NOTE(review): assumes the pickled scaler was fitted on these same columns — confirm.
df_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)
df_scaled['cluster'] = model_tab.predict(df_scaled)

# `article_id` was standardised together with the other columns above. Put the
# raw ids back (positionally) once prediction is done, so that the node labels
# built from `df_scaled['article_id']` in the following cells are real article
# ids rather than z-scores.
df_scaled['article_id'] = df['article_id'].values

#Add first the cluster nodes
clusters = df_scaled['cluster'].unique()

for cluster in tqdm(clusters):
    G_tab.add_node(cluster, type='cluster')



In [None]:
#Add the article nodes
# Walk the id and cluster columns side by side instead of indexing row by row.
# `.values` keeps the numpy scalar types that per-row lookup would return.
tab_article_ids = df_scaled['article_id'].values
tab_article_clusters = df_scaled['cluster'].values
for name_article, article_cluster in tqdm(zip(tab_article_ids, tab_article_clusters), total=len(df_scaled)):
    G_tab.add_node(f"article {name_article}", type='article', cluster=article_cluster)

In [None]:
#Add the edges
# Connect every article node to the node of the cluster it was assigned to.
# Pairs are taken column-wise; `.values` preserves the numpy scalar types.
tab_edge_pairs = zip(df_scaled['cluster'].values, df_scaled['article_id'].values)
for cluster, name_article in tqdm(tab_edge_pairs, total=len(df_scaled)):
    G_tab.add_edge(cluster, f"article {name_article}")

In [None]:
# Choose the compute device: CUDA first, then Apple's MPS backend, otherwise CPU.
# `torch.backends.mps` is missing from older PyTorch builds, so it is looked up
# defensively; accessing it directly would raise AttributeError there instead of
# falling through to the CPU branch.
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using CUDA')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
    print('Using MPS')
else:
    device = torch.device('cpu')
    print('Using CPU')

In [None]:
from dataset import myDataset

# Loader settings.
batch_size = 64
get_preprocessed_image = True
train_test_split = 0.9  # defined here but not referenced by the cells shown in this notebook

# Image folders, resolved against the current working directory.
working_dir = os.getcwd()
my_path_hm = os.path.join(working_dir, 'data/h&mdataset/images/')
my_path_fash = os.path.join(working_dir, 'data/fashion-dataset/images/')

dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')

# shuffle stays off: the assignment loop later maps a batch position back to a
# dataset index with i*batch_size+j, which only holds for sequential batches.
dataloader = DataLoader(dataset, shuffle=False, num_workers=8, batch_size=batch_size)

In [None]:
#We want to use a resnet50 from torchvision to have the embedding of an image, use a pretrained resnet and remove the last layer

class ResNet(nn.Module):
    """Frozen ResNet-50 backbone used as an image embedding extractor.

    The pretrained network is fetched through ``torch.hub`` and its last child
    module is dropped, so ``forward`` returns the output of the remaining
    layers rather than class scores. The backbone is put in eval mode and all
    of its parameters have gradients disabled.
    """

    def __init__(self):
        super().__init__()
        backbone = torch.hub.load('pytorch/vision:v0.6.0', 'resnet50', pretrained=True)
        # Keep every child module except the final one.
        feature_layers = list(backbone.children())[:-1]
        self.resnet = nn.Sequential(*feature_layers)
        self.resnet.eval()
        # Freeze the weights: the network is only used for inference.
        for weight in self.resnet.parameters():
            weight.requires_grad = False

    def forward(self, x):
        """Run ``x`` through the truncated backbone and return its output."""
        # NOTE(review): output is presumably (batch, 2048, 1, 1) and is not flattened here — confirm.
        features = self.resnet(x)
        return features

In [None]:
# Build the frozen ResNet-50 feature extractor; its constructor fetches the
# pretrained weights through torch.hub.
# NOTE(review): the model stays on its default device — the `device` chosen
# earlier is not applied here. Confirm this is intended.
resnet_model = ResNet()

In [None]:
#Add first the cluster nodes
# One node per image cluster, labelled 0 .. num_clusters-1 and added in bulk.
num_clusers = model_img.num_clusters

G_img.add_nodes_from(tqdm(range(num_clusers)), type='cluster')

In [None]:
#Create a dataframe with as first column the image name and as second column the cluster assigned

# Collect one record per image and build the frame once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by row
# is quadratic in the number of rows anyway.
image_records = []

# Inference only: no autograd bookkeeping is needed.
with torch.no_grad():
    # tqdm wraps the loader itself (not `enumerate`) so the bar knows its total.
    for i, batch in enumerate(tqdm(dataloader)):
        embeddings = resnet_model(batch)
        assignments = model_img(embeddings)

        # Convert to plain Python values. A tensor element kept as-is hashes by
        # identity, so it would never match the integer cluster nodes when used
        # as a graph node, and it would be written to the CSV as "tensor(..)".
        # NOTE(review): assumes model_img returns one cluster index per image — confirm.
        if torch.is_tensor(assignments):
            batch_clusters = assignments.detach().cpu().tolist()
        else:
            batch_clusters = np.asarray(assignments).tolist()

        for j, cluster_id in enumerate(batch_clusters):
            image_records.append({
                # Sequential (unshuffled) batches: position j of batch i is
                # dataset index i*batch_size+j.
                'image_name': dataset.get_image_name(i*batch_size+j),
                'cluster': cluster_id,
            })

# Passing `columns` keeps both columns even when no image was processed.
df_img = pd.DataFrame(image_records, columns=['image_name', 'cluster'])

#Save the dataframe
df_img.to_csv('data/image_cluster.csv', index=False)

In [None]:
#Add the article nodes
# Walk the name and cluster columns side by side instead of indexing row by row.
img_article_names = df_img['image_name'].values
img_article_clusters = df_img['cluster'].values
for name_article, article_cluster in tqdm(zip(img_article_names, img_article_clusters), total=len(df_img)):
    G_img.add_node(f"article {name_article}", type='article', cluster=article_cluster)

In [None]:
#Add the edges
# Connect every image's article node to the node of its assigned cluster.
img_edge_pairs = zip(df_img['cluster'].values, df_img['image_name'].values)
for cluster, name_article in tqdm(img_edge_pairs, total=len(df_img)):
    G_img.add_edge(cluster, f"article {name_article}")

In [None]:
#Save the graphs
# `nx.write_gpickle` was removed in networkx 3.0. Pickling the graph directly
# with the highest protocol produces the same kind of file on any networkx version.
for graph, graph_path in ((G_tab, 'data/tabular_graph.gpickle'),
                          (G_img, 'data/image_graph.gpickle')):
    with open(graph_path, 'wb') as graph_file:
        pickle.dump(graph, graph_file, pickle.HIGHEST_PROTOCOL)

In [None]:
#Compute the number of edges

# Report the size of each graph: edge counts for both graphs first, then node
# counts, tabular graph before image graph in each pair.
print(f"Number of edges in the tabular graph: {G_tab.number_of_edges()}")
print(f"Number of edges in the image graph: {G_img.number_of_edges()}")
print(f"Number of nodes in the tabular graph: {G_tab.number_of_nodes()}")
print(f"Number of nodes in the image graph: {G_img.number_of_nodes()}")

In [None]:
#Compare the two graphs using the Jaccard similarity

def jaccard_similarity(g, h):
    """Return the Jaccard similarity |g ∩ h| / |g ∪ h|, rounded to 3 decimals.

    Both arguments are treated as sets, so repeated elements count once.
    Elements are compared by plain equality: an edge tuple ``(u, v)`` is not
    considered equal to ``(v, u)``.

    Args:
        g: Iterable of hashable items (e.g. the edges of a graph).
        h: Iterable of hashable items.

    Returns:
        A float in [0, 1]. Two empty inputs are regarded as identical and
        yield 1.0 instead of raising ZeroDivisionError.
    """
    items_g, items_h = set(g), set(h)
    # Size of the real union, so duplicates in the inputs cannot skew the ratio.
    union_size = len(items_g | items_h)
    if union_size == 0:
        return 1.0
    return round(len(items_g & items_h) / union_size, 3)

# Edges are compared as exact (cluster, "article <name>") pairs.
# NOTE(review): this presumes cluster ids and article labels mean the same thing
# in both graphs (same numbering, same naming) — confirm before reading the score.
jaccard_similarity(G_img.edges(), G_tab.edges())