In [None]:
import pandas as pd
import torch 
import pickle
from torch import nn
import torchvision
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os

In [None]:
# Prediction tables read from the working directory (judging by the file
# names: image / text / tabular training predictions and image test predictions).
train_img_csv, train_txt_csv, train_csv_tab, test_img_csv = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_predictions_img.csv',
        'train_predictions_txt.csv',
        'train_hm_tab_cluster.csv',
        'test_predictions_img.csv',
    )
)

In [None]:
# Columns removed from the articles table before it is used as tabular
# features; every column not listed here is kept.
ARTICLE_COLUMNS_TO_DROP = [
    'article_id', 'prod_name', 'product_code', 'product_type_no',
    'department_no', 'index_code',
    'product_type_name', 'product_group_name',
    'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name', 'perceived_colour_master_name',
    'department_name', 'index_name', 'index_group_name', 'section_name',
    'garment_group_name', 'detail_desc',
]

df = pd.read_csv('data/h&mdataset/articles.csv').drop(columns=ARTICLE_COLUMNS_TO_DROP)

In [None]:
from dataset import myDataset

# Dataset options
get_preprocessed_image = True
train_test_split = 0.9  # not referenced by the cells shown in this notebook

# Image folders, resolved against the current working directory
working_dir = os.getcwd()
my_path_hm = os.path.join(working_dir, 'data/h&mdataset/images/')
my_path_fash = os.path.join(working_dir, 'data/fashion-dataset/images/')

# The trailing 'hm' argument presumably selects the H&M image set -- confirm in dataset.py
dataset = myDataset(my_path_hm, my_path_fash, get_preprocessed_image, 'hm')

In [None]:
# Rows that appear in both the test image predictions and the tabular
# training table, matched on the image identifier columns.
merge_keys = ['image', 'img_name']
common_points = test_img_csv.merge(train_csv_tab, how='inner', on=merge_keys)

In [None]:
class ResNet(nn.Module):
    """Frozen ResNet-50 used as a feature extractor.

    The network is fetched through ``torch.hub`` with pretrained weights, its
    last child module is removed, and the remaining layers are put in eval
    mode with gradients disabled.
    """

    def __init__(self):
        super().__init__()
        backbone = torch.hub.load('pytorch/vision:v0.6.0', 'resnet50', pretrained=True)
        # Keep every child module except the last one.
        feature_layers = list(backbone.children())[:-1]
        self.resnet = nn.Sequential(*feature_layers)
        self.resnet.eval()
        # Freeze all weights: the extractor is never fine-tuned here.
        self.resnet.requires_grad_(False)

    def forward(self, x):
        """Return the output of the truncated network for the batch ``x``."""
        features = self.resnet(x)
        return features
    
# Feature extractor instance and the image preprocessing applied before it.
# NOTE(review): ResNet loads its weights through the torch.hub v0.6.0 tag with
# pretrained=True, while the transforms come from the IMAGENET1K_V2 preset --
# confirm that this pairing is intended.
resnet_model = ResNet()
imagenet_v2_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
preprocess = imagenet_v2_weights.transforms()

In [None]:
#Load tabular data clustering model
def load_tabular_model(model_path):
    """Unpickle and return the object stored in the file at ``model_path``.

    Unpickling can execute arbitrary code, so only call this on trusted files.
    """
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)

#Load image data clustering model
def load_image_clustering_model(model_path):
    """Load and return the object saved with ``torch.save`` at ``model_path``.

    Tensors are mapped to the CPU so that a file saved from a CUDA device can
    still be loaded on a machine without a GPU.

    ``torch.load`` relies on pickle: only call this on trusted files.
    NOTE(review): recent PyTorch releases default to ``weights_only=True``,
    which refuses to unpickle arbitrary objects; if this file holds a full
    pickled model, loading may need ``weights_only=False`` -- confirm against
    the installed version.
    """
    model = torch.load(model_path, map_location='cpu')
    return model

In [None]:
# Locations of the persisted artefacts
model_path_img = 'models/image_clustering_model.pth'
model_path_tab = 'models/kmeans_model.pkl'
scaler_path = 'models/scaler.pkl'

# Clustering models: image first, then tabular
model_img = load_image_clustering_model(model_path_img)
model_tab = load_tabular_model(model_path_tab)

# The scaler is stored as a pickle too, so the pickle helper reads it
scaler = load_tabular_model(scaler_path)

In [None]:
#Choose a random common point
common_point = common_points.sample()

# Both models need the dataset index of the sampled point: look it up once.
# NOTE(review): the same index is used for `dataset` and for `df` rows, which
# assumes both follow the same ordering -- confirm.
index_common_point = dataset.get_index_from_img_name(common_point['img_name'].values[0])

#Get its cluster for tabular model

common_point_tab = common_point.drop(['image'], axis=1)
# Double brackets keep a one-row DataFrame (2-D); a 1-D Series is rejected by
# scaler.transform / predict, which expect (n_samples, n_features).
point_tab = df.iloc[[index_common_point]]
point_tab = scaler.transform(point_tab)
point_tab_cluster = model_tab.predict(point_tab)

#Get its cluster for image model

common_point_img = common_point.drop(['image'], axis=1)
point_img = dataset[index_common_point][0].unsqueeze(0)
point_img = preprocess(point_img)
# Collapse the trailing dimensions of the ResNet output so the embedding is a
# single row per image.
# NOTE(review): assumes model_img.predict expects 2-D (n_samples, n_features)
# input -- confirm against how the image clustering model was trained.
point_img = resnet_model(point_img).flatten(start_dim=1)
point_img_cluster = model_img.predict(point_img)


In [None]:
# Cluster ids predicted for the sampled point by each model
tab_cluster_id = point_tab_cluster[0]
img_cluster_id = point_img_cluster[0]

# Training rows assigned to the same cluster by the tabular model
tabs_cluster = train_csv_tab.loc[train_csv_tab['cluster'].eq(tab_cluster_id)]

# Training rows assigned to the same cluster by the image model
imgs_cluster = train_img_csv.loc[train_img_csv['cluster'].eq(img_cluster_id)]

In [None]:
#calculate jaccard similarity

def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. When both collections are empty the union is empty as well; the
    two sets are then identical and 1.0 is returned instead of dividing by zero.
    """
    s1, s2 = set(list1), set(list2)
    union_size = len(s1 | s2)
    if union_size == 0:
        return 1.0
    return len(s1 & s2) / union_size

# Overlap, on image names, between the tabular cluster and the image cluster
cluster_overlap = jaccard_similarity(tabs_cluster['img_name'], imgs_cluster['img_name'])
cluster_overlap

In [None]:
# Translate every image name of each cluster into its dataset index
index_tab = list(map(dataset.get_index_from_img_name, tabs_cluster['img_name']))
index_img = list(map(dataset.get_index_from_img_name, imgs_cluster['img_name']))

# Images of the image cluster (first element of each dataset item) and
# tabular rows of the tabular cluster
imgs = [dataset[idx][0] for idx in index_img]
tabs = [df.iloc[idx] for idx in index_tab]


In [None]:
#Do a KNN for the common point on the tabular data

from sklearn.neighbors import NearestNeighbors

# Fit on the rows of the tabular cluster in the same scaled space as the query:
# point_tab went through scaler.transform, so raw rows would not be comparable.
tabs_scaled = scaler.transform(df.iloc[index_tab])

# Never ask for more neighbours than there are points in the cluster
n_neighbors_tab = min(5, len(index_tab))

neigh = NearestNeighbors(n_neighbors=n_neighbors_tab)
neigh.fit(tabs_scaled)
# kneighbors expects a 2-D query of shape (1, n_features)
query_tab = np.asarray(point_tab).reshape(1, -1)
distances, indices = neigh.kneighbors(query_tab, n_neighbors_tab)

#Get the images corresponding to the indices
# `indices` are positions inside the tabular cluster, so they must be resolved
# through index_tab (index_img belongs to the image cluster).
imgs_knn_tab = [dataset[index_tab[i]][0] for i in indices[0]]

In [None]:
#Do a KNN for the common point on the image data

#transform the images into their resnet embeddings
# Each embedding is flattened to one row so the stacked result is the 2-D
# (n_images, n_features) array NearestNeighbors expects. A separate name is
# used instead of overwriting `imgs`, so re-running this cell is safe.
with torch.no_grad():
    img_embeddings = np.concatenate([
        resnet_model(preprocess(img).unsqueeze(0)).flatten(start_dim=1).detach().numpy()
        for img in imgs
    ])

# Never ask for more neighbours than there are images in the cluster
n_neighbors_img = min(5, len(img_embeddings))

neigh = NearestNeighbors(n_neighbors=n_neighbors_img)
neigh.fit(img_embeddings)
# Flatten the query embedding to a single 2-D row, whatever its original shape
query_embedding = point_img.detach().numpy().reshape(1, -1)
distances, indices = neigh.kneighbors(query_embedding, n_neighbors_img)

#Get the images corresponding to the indices

imgs_knn_img = [dataset[index_img[i]][0] for i in indices[0]]


In [None]:
#Plot the images

import matplotlib.pyplot as plt

# One row of five panels showing the neighbours found on the tabular features
fig, axs = plt.subplots(1, 5, figsize=(20, 20))
for panel_idx, neighbour_img in enumerate(imgs_knn_tab):
    panel = axs[panel_idx]
    panel.imshow(neighbour_img)
    panel.axis('off')
plt.show()

In [None]:
#Plot the images

# One row of five panels showing the neighbours found on the image embeddings
fig, axs = plt.subplots(1, 5, figsize=(20, 20))
for panel_idx, neighbour_img in enumerate(imgs_knn_img):
    panel = axs[panel_idx]
    panel.imshow(neighbour_img)
    panel.axis('off')
plt.show()