In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
from os import listdir
from os.path import exists

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [2]:
from sklearn.cluster import KMeans
from sklearn import manifold, decomposition
from sklearn.metrics.cluster import adjusted_rand_score

In [3]:
import cv2

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [5]:
# Locations of the Flipkart sample, relative to the notebook's working
# directory: the product CSV and the folder holding the product images.
# NOTE(review): path_img_little is not referenced in the cells shown here.
path_csv = './data/Flipkart/flipkart_com-ecommerce_sample_1050.csv'
path_img = './data/Flipkart/Images'
path_img_little = './data/Flipkart/ImagesLittles'

In [6]:
# Load the Flipkart sample CSV into a DataFrame.
data = pd.read_csv(path_csv)

In [7]:
def get_from_tree(tree, level):
    """Return the node found at depth ``level`` of a category path.

    When ``level`` points at or past the end of ``tree``, the last element
    (the leaf) is returned instead. A negative ``level`` is always smaller
    than ``len(tree)`` and is therefore used as a regular Python index
    counted from the end.
    """
    return tree[level] if level < len(tree) else tree[-1]

In [8]:
# Strip the first two and last two characters of the raw category string,
# then split the remaining path on the ' >> ' separator into a list.
data['product_category_list'] = (
    data['product_category_tree']
    .apply(lambda raw_tree: raw_tree[2:-2])
    .str.split(' >> ')
)

In [9]:
def show_representation(data, cat, limit=None, X_tsne=None, title=''):
    """Draw a 2-D t-SNE scatter plot coloured by category.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``TSNE.fit_transform``. Only used when
        ``X_tsne`` is not supplied.
    cat : pandas.Series
        Category of each sample. Its index is reset, so it is matched to the
        embedding rows by position.
    limit : optional
        Unused; kept so existing calls keep working.
    X_tsne : array-like with two columns, optional
        Precomputed embedding. When given, t-SNE is not run again.
    title : str
        Title of the figure.

    Returns
    -------
    The 2-D embedding that was plotted, so it can be passed back in as
    ``X_tsne`` for a later call.
    """
    if X_tsne is None:
        # NOTE(review): recent scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        tsne = manifold.TSNE(n_components=2, perplexity=30, n_iter=2000, init='random')
        X_tsne = tsne.fit_transform(data)

    df_tsne = pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2'])
    df_tsne["class"] = cat.reset_index(drop=True)

    # Size the palette from the classes actually plotted instead of relying
    # on a notebook-level global defined in a later cell.
    n_classes = df_tsne["class"].nunique()

    plt.figure(figsize=(8, 5))
    sns.scatterplot(
        x="tsne1", y="tsne2",
        hue="class",
        palette=sns.color_palette('tab10', n_colors=n_classes), s=50,
        data=df_tsne,
        legend="brief")

    plt.title(title, fontsize=14, pad=35, fontweight='bold')
    plt.xlabel('tsne1', fontsize=14, fontweight='bold')
    plt.ylabel('tsne2', fontsize=14, fontweight='bold')
    plt.legend(prop={'size': 8})

    plt.show()

    return X_tsne

In [10]:
# Depth at which the category is read from each category path;
# -1 selects the last element of the path (the leaf).
level = -1

In [11]:
# Working table: image path, full category path, chosen category and an
# integer label for that category.
data_final = data[['image', 'product_category_list']].copy()
data_final.columns = ['img', 'cat_lst']
data_final['img'] = path_img+'/'+data_final['img']

data_final['cat'] = data_final['cat_lst'].apply(lambda tree : get_from_tree(tree, level))

# Labels are the rank of each category in value_counts() order, so label 0
# is the most frequent category.
list_labels = list(data_final['cat'].value_counts().index)
# Dictionary lookup instead of list.index(), which rescans the whole label
# list for every row.
data_final['label'] = data_final['cat'].map(
    {cat_name: rank for rank, cat_name in enumerate(list_labels)}
)

In [12]:
# Total number of distinct categories, and how many of them contain more
# than one product.
nb_cat = len(list_labels)
nb_cat_little = data_final['cat'].value_counts().gt(1).sum()

In [13]:
# Report the total number of categories and the number of categories that
# have more than one product.
print("Nombre de catégories total :", nb_cat)
print("Nombre de catégories de plus de 1 élément :", nb_cat_little)

Nombre de catégories total : 642
Nombre de catégories de plus de 1 élément : 96


In [14]:
# list_labels follows value_counts() order, so its first nb_cat_little
# entries are the categories with more than one product. Keep only those
# rows; reset_index() keeps the original row number in an 'index' column.
data_little = (
    data_final[data_final['cat'].isin(list_labels[:nb_cat_little])]
    .copy()
    .reset_index()
)

In [15]:
# Display the filtered table.
data_little

Unnamed: 0,index,img,cat_lst,cat,label
0,8,./data/Flipkart/Images/41384da51732c0b4df3de8f...,"[Watches, Wrist Watches, Alfajr Wrist Watches]",Alfajr Wrist Watches,75
1,18,./data/Flipkart/Images/62c35de4df27437d3597963...,"[Watches, Wrist Watches, Franck Bella Wrist Wa...",Franck Bella Wrist Watches,57
2,20,./data/Flipkart/Images/d7ca2b06b4f4d3b98c61c37...,"[Watches, Wrist Watches, Skmei Wrist Watches]",Skmei Wrist Watches,62
3,21,./data/Flipkart/Images/f74476ba64365d97a5be7d3...,"[Watches, Wrist Watches, Now Wrist Watches]",Now Wrist Watches,61
4,22,./data/Flipkart/Images/fda50982a672ef8835de5c1...,"[Watches, Wrist Watches, Franck Bella Wrist Wa...",Franck Bella Wrist Watches,57
...,...,...,...,...,...
499,1044,./data/Flipkart/Images/be0f39341d771aac5708497...,"[Baby Care, Baby & Kids Gifts, Stickers, Wallm...",Wallmantra Stickers,30
500,1046,./data/Flipkart/Images/fd6cbcc22efb6b761bd564c...,"[Baby Care, Baby & Kids Gifts, Stickers, Wallm...",Wallmantra Stickers,30
501,1047,./data/Flipkart/Images/5912e037d12774bb73a2048...,"[Baby Care, Baby & Kids Gifts, Stickers, Uberl...",Uberlyfe Stickers,91
502,1048,./data/Flipkart/Images/c3edc504d1b4f0ba6224fa5...,"[Baby Care, Baby & Kids Gifts, Stickers, Wallm...",Wallmantra Stickers,30


# Mon CODE

In [16]:
# Read the default VGG16 input shape. weights=None builds the architecture
# without loading the pretrained ImageNet weights, which are not needed just
# to look up a shape.
input_shape = VGG16(weights=None).input_shape
input_shape

(None, 224, 224, 3)

In [17]:
# VGG16 without its top layers (include_top=False); input_shape[1:] drops
# the leading batch dimension. The base is frozen (trainable = False).
base_model = VGG16(include_top=False, input_shape=input_shape[1:])
base_model.trainable = False

In [18]:
# Print the layer-by-layer summary of the frozen base.
base_model.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

## Conversion

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
from tensorflow.keras.utils import to_categorical

In [21]:
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

In [22]:
def preprocess_image(path):
    """Load the image at ``path`` and prepare it for VGG16.

    The image is resized to the height and width taken from the
    notebook-level ``input_shape``, converted to an array, and passed through
    the VGG16 ``preprocess_input`` function.
    """
    target_hw = input_shape[1:3]
    pixels = img_to_array(load_img(path, target_size=target_hw))
    return preprocess_input(pixels)

In [23]:
# Preprocess every image of the filtered table and stack the results into a
# single float32 array.
data_X = np.asarray(
    [preprocess_image(img_path) for img_path in data_little["img"]]
).astype(np.float32)

In [24]:
# One-hot encode the integer labels. The kept categories are the first
# nb_cat_little entries of list_labels, so their labels lie in
# 0..nb_cat_little-1.
data_y = to_categorical(data_little['label'], num_classes=nb_cat_little)

In [25]:
# Stratified hold-out split. random_state is fixed so that the split, and
# every score computed from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_X, data_y, test_size=0.33, stratify=data_y, random_state=42
)

In [26]:
# Classification head placed on top of the frozen VGG16 base; layer widths
# are multiples of the number of target classes.
flatten_layer = layers.Flatten()
dense_layer_1 = layers.Dense(nb_cat_little*10, activation='relu')
drop_out = layers.Dropout(0.2)
dense_layer_2 = layers.Dense(nb_cat_little*4, activation='relu')
prediction_layer = layers.Dense(nb_cat_little, activation='softmax')

# Assemble the network layer by layer. Dropout sits after the second dense
# layer, just before the softmax output.
model = models.Sequential()
model.add(base_model)
model.add(flatten_layer)
model.add(dense_layer_1)
model.add(dense_layer_2)
model.add(drop_out)
model.add(prediction_layer)

In [27]:
# Print the summary of the full model (frozen base + new head).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vgg16 (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 flatten (Flatten)           (None, 25088)             0         
                                                                 
 dense (Dense)               (None, 960)               24085440  
                                                                 
 dense_1 (Dense)             (None, 384)               369024    
                                                                 
 dropout (Dropout)           (None, 384)               0         
                                                                 
 dense_2 (Dense)             (None, 96)                36960     
                                                                 
Total params: 39,206,112
Trainable params: 24,491,424
No

In [None]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop when the validation loss has not improved by at least 0.02 for
# 4 epochs, and restore the best weights seen. (An earlier val_accuracy
# callback was assigned to the same name and immediately overwritten, so it
# never took effect; it has been removed.)
es = EarlyStopping(monitor='val_loss', min_delta = 0.02, mode='min', patience=4, restore_best_weights=True)

history = model.fit(X_train, y_train, epochs=20, validation_split=0.2, batch_size=32, callbacks=[es])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [None]:
def cat_to_label(l):
    """Return the position of the largest value in ``l``.

    Used to turn a one-hot (or probability) vector back into an integer
    class label.
    """
    scores = np.asarray(l)
    return scores.argmax()

In [None]:
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score

In [None]:
# Model outputs for the training images.
pred_train = model.predict(X_train)

In [None]:
# balanced_accuracy_score expects (y_true, y_pred). The metric averages
# per-class recall and is not symmetric, so the ground truth goes first.
balanced_accuracy_score(
    [cat_to_label(e) for e in y_train],
    [cat_to_label(e) for e in pred_train],
)

In [None]:
# Model outputs for the held-out test images.
pred_test = model.predict(X_test)

In [None]:
# Balanced accuracy on the test split: true labels first, predictions second.
balanced_accuracy_score(
    list(map(cat_to_label, y_test)),
    list(map(cat_to_label, pred_test)),
)

In [None]:
from plot_keras_history import show_history, plot_history
import matplotlib.pyplot as plt

In [None]:
# Plot the training history returned by model.fit.
# NOTE(review): plot_history is given path="standard.png", presumably the
# file the figure is saved to -- confirm against plot_keras_history docs.
show_history(history)
plot_history(history, path="standard.png")
# Close the current matplotlib figure.
plt.close()

# Data Augmentation

In [None]:
# Number of products per category, over all categories in data_final.
tmp = data_final['cat'].value_counts()

In [None]:
# Distribution of category sizes: how many categories contain 1, 2, 3, ...
# products. Labels are set explicitly because the automatic x label is the
# Series name, not what the axis measures.
fig, ax = plt.subplots()
sns.histplot(tmp, shrink=.8, discrete=True, ax=ax)
ax.set_xlim(0, tmp.max() + 1)
ax.set_xlabel('Number of products in the category')
ax.set_ylabel('Number of categories')
ax.set_title('Distribution of category sizes')
plt.show()