In [1]:
import sys
import os

if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive/', force_remount=True)
    %cd drive/MyDrive/nov23_bds_rakuten
else:
    root_files = os.listdir()
    if not "requirements.txt" in root_files:
        print("The jupyter server root should be set to root of git project")
        print("If you're on vscode add the following to your settings.json file")
        print('"jupyter.notebookFileRoot": "${workspaceFolder}"')

Mounted at /content/drive/
/content/drive/MyDrive/nov23_bds_rakuten


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from pathlib import Path
import re
from tqdm import tqdm

import zipfile

from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical
from keras.models import load_model

random_state = 42
#image_shape = (500,500,3)
image_shape = (150,150,1)
output_dir = "output/data-modeling-images-1"

In [3]:
if 'google.colab' in sys.modules:
    image_path = "/tmp/rakuten-images"
    image_dir = Path(image_path)
    if image_dir.exists():
        print("images already imported from drive")
    else:
        print("extracting images from drive into colab's session storage... (about 1 min)")
        with zipfile.ZipFile("data/images.zip","r") as images_zip:
            images_zip.extractall(image_path)
        image_path = image_path + "/images"
else:
    image_path = "data/images"

extracting images from drive into colab's session storage... (about 1 min)


In [4]:
# test
files = os.listdir(image_path)
print(len(files))
files[:50]

84916


['image_1237460458_product_1251777484.jpg',
 'image_1276353568_product_4012749713.jpg',
 'image_1170974908_product_2591520083.jpg',
 'image_1195284677_product_3160038615.jpg',
 'image_1182061562_product_3005332310.jpg',
 'image_1010147324_product_444794370.jpg',
 'image_1082507947_product_1453337922.jpg',
 'image_1301366750_product_4150982911.jpg',
 'image_882889974_product_127382224.jpg',
 'image_1305082209_product_4164662789.jpg',
 'image_1261467619_product_3900386555.jpg',
 'image_1266052526_product_3930178166.jpg',
 'image_1114671608_product_1935687078.jpg',
 'image_1311058355_product_4182118122.jpg',
 'image_1198175320_product_3212822450.jpg',
 'image_1116847831_product_1965615444.jpg',
 'image_1110790921_product_1904327747.jpg',
 'image_903745357_product_148366260.jpg',
 'image_1128093839_product_2137064077.jpg',
 'image_1249613600_product_3660925403.jpg',
 'image_550164225_product_53549174.jpg',
 'image_1131264979_product_2199290536.jpg',
 'image_1027296457_product_570635975.jpg

# preprocessing

In [5]:
df = pd.read_csv("output/data-exploration/X_train_pre.csv")
df.head()

Unnamed: 0,index,productid,imageid,prdtypecode,text,lang
0,0,3804725264,1263597046,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,de
1,1,436067568,1008141237,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,fr
2,2,201115110,938777978,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,fr
3,3,50418756,457047496,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,de
4,4,278535884,1077757786,2705,La Guerre Des Tuques - Luc a des idées de gran...,fr


In [6]:


files = os.listdir(image_path)
#Path(output_dir).mkdir(parents=True, exist_ok=True)
#lang_file = Path(lang_file_path)
#if lang_file.exists():

r = re.compile(r"^image_(\d+)_product_(\d+)\.\w+$")
for file in tqdm(files):
    #print(file)
    r_result = r.search(file)
    if r_result == None:
        continue
    (imageid,productid) = r_result.groups()
    c = df[(df['imageid']==int(imageid)) & (df['productid']==int(productid))]['prdtypecode']
    #print("c ",c)
    if len(c) == 0:
        print("Error: record not found for file:",file)
        continue
    if len(c) != 1:
        print("Warning: multiple records found for file",file)
        c = pd.Series(c.unique())
        if len(c) != 1:
            print("Error: multiple records with different classes found for file",file)
            continue
    c = str(c.values[0])
    #print("mkdir ",image_path+"/"+c)
    Path(image_path+"/"+c).mkdir(exist_ok=True)
    #print("rename ",image_path+"/"+file," to ",image_path+"/"+c+"/"+file)
    os.rename(image_path+"/"+file,image_path+"/"+c+"/"+file)



100%|██████████| 84916/84916 [01:13<00:00, 1153.81it/s]


In [7]:
# test
img = plt.imread(image_path+"/10/image_56822330_product_355004.jpg")
img.shape

(500, 500, 3)

# modeling

In [8]:
# reduce df size for testing

#df = df.iloc[:5000]
#print(df["prdtypecode"].nunique())
#df.head()

In [9]:
df["imagefile"] = df["prdtypecode"].astype("str") + "/image_" + df["imageid"].astype("str") \
    + "_product_" + df["productid"].astype("str") + ".jpg"
df.head()

Unnamed: 0,index,productid,imageid,prdtypecode,text,lang,imagefile
0,0,3804725264,1263597046,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,de,10/image_1263597046_product_3804725264.jpg
1,1,436067568,1008141237,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,fr,2280/image_1008141237_product_436067568.jpg
2,2,201115110,938777978,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,fr,50/image_938777978_product_201115110.jpg
3,3,50418756,457047496,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,de,1280/image_457047496_product_50418756.jpg
4,4,278535884,1077757786,2705,La Guerre Des Tuques - Luc a des idées de gran...,fr,2705/image_1077757786_product_278535884.jpg


In [10]:

X = df["imagefile"]
y = df["prdtypecode"].astype("str") # keras requires string classes
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=random_state)


In [11]:
X_train, X_validation,y_train,y_validation = train_test_split(
    X_train,y_train,test_size=0.2,random_state=random_state)

In [12]:
classes = df["prdtypecode"].unique()
sorted_classes_str = np.sort(np.array(classes,dtype="str"))
sorted_classes = sorted_classes_str.astype("int")
classes_keras = list(sorted_classes_str)


In [13]:
train_data_generator = ImageDataGenerator(
    # test resnet
    #rescale=1./255
    #shear_range=0.2,zoom_range=0.2,horizontal_flip=True
)

batch_size=32
target_size = image_shape[:2]
color_mode = "grayscale"
# test resnet
color_mode = "rgb"
generator_classes = list(sorted_classes.astype("str"))

train_generator = train_data_generator.flow_from_dataframe(
    dataframe=pd.DataFrame({"imagefile": X_train,"prdtypecode": y_train}),
    x_col="imagefile",
    y_col="prdtypecode",
    directory=image_path,
    target_size=target_size,
    color_mode=color_mode,
    batch_size=batch_size,
    class_mode="categorical",
    classes=classes_keras
)





Found 54345 validated image filenames belonging to 27 classes.


In [14]:
print(sorted_classes)
print(train_generator.class_indices)

[  10 1140 1160 1180 1280 1281 1300 1301 1302 1320 1560 1920 1940 2060
 2220 2280 2403 2462 2522 2582 2583 2585 2705 2905   40   50   60]
{'10': 0, '1140': 1, '1160': 2, '1180': 3, '1280': 4, '1281': 5, '1300': 6, '1301': 7, '1302': 8, '1320': 9, '1560': 10, '1920': 11, '1940': 12, '2060': 13, '2220': 14, '2280': 15, '2403': 16, '2462': 17, '2522': 18, '2582': 19, '2583': 20, '2585': 21, '2705': 22, '2905': 23, '40': 24, '50': 25, '60': 26}


In [15]:
validation_data_generator = ImageDataGenerator(
#    rescale=1./255
)

validation_generator = validation_data_generator.flow_from_dataframe(
    dataframe=pd.DataFrame({"imagefile": X_validation,"prdtypecode": y_validation}),
    x_col="imagefile",
    y_col="prdtypecode",
    directory=image_path,
    target_size=target_size,
    color_mode=color_mode,
    batch_size=batch_size,
    class_mode="categorical",
    classes=classes_keras
)

Found 13587 validated image filenames belonging to 27 classes.


In [16]:
print(sorted_classes)
print(validation_generator.class_indices)

[  10 1140 1160 1180 1280 1281 1300 1301 1302 1320 1560 1920 1940 2060
 2220 2280 2403 2462 2522 2582 2583 2585 2705 2905   40   50   60]
{'10': 0, '1140': 1, '1160': 2, '1180': 3, '1280': 4, '1281': 5, '1300': 6, '1301': 7, '1302': 8, '1320': 9, '1560': 10, '1920': 11, '1940': 12, '2060': 13, '2220': 14, '2280': 15, '2403': 16, '2462': 17, '2522': 18, '2582': 19, '2583': 20, '2585': 21, '2705': 22, '2905': 23, '40': 24, '50': 25, '60': 26}


In [17]:
test_data_generator = ImageDataGenerator(
#    rescale=1./255
)

test_generator = test_data_generator.flow_from_dataframe(
    dataframe=pd.DataFrame({"imagefile": X_test,"prdtypecode": y_test}),
    x_col="imagefile",
    y_col="prdtypecode",
    directory=image_path,
    target_size=target_size,
    color_mode=color_mode,
    batch_size=batch_size,
    class_mode="categorical",
    classes=classes_keras,
    shuffle=False
)

Found 16984 validated image filenames belonging to 27 classes.


In [18]:
print(sorted_classes)
print(test_generator.class_indices)

[  10 1140 1160 1180 1280 1281 1300 1301 1302 1320 1560 1920 1940 2060
 2220 2280 2403 2462 2522 2582 2583 2585 2705 2905   40   50   60]
{'10': 0, '1140': 1, '1160': 2, '1180': 3, '1280': 4, '1281': 5, '1300': 6, '1301': 7, '1302': 8, '1320': 9, '1560': 10, '1920': 11, '1940': 12, '2060': 13, '2220': 14, '2280': 15, '2403': 16, '2462': 17, '2522': 18, '2582': 19, '2583': 20, '2585': 21, '2705': 22, '2905': 23, '40': 24, '50': 25, '60': 26}


In [19]:
model_path = output_dir+"/model.keras"
model_path_obj = Path(model_path)
if model_path_obj.exists():
  print("Load previous model from",model_path)
  model = load_model(model_path)
  loaded_model = True
else:
  print("No previous model found at",model_path)
  loaded_model = False

No previous model found at output/data-modeling-images-1/model.keras


In [20]:
from tensorflow.keras import regularizers

if not loaded_model:
  inputs = Input(shape=image_shape)
  layer1 = Conv2D(filters=32,kernel_size=(5,5),padding='valid',activation='relu',
                  input_shape=image_shape)
  layer2 = MaxPooling2D(pool_size=(2,2),padding="valid")
  layer3 = Dropout(rate=0.2)
  layer4 = Flatten()
  layer5 = Dense(units=128,activation="relu")
  layer6 = Dense(units=len(classes_keras),activation="softmax")

  x = layer1(inputs)
  x = layer2(x)
  x = layer3(x)
  x = layer4(x)
  x = layer5(x)
  outputs = layer6(x)

  model = Model(inputs=inputs,outputs=outputs)

In [21]:
# alternative à tester
import tensorflow as tf

model = tf.keras.applications.ResNet50(
    include_top=True,
    weights=None,
    classes=27)

In [22]:
import keras.backend as K

if not loaded_model:
  model.compile(loss='categorical_crossentropy',optimizer="adam",metrics=["accuracy"])

print(K.eval(model.optimizer.lr))

0.001


In [23]:
import scipy as scipy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

reduce_learning_rate = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    min_delta = 0.01,
    patience=2,
    cooldown= 3,
    mode='auto',
    verbose=1
)

training_history = model.fit(
    train_generator,
    validation_data=validation_generator,
    steps_per_epoch=train_generator.samples // batch_size,
    validation_steps = validation_generator.samples // batch_size,
    #workers=-1,
    epochs=10,
    verbose=1
    #,callbacks=[reduce_learning_rate]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
test_pred = model.predict(test_generator,verbose=1)




In [25]:
from sklearn.metrics import classification_report,f1_score

#test_pred[:3]
print(test_generator.samples)
print(test_pred.shape)
test_pred_class = np.argmax(test_pred,axis=1)
print(test_pred_class.shape)
#test_pred_class
print(y_test.shape)

test_pred_class = [sorted_classes[c] for c in test_pred_class]
print(test_pred_class[:10])
print(y_test.iloc[:10])
print(classification_report(y_test.astype("int"),test_pred_class))
print(f1_score(y_test.astype("int"),test_pred_class,average="weighted"))

16984
(16984, 27)
(16984,)
(16984,)
[1300, 1300, 1560, 1280, 2060, 2522, 10, 2522, 1180, 1300]
36138    2905
68630    1281
36172    2060
9830     1280
28422    2280
25246    1300
9314     2280
24376    2583
39184    2060
16038    2705
Name: prdtypecode, dtype: object
              precision    recall  f1-score   support

          10       0.33      0.17      0.22       612
          40       0.40      0.30      0.34       521
          50       0.06      0.04      0.05       357
          60       0.07      0.27      0.12       161
        1140       0.29      0.02      0.04       539
        1160       0.92      0.52      0.67       786
        1180       0.43      0.07      0.12       146
        1280       0.13      0.17      0.15       961
        1281       0.07      0.09      0.08       424
        1300       0.15      0.71      0.25       974
        1301       0.00      0.00      0.00       169
        1302       0.14      0.05      0.07       507
        1320       0.19      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
save_model = False
if model_path_obj.exists():
  print("Previous model found at ",model_path,". Comparing it to actual model")
  previous_model = load_model(model_path)
  test_pred_prev = previous_model.predict(test_generator,verbose=1)
  test_pred_class_prev = [sorted_classes[c] for c in np.argmax(test_pred_prev,axis=1)]
  f1_score_prev = f1_score(y_test.astype("int"),test_pred_class_prev,average="weighted")
  f1_score_actual = f1_score(y_test.astype("int"),test_pred_class,average="weighted")
  print("f1_score_prev:",f1_score_prev,"f1_score_actual:",f1_score_actual)
  if f1_score_actual > f1_score_prev:
    print("actual model has a best score than previous one")
    save_model = True
  else:
    print("actual model has a worst score than previous one")
    save_model = False
else:
  save_model = True

if save_model == True:
  print("Saving actual model to",model_path)
  Path(output_dir).mkdir(parents=True, exist_ok=True)
  # wait for yaniv response on random_state and test_train_split
  model.save(model_path)

Saving actual model to output/data-modeling-images-1/model.keras


In [27]:
cm = pd.crosstab(y_test.astype("int"),test_pred_class,rownames=["real"],colnames=["predicted"])
display(cm)

predicted,10,40,50,60,1140,1160,1180,1280,1281,1300,...,1940,2060,2280,2403,2462,2522,2582,2583,2585,2705
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,101,32,6,9,0,4,4,16,43,89,...,9,80,2,8,0,184,6,1,1,1
40,9,156,12,34,2,4,1,30,38,129,...,2,26,1,5,0,54,3,0,3,0
50,1,2,14,14,0,0,0,18,6,188,...,0,15,0,1,0,64,2,13,3,0
60,0,0,11,43,0,0,0,12,3,70,...,0,10,0,0,0,5,1,2,0,0
1140,4,16,5,14,13,3,1,67,16,195,...,0,67,2,6,0,73,18,6,2,0
1160,1,23,15,79,2,410,2,15,43,109,...,2,37,1,4,0,24,6,1,0,0
1180,5,6,2,7,4,0,10,12,11,48,...,0,13,0,1,0,16,5,1,1,0
1280,6,6,2,13,5,0,0,168,31,357,...,1,133,1,1,0,122,30,17,10,0
1281,5,11,6,30,1,4,0,87,39,108,...,3,35,0,5,0,56,3,5,2,0
1300,0,2,7,12,0,0,0,31,11,696,...,0,28,0,1,0,123,15,13,15,0


In [28]:
y_test

36138    2905
68630    1281
36172    2060
9830     1280
28422    2280
         ... 
40234    2280
9183     1160
63404    2585
80127    2522
13914    1920
Name: prdtypecode, Length: 16984, dtype: object

In [30]:
import seaborn as sns

pred_count_by_class = df['prdtypecode'].value_counts().reset_index()
print(pred_count_by_class)
pred_count_by_class = pred_count_by_class.rename(columns={"index":"prdtypecode","prdtypecode":"count"})
accuracy = np.array([(cm.loc[c,c]/cm.loc[c].sum()) for c in pred_count_by_class["prdtypecode"]])
pred_count_by_class["accuracy"] = accuracy

#correct_pred_str = " ".join(map(lambda pred: ("("+str(pred[0])+","+str(pred[1])+")"),zip(sorted_classes,correct_pred_count)))
#print(correct_pred_str)
fig = plt.figure(figsize = (16, 6))
sns.barplot(data=pred_count_by_class,x="prdtypecode",y="count",hue="accuracy",order=pred_count_by_class["prdtypecode"])
plt.title('Nombre de produits et accuracy par type')
plt.show()

display(pred_count_by_class)

    index  prdtypecode
0    2583        10209
1    1560         5073
2    1300         5045
3    2060         4993
4    2522         4989
5    1280         4870
6    2403         4774
7    2280         4760
8    1920         4303
9    1160         3953
10   1320         3241
11     10         3116
12   2705         2761
13   1140         2671
14   2582         2589
15     40         2508
16   2585         2496
17   1302         2491
18   1281         2070
19     50         1681
20   2462         1421
21   2905          872
22     60          832
23   2220          824
24   1301          807
25   1940          803
26   1180          764


KeyError: 2905

In [None]:
#from keras.models import load_model

#model = load_model(output_dir+"/model.keras")

In [None]:
#train_generator = train_data_generator.flow_from_dataframe(
#    dataframe=pd.DataFrame({"imagefile": X_train,"prdtypecode": y_train}),
#    x_col="imagefile",
#    y_col="prdtypecode",
#    directory=image_path,
#    target_size=target_size,
#    color_mode=color_mode,
#    batch_size=batch_size,
#    class_mode="categorical",
#    classes=classes_keras,
#    shuffle=False
#)

#train_pred = model.predict(train_generator,verbose=1)

In [None]:
#train_pred.shape

#print(train_generator.class_indices)
#print(sorted_classes)
#print(train_pred[:2])
#train_pred_class_indices = np.argmax(train_pred,axis=1)
#print(train_pred_class_indices[:2])
#train_pred_class = [sorted_classes[c] for c in train_pred_class_indices]
#print(train_pred_class[:2])

In [None]:
#from sklearn.metrics import classification_report

#train_pred_class = np.argmax(train_pred,axis=1)
#train_pred_class = [sorted_classes[c] for c in train_pred_class]
#print(classification_report(y_train.astype("int"),train_pred_class))