# Parte 4: Extracción de características

Para la generación de características se tuvieron en cuenta las dos fuentes de información por producto: 

    1. Textos
    2. Imágenes

Por este motivo, se optó por utilizar un modelo de extracción de características para las imágenes (XCeption) una red convolucional. Y por el lado de la vectorización de las palabras, se importó el modelo de vectorización construido en la **parte 3 del proyecto**

In [2]:
import pandas as pd
import regex
import requests
import unidecode
import numpy as np
import os, glob,joblib,json
import cv2
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
import shutil, sys
import urllib.request 
from wordcloud import WordCloud
import PIL
import tensorflow as tf
import keras
from keras.layers.core import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam,SGD
from pathlib import Path  
from sklearn.preprocessing import LabelEncoder


In [3]:
from IPython.display import Image
from keras.preprocessing import image 
from keras import optimizers 
from keras import layers, models 
from keras.applications.imagenet_utils import preprocess_input
from keras import regularizers 
from keras.preprocessing.image import ImageDataGenerator 

## Importar archivos

In [4]:
CWD = os.getcwd()
filepath = Path(CWD+'/tfidf_vectorizer_cat.pkl') 
tfidf_vectorizer_cat = joblib.load(filepath)

#### Atributos

In [5]:
filepath = Path(CWD+'/dataset_sub_cats.csv')  
df_subcat = pd.read_csv(filepath)
CATEGORIAS_LISTA = list(df_subcat['categoria'].unique())

## Generar características de imágenes con XCeption

In [6]:
def extract_features(directory):
    model=tf.keras.applications.xception.Xception(include_top=False, pooling='avg')
    features = {}
    for (root,dirs,files) in os.walk(directory):
        for img in files: 
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            image = image/127.5
            image = image - 1.0
            feature = model.predict(image)
            features[img] = feature
    return features

In [10]:
model=tf.keras.applications.xception.Xception(include_top=False, pooling='avg')
filepath = Path(CWD+'/xception_model.pkl')  
joblib.dump(model,filepath)





INFO:tensorflow:Assets written to: ram://037dc8f7-e20d-44b6-aaca-e254b57d581e/assets


INFO:tensorflow:Assets written to: ram://037dc8f7-e20d-44b6-aaca-e254b57d581e/assets


['/Users/adelaidazuluaga/Documents/AdelaidaZuluaga/xception_model.pkl']

Cada imagen se traduce en un vector de 1x2048

In [116]:
out_cat = {}
for c in tqdm(categorias_lista):
    imdir = f'/Users/adelaidazuluaga/Documents/AdelaidaZuluaga/sub_images__/{c}'
    out_cat[c] = extract_features(imdir)

  0%|                                                                                                                                                                              | 0/32 [00:00<?, ?it/s]









  3%|█████                                                                                                                                                              | 1/32 [06:20<3:16:20, 380.02s/it]





  6%|██████████▏                                                                                                                                                        | 2/32 [08:45<2:00:53, 241.79s/it]





  9%|███████████████▎                                                                                                                                                   | 3/32 [10:49<1:30:51, 187.99s/it]





 12%|████████████████████▍                                                                                                                                              | 4/32 [13:52<1:26:51, 186.13s/it]





 16%|█████████████████████████▍                                                                                                                                         | 5/32 [16:37<1:20:18, 178.47s/it]



 19%|██████████████████████████████▌                                                                                                                                    | 6/32 [18:19<1:06:03, 152.45s/it]







 22%|███████████████████████████████████▋                                                                                                                               | 7/32 [22:34<1:17:34, 186.17s/it]







 25%|████████████████████████████████████████▊                                                                                                                          | 8/32 [27:15<1:26:27, 216.14s/it]







 28%|█████████████████████████████████████████████▊                                                                                                                     | 9/32 [31:10<1:25:07, 222.06s/it]





 31%|██████████████████████████████████████████████████▋                                                                                                               | 10/32 [34:18<1:17:35, 211.60s/it]





 34%|███████████████████████████████████████████████████████▋                                                                                                          | 11/32 [37:17<1:10:33, 201.57s/it]







 38%|████████████████████████████████████████████████████████████▊                                                                                                     | 12/32 [42:26<1:18:06, 234.31s/it]





 41%|█████████████████████████████████████████████████████████████████▊                                                                                                | 13/32 [44:39<1:04:32, 203.81s/it]





 44%|██████████████████████████████████████████████████████████████████████▉                                                                                           | 14/32 [48:36<1:04:04, 213.56s/it]















 47%|███████████████████████████████████████████████████████████████████████████▉                                                                                      | 15/32 [59:34<1:38:28, 347.54s/it]





 50%|████████████████████████████████████████████████████████████████████████████████                                                                                | 16/32 [1:02:31<1:18:59, 296.23s/it]







 53%|█████████████████████████████████████████████████████████████████████████████████████                                                                           | 17/32 [1:07:14<1:13:05, 292.37s/it]



 56%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 18/32 [1:09:06<55:33, 238.07s/it]





 59%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 19/32 [1:12:03<47:38, 219.90s/it]





 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 20/32 [1:16:15<45:53, 229.46s/it]







 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 21/32 [1:20:35<43:43, 238.54s/it]



 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 22/32 [1:20:57<28:54, 173.49s/it]





 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 23/32 [1:24:44<28:27, 189.70s/it]





 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 24/32 [1:27:33<24:27, 183.48s/it]











 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 25/32 [1:34:44<30:05, 257.88s/it]





 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 26/32 [1:36:55<21:57, 219.65s/it]



 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 27/32 [1:38:27<15:07, 181.47s/it]









 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 28/32 [1:45:05<16:25, 246.34s/it]





 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 29/32 [1:47:14<10:33, 211.03s/it]







 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 30/32 [1:52:12<07:54, 237.26s/it]





 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 31/32 [1:55:20<03:42, 222.58s/it]





100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [1:58:22<00:00, 221.94s/it]


In [155]:
with open("image_features_full_dictionary.json", "w") as outfile: #Guardar el diccionario para posterior procesamiento
    json.dump(images_dict, outfile)

# Text Feature Extraction

In [6]:
def clean_name(text_name):
    clean = []
    for t in text_name:
        clean_text = regex.sub(r'\&[a-z]+\;', '', t)
        clean_text = regex.sub(r"\n", " ", t)
        clean_text = t.lower()
        clean_text = unidecode.unidecode(clean_text)
        clean_text = regex.sub(r'\s+', ' ', clean_text)
        clean_text = regex.sub('[^a-zA-Z]', ' ', clean_text).strip(' ')
        clean.append(clean_text)
    return clean

In [7]:
df_subcat['marca_clean'] = clean_name(list(df_subcat['marca']))

In [8]:
# Importar modelo tf-idf Y SOLO HACER TRANSFORM

In [9]:
df_subcat['Name feature'] = list(tfidf_vectorizer_cat.transform(df_subcat['nombre_preproc']).toarray())
df_subcat['Brand feature'] = list(tfidf_vectorizer_cat.transform(df_subcat['marca_clean']).toarray())

# Image feature

In [10]:
def extract_image_index(category):
    ind_images = []
    for im in list(IMAGES_DICT[category].keys()):
        ind_images.append(int(im.split(category)[1].split('.')[0]))
    return ind_images

In [11]:
def extract_image_feat(cat,pos_ind): 
    img_keys = list(IMAGES_DICT[cat].keys())
    img_feat = list(IMAGES_DICT[cat][img_keys[pos_ind]])
    img_feat = list(img_feat[0])
    return img_feat

# Label Group ID

In [14]:
def create_label_group(df_filt): 
    x = list(df_filt['numeric_cat'])[0]
    y = list(df_filt['numeric_sub_cat'])[0]
    label_group = str(str(x) + str(y))
    return label_group

# Product structure

* Integrate text feature vector with image vectos 
* this will be the input for model similarity
* Labelgroup is the union of category id and sub_category id

In [15]:
image_dir = Path(CWD+'/image_features_full_dictionary.json')
f = open(image_dir)
IMAGES_DICT = json.load(f)

In [20]:
def return_df_feats(category, df_filt):
    image_index = extract_image_index(category)
    nombre = []
    name_feat = []
    precio_val = []
    image_feature = []
    label_group = []
    brand_feature = []
    cat = []
    subcat = []
    for i,ind_ in enumerate(image_index): 
        nombre.append(df_filt.reset_index().iloc[[ind_]]['nombre_preproc'])
        name_feat.append(list(df_filt.reset_index().iloc[[ind_]]['Name feature'])[0])
        brand_feature.append(list(df_filt.reset_index().iloc[[ind_]]['Brand feature'])[0])
        precio_val.append(list(df_filt.reset_index().iloc()[[ind_]]['precio'])[0])
        image_feature.append(extract_image_feat(category,i))
        cat.append(list(df_filt.reset_index().iloc[[ind_]]['numeric_cat'])[0])
        subcat.append(list(df_filt.reset_index().iloc[[ind_]]['numeric_sub_cat'])[0])
        label_group.append(str(create_label_group(df_filt.reset_index().iloc[[ind_]])))
    df_prod = pd.DataFrame(list(zip(nombre,cat,subcat,name_feat,brand_feature,precio_val,image_feature,label_group)),
                           columns=['name','categoria','subcategoria','name_vector','brand_vector','price','image_feature','labelgroup'])
    return df_prod

In [23]:
list_dfs =[]
for cat in tqdm(df_subcat['categoria'].unique()):
    df_filt = df_subcat[df_subcat['categoria']== cat]
    df_out = return_df_feats(cat,df_filt)
    list_dfs.append(df_out)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [02:23<00:00,  4.48s/it]


In [24]:
df_products = pd.concat(list_dfs)

In [25]:
df_products['url_imagen'] = df_subcat['imagen']
df_products['labelgroup'] = df_products['labelgroup'].astype(str)

## Aproximación para similitud entre productos: cálculo de centroides

In [27]:
df_products['centroid_name'] = [np.mean(n) for n in df_products['name_vector']]
df_products['centroid_brand'] = [np.mean(n) for n in df_products['brand_vector']]
df_products['centroid_image'] = [np.mean(n) for n in df_products['image_feature']]

In [28]:
# Guardar datafrmae con toda la información por producto 
cwd = os.getcwd()
filepath = Path(cwd+'/dataset_products_info.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_products.to_csv(filepath,index=False) 

In [47]:
df_products

Unnamed: 0,name,name_vector,brand_vector,price,image_feature,labelgroup,url_imagen
0,48 kit asientos auto cuero eco cubre volant...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13999.00,"[0.0792483389377594, 0.008941839449107647, 0.1...",03,http://http2.mlstatic.com/D_825151-MLA44504428...
1,833 kit combo elementos seguridad nautica e...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",17787.00,"[0.07415435463190079, 0.39244845509529114, 0.0...",0235,http://http2.mlstatic.com/D_980071-MLA49849692...
2,74 juego tazas rodado chevrolet agile ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6059.99,"[0.25299349427223206, 0.1530793011188507, 9.23...",03,http://http2.mlstatic.com/D_617422-MLA47397731...
3,199 kit asientos auto cuero eco cubre volan...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13999.00,"[0.0792483389377594, 0.008941839449107647, 0.1...",0339,http://http2.mlstatic.com/D_918127-MLA50207700...
4,827 orejera lavamotor fuera de borda mm ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1150.00,"[0.3235769271850586, 0.00045812607277184725, 0...",0235,http://http2.mlstatic.com/D_641922-MLA32696446...
...,...,...,...,...,...,...,...
497,439 acido muriatico x litros puro al Nam...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2590.00,"[0.0018091266974806786, 0.2593787908554077, 0....",2716,http://http2.mlstatic.com/D_675585-MLA50373688...
498,363 plumon de marabu para revelado de huell...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3850.00,"[0.0, 0.0, 0.0, 0.06780879944562912, 0.0, 0.28...",27188,http://http2.mlstatic.com/D_830144-MLA43429026...
499,405 licencia de taxi nunca alquilada ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",49000.00,"[0.14838288724422455, 0.08391854912042618, 0.0...",27198,http://http2.mlstatic.com/D_634806-MLA51509026...
500,411 licencia de taxi lista para transe...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",43000.00,"[0.21066132187843323, 0.22373764216899872, 0.0...",27198,http://http2.mlstatic.com/D_718693-MLA45091829...


In [105]:
df_products_feats = df_products[['centroid_name','centroid_brand','price','centroid_image','labelgroup']]

In [107]:
# Guardar datafrmae con todas la features de los productos
cwd = os.getcwd()
filepath = Path(cwd+'/dataset_products_feats.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_products_feats.to_csv(filepath,index=False) 