# Obtención de datos

In [1]:
import pandas as pd
import numpy as np
import regex
import requests
import unidecode
import os, glob
from pathlib import Path  
import cv2
import plotly.express as px
import matplotlib.pyplot as plt 
from tqdm import tqdm
import shutil, sys
import urllib.request
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk import FreqDist
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adelaidazuluaga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adelaidazuluaga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Constants

In [2]:
# Result offsets passed to the search requests. range(0, 50, 50) evaluates to
# the single-element list [0].
OFFSETS_LIST= list(range(0,50,50)) # original note: "550 records" - TODO confirm, not derivable from this cell


### Required Installations

In [3]:
from sklearn import metrics


### Product categories extraction

In [10]:
# Request the category list of the MLA site from the MercadoLibre API and
# decode the JSON body. Later cells iterate over `prod_categories` and read
# the 'id' and 'name' of each entry.
cats = requests.get('https://api.mercadolibre.com/sites/MLA/categories')
prod_categories= cats.json()

### Obtain product sub categories

In [3]:
def obtain_sub_categories(general_category:str):
    """Fetch the sub-categories listed for one MercadoLibre (MLA) category.

    Queries the site search endpoint for ``general_category`` and reads the
    ``values`` of the first entry in ``available_filters``.
    NOTE(review): this assumes the first available filter is the category
    filter - confirm against the API response.

    Args:
        general_category: Category id used in the search query.

    Returns:
        A DataFrame with columns ``id`` and ``Name``, one row per
        sub-category. When the request fails or the response does not have
        the expected structure, the error is printed and an empty DataFrame
        with the same two columns is returned.
    """
    url = f'https://api.mercadolibre.com/sites/MLA/search?category={general_category}'
    try:
        items = requests.get(url).json()
        values = items['available_filters'][0]['values']
        rows = [(value['id'], value['name']) for value in values]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Python 3 exceptions have no `.message` attribute; print the
        # exception itself so the handler cannot fail while reporting.
        print(e, e.args)
        # Fall back to an empty table so the return below is always bound.
        rows = []
    return pd.DataFrame(rows, columns=['id','Name'])

### Iteratively download product information for each category

In [4]:
def obtain_json_items_iteratively(df_sub_cat,offset_list):
    """Download search results for every sub-category at every offset.

    One request is issued per (sub-category, offset) pair; previously only
    the first offset was requested and the rest of ``offset_list`` was
    silently ignored.

    Args:
        df_sub_cat: DataFrame with columns ``id`` and ``Name`` describing the
            sub-categories to query.
        offset_list: Iterable of ``offset`` values for the search endpoint.

    Returns:
        A tuple ``(results, sub_cat_list)``. ``results`` is the flat list of
        item dicts taken from the ``results`` key of each response;
        ``sub_cat_list`` holds, for each item and in the same order, the name
        of the sub-category it was downloaded for.

    Raises:
        KeyError: If a response body has no ``results`` key.
    """
    results = []
    sub_cat_list = []
    # Materialise once so a one-shot iterator can be reused per sub-category.
    offsets = list(offset_list)
    for sub_cat_id, sub_cat_name in zip(df_sub_cat['id'], df_sub_cat['Name']):
        for offset in offsets:
            url = f'https://api.mercadolibre.com/sites/MLA/search?category={sub_cat_id}&offset={offset}'
            page = requests.get(url).json()['results']
            results.extend(page)
            # Keep the label list aligned one-to-one with the items.
            sub_cat_list.extend([sub_cat_name] * len(page))

    return results,sub_cat_list

### Extract product brand 

In [5]:
def get_product_brand(items_cat):
    """Return the brand of a product as a list with at most one element.

    Scans the product's ``attributes`` in order and takes the ``value_name``
    of the first attribute whose ``id`` contains ``'BRAND'`` or ``'Brand'``.

    Args:
        items_cat: Product dict; its ``attributes`` entry, when present, is a
            list of dicts with ``id`` and ``value_name`` keys.

    Returns:
        ``[brand]`` for the first matching attribute, or ``[]`` when there is
        no match or the product has no attributes.
    """
    # `or []` also covers an explicit None in place of the attribute list.
    for att in items_cat.get('attributes') or []:
        find_attr = att['id']
        if 'BRAND' in find_attr or 'Brand' in find_attr:
            return [att['value_name']]
    return []

### Text cleaning (NLP preprocessing) 

In [6]:
def pre_proc_nlp(text_df):
    """Normalise every text in an iterable for NLP use.

    Steps, applied in order to each text: remove HTML-style entities such as
    ``&amp;``, turn newlines into spaces, lowercase, transliterate to ASCII
    with ``unidecode``, collapse whitespace runs, replace every non-letter
    character with a space and strip surrounding spaces.

    Args:
        text_df: Iterable of strings (for example a DataFrame column).

    Returns:
        A list with one cleaned string per input text, in the same order.
    """
    cleaned_df = []
    for text in text_df:
        # Each step must consume the previous step's output. The original
        # restarted from `text`, which discarded the first two substitutions.
        clean_text = regex.sub(r'\&[a-z]+\;', '', text)
        clean_text = regex.sub(r"\n", " ", clean_text)
        clean_text = clean_text.lower()
        clean_text = unidecode.unidecode(clean_text)
        clean_text = regex.sub(r'\s+', ' ', clean_text)
        clean_text = regex.sub('[^a-zA-Z]', ' ', clean_text).strip(' ')
        cleaned_df.append(clean_text)
    return cleaned_df

In [7]:
def clean_name(text_name):
    """Normalise a single name (category or sub-category label).

    Steps, applied in order: remove HTML-style entities such as ``&amp;``,
    turn newlines into spaces, lowercase, transliterate to ASCII with
    ``unidecode``, collapse whitespace runs, replace every non-letter
    character with a space and strip surrounding spaces.

    Args:
        text_name: The string to clean.

    Returns:
        The cleaned string.
    """
    # Each step must consume the previous step's output. The original
    # restarted from the raw text, which discarded the first two substitutions.
    clean_text = regex.sub(r'\&[a-z]+\;', '', text_name)
    clean_text = regex.sub(r"\n", " ", clean_text)
    clean_text = clean_text.lower()
    clean_text = unidecode.unidecode(clean_text)
    clean_text = regex.sub(r'\s+', ' ', clean_text)
    return regex.sub('[^a-zA-Z]', ' ', clean_text).strip(' ')

### Create dataframe for products information

In [8]:
def create_pd_product(item_json,item_category, item_sub_cat):
    """Build the product table for one category.

    Args:
        item_json: Sequence of product dicts; each one is read for its
            ``title``, ``thumbnail`` and ``price`` and is passed to
            ``get_product_brand``.
        item_category: Category name, cleaned with ``clean_name`` and
            repeated on every row.
        item_sub_cat: Sequence aligned with ``item_json`` giving the
            sub-category name of each product.

    Returns:
        A DataFrame with columns ``nombre``, ``categoria``,
        ``sub_categoria``, ``marca``, ``precio``, ``imagen`` and
        ``nombre_preproc``.
    """
    names, brands, thumbnails, prices, sub_categories = [], [], [], [], []
    for position, product in enumerate(item_json):
        names.append(product['title'].lower())
        # The brand is stored as the list returned by get_product_brand.
        brands.append(get_product_brand(product))
        thumbnails.append(product['thumbnail'])
        prices.append(product['price'])
        sub_categories.append(clean_name(item_sub_cat[position]))

    column_order = ['nombre','categoria','sub_categoria','marca','precio','imagen','nombre_preproc']
    df_products = pd.DataFrame(columns=column_order)
    # 'nombre' goes first: it fixes the row index the scalar category is
    # broadcast over.
    column_values = {
        'nombre': names,
        'categoria': clean_name(item_category),
        'sub_categoria': sub_categories,
        'marca': brands,
        'precio': prices,
        'imagen': thumbnails,
    }
    for column, values in column_values.items():
        df_products[column] = values
    df_products['nombre_preproc'] = pre_proc_nlp(df_products['nombre'])
    return df_products

In [10]:
#category_dict_train = {}
#category_dict_test = {}
#category_dict_val = {}
#for category in tqdm(prod_categories): 
    #print(category)
    #obtener diccionario de elementos para train, test y val
    #df_cat = obtain_sub_categories(category['id'])
    #item_json_train,item_sub_category_train = obtain_json_items_iteratively(df_cat,OFFSETS_TRAIN) #train
    #item_json_test,item_sub_category_test = obtain_json_items_iteratively(df_cat,OFFSETS_TEST) #test
    #item_json_val,item_sub_category_val = obtain_json_items_iteratively(df_cat,OFFSETS_VAL) #test
    #cat_name = category['name']
    #train dataset 
    #df_cat_prod_train = create_pd_product(item_json_train,cat_name,item_sub_category_train)
    #category_dict_train[df_cat_prod_train['categoria'][0]] = df_cat_prod_train
    #test dataset
    #df_cat_prod_test = create_pd_product(item_json_test,cat_name, item_sub_category_test)
    #category_dict_test[df_cat_prod_test['categoria'][0]] = df_cat_prod_test
    #val dataset
    #df_cat_prod_val = create_pd_product(item_json_val,cat_name, item_sub_category_val)
    #category_dict_val[df_cat_prod_val['categoria'][0]] = df_cat_prod_val

## Generate category products dataframe

In [11]:
# Build one product DataFrame per top-level category, keyed by the cleaned
# category name stored in its 'categoria' column.
category_dict = {}
for category in tqdm(prod_categories):
    df_cat = obtain_sub_categories(category['id'])
    item_json,item_sub_category = obtain_json_items_iteratively(df_cat,OFFSETS_LIST)
    cat_name = category['name']
    df_cat_prod = create_pd_product(item_json,cat_name,item_sub_category)
    if df_cat_prod.empty:
        # No items were downloaded for this category; reading row 0 below
        # would raise KeyError, so report it and move on.
        print(f'Sin productos para la categoria {cat_name!r}; se omite')
        continue
    category_dict[df_cat_prod['categoria'][0]] = df_cat_prod

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [04:58<00:00,  9.32s/it]


In [12]:
# Stack the per-category tables into one dataset, then count the records of
# every (sub-category, category) pair.
df_dataset = pd.concat(list(category_dict.values()), axis=0)
pair_counts = df_dataset[['sub_categoria','categoria']].value_counts()
df_cats_sub_cats = pair_counts.to_frame().reset_index()

In [20]:
# Display the record count of each (sub-category, category) pair.
df_cats_sub_cats

Unnamed: 0,sub_categoria,categoria,0
0,aberturas,construccion,50
1,otros,belleza y cuidado personal,50
2,otros,herramientas,50
3,otros,electronica audio y video,50
4,otros,electrodomesticos y aires ac,50
...,...,...,...
383,acido clorhidrico,otras categorias,33
384,eventos deportivos,entradas para eventos,29
385,coberturas extendidas,otras categorias,27
386,eventos a beneficio,entradas para eventos,15


In [19]:
# Record count per sub-category name, as a DataFrame.
df_sub_registers = df_dataset['sub_categoria'].value_counts().to_frame().reset_index()

In [25]:
# Summary statistics of the per-sub-category record counts.
df_sub_registers.describe()

Unnamed: 0,sub_categoria
count,352.0
mean,54.767045
std,72.507921
min,2.0
25%,50.0
50%,50.0
75%,50.0
max,1400.0


* Se decide tomar únicamente 50 registros por subcategoría, debido a la distribución de registros en cada subcategoría

### Generar codificación numérica (label encoding) para cada subcategoría

In [13]:
def encoding_categs(categories):
    """Encode category labels as integers with a fresh ``LabelEncoder``.

    Args:
        categories: Array-like of labels.

    Returns:
        The result of ``LabelEncoder.fit_transform``: one integer code per
        entry of ``categories``, in the same order.
    """
    return LabelEncoder().fit_transform(categories)

In [14]:
# Give every distinct sub-category an integer code and store the code of each
# row in a new 'numeric_sub_cat' column.
unique_sub_categories = df_dataset['sub_categoria'].unique()
numeric_sub_categories = encoding_categs(unique_sub_categories)
dict_sub_categs = dict(zip(unique_sub_categories, numeric_sub_categories))
num_list = [dict_sub_categs[cat_in] for cat_in in df_dataset['sub_categoria']]
df_dataset['numeric_sub_cat'] = num_list

In [15]:
# Number of distinct integer codes assigned to the sub-categories.
df_dataset['numeric_sub_cat'].nunique()

351

In [None]:
# 351 subcategorías

### Guardar dataset completo

In [16]:
# Write the full dataset to 'dataset_sub_cats.csv' in the current working
# directory, without the index column.
cwd = os.getcwd()
filepath = Path(cwd) / 'dataset_sub_cats.csv'
filepath.parent.mkdir(parents=True, exist_ok=True)
df_dataset.to_csv(filepath, index=False)

## Download images 

In [18]:
def download_images(image_route, category,name_folder):
    """Download a sequence of image URLs into a per-category folder.

    Images are saved under ``./sub_images_<name_folder>/<category>`` and named
    ``<category><index>.jpg``, where ``index`` is the position of the URL in
    ``image_route``. A URL that cannot be downloaded is reported on stdout and
    skipped; the remaining URLs are still processed.

    Args:
        image_route: Iterable of image URLs.
        category: Category name, used for both the folder and the file names.
        name_folder: Suffix appended to ``sub_images_`` for the root folder.
    """
    save_path = './sub_images_'+name_folder+'/'+ category
    # exist_ok=True so that re-running the download does not fail with
    # FileExistsError when the folder is already there.
    os.makedirs(save_path, exist_ok=True)
    for ind,img_url in enumerate(image_route):
        completeName = os.path.join(save_path, category + str(ind) +'.jpg')
        try:
            urllib.request.urlretrieve(img_url, completeName)
        except Exception as err:
            # Best effort: one bad URL must not abort the batch. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt stop a long download.
            print(ind,category)
            print(img_url)
            print("problema con url")
            print(err)

In [19]:
# Download the thumbnails of every category into its own folder.
for c_ in tqdm(df_dataset['categoria'].unique()):
    category_images = df_dataset.loc[df_dataset['categoria'] == c_, 'imagen']
    download_images(category_images, c_, '_')

 53%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 17/32 [05:52<06:27, 25.85s/it]

144 entradas para eventos

problema con url
184 entradas para eventos

problema con url
205 entradas para eventos

problema con url


 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 31/32 [09:52<00:19, 19.49s/it]

402 otras categorias

problema con url


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [10:07<00:00, 18.98s/it]


### Dividir el dataframe en train, test y val

* Total de registros: 19,225
* Entrenamiento: 15,380 = 80%
* Prueba: 1,923 = 10%
* Validación: 1,922 = 10%

In [45]:
# Shuffle the dataset reproducibly and cut it into 80% train, 10% validation
# and 10% test. Positional slicing replaces np.split, which on a DataFrame
# depends on the deprecated DataFrame.swapaxes.
df_shuffled = df_dataset.sample(frac=1, random_state=42)
train_end = int(.8*len(df_dataset))
val_end = int(.9*len(df_dataset))
df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]
# The original printed `df_validate`, a name this cell never defines.
print(df_train.shape, df_test.shape, df_val.shape)

(15380, 8) (1923, 8) (1922, 8)
