In [1]:
import json
import os
import tqdm
import time

import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
import requests

In [2]:
## Config
# Rebuild `data_dict` from the raw JSON dumps via load_data_from_json().
# Checked first: when True, LOAD_PICKLE is ignored.
LOAD_JSON = False
# Otherwise load a previously saved snapshot from DATA/pickled/ via load_pickle().
LOAD_PICKLE = True
# When True, fetch every id that is missing from `data_dict` through the API
# and save the result as a new snapshot with pickle_data().
UPDATE = False

In [3]:
def load_data_from_json():
    """Load every raw product dump and index the usable responses by product id.

    Each file in ``DATA/digikala_products/all_data/`` is read as a JSON list of
    API responses. Responses that carry a ``'recommendations'`` key under
    ``'data'`` are dropped (the log message calls the remaining ones "active").

    Returns:
        dict: ``{product_id: response}`` keyed by
        ``response['data']['product']['id']``. When an id occurs more than
        once, the response read last wins.
    """
    base_dir = 'DATA/digikala_products/all_data/'
    data = []
    # Sorted so the "last one wins" rule for duplicated ids does not depend on
    # the order in which the OS happens to list the directory.
    for file_name in tqdm.tqdm(sorted(os.listdir(base_dir))):
        # The context manager closes each dump as soon as it is parsed instead
        # of leaving the handle open until garbage collection.
        with open(os.path.join(base_dir, file_name), encoding="utf8") as fh:
            data += json.load(fh)
    print(f'# of data loaded = {len(data)}')

    data_dict = {}
    for dic in tqdm.tqdm(data):
        if 'recommendations' not in dic['data']:
            data_dict[dic['data']['product']['id']] = dic
    print(f'# of active data loaded ={len(data_dict.keys())}')

    return data_dict

def _pickled_versions(path):
    """Return ``(version, file_name)`` pairs for the snapshots in *path*, oldest first.

    A snapshot is any ``*.npy`` file whose name ends in ``_<number>.npy``
    (e.g. ``data_1.2.npy``). Everything else in the directory is ignored.
    """
    found = []
    for file_name in os.listdir(path):
        stem, ext = os.path.splitext(file_name)
        if ext != '.npy' or '_' not in stem:
            continue
        try:
            # Everything after the last underscore is the version, decimals included.
            found.append((float(stem.rsplit('_', 1)[-1]), file_name))
        except ValueError:
            continue
    # os.listdir() order is arbitrary, so order by the parsed version number.
    found.sort()
    return found


def pickle_data(data):
    """Save *data* as the next snapshot in ``DATA/pickled/``.

    The new file is named ``data_<version>.npy`` where ``<version>`` is the
    highest existing version plus 0.1 (or 0.1 when no snapshot exists yet).

    Args:
        data: Object to store; written with ``np.save``.
    """
    path = 'DATA/pickled/'
    os.makedirs(path, exist_ok=True)

    versions = _pickled_versions(path)
    last_ver = versions[-1][0] if versions else 0.0
    # round() keeps binary floating-point noise (1.2000000000000002) out of
    # the file name.
    new_ver = round(last_ver + 0.1, 1)

    target = f'{path}data_{new_ver}.npy'
    np.save(target, data)
    print(f'pickled at {target}')


def load_pickle():
    """Load the highest-versioned snapshot from ``DATA/pickled/``.

    Returns:
        The object stored in the snapshot (unwrapped with ``.item()``).

    Raises:
        FileNotFoundError: If the directory contains no versioned ``.npy`` file.
    """
    path = 'DATA/pickled/'
    versions = _pickled_versions(path)
    if not versions:
        raise FileNotFoundError(f'no versioned .npy snapshot found in {path}')

    # NOTE: allow_pickle=True unpickles arbitrary objects and can execute code
    # embedded in the file; only load snapshots this notebook wrote itself.
    return np.load(os.path.join(path, versions[-1][1]), allow_pickle=True).item()

def get_request(i, use_recom=False, retry_delay=60, timeout=30):
    """Fetch one product from the Digikala product API, retrying on failure.

    Network errors and non-2xx responses are retried every ``retry_delay``
    seconds until a 2xx response arrives.
    NOTE: a permanently failing id (e.g. a 404) is therefore retried forever.

    Args:
        i: Product id, interpolated into the request URL.
        use_recom: Unused; kept so existing calls keep working.
        retry_delay: Seconds to sleep before retrying after a failure.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded JSON body when it has no ``'recommendations'`` key under
        ``'data'``; ``None`` when it does.
    """
    url = f'https://api.digikala.com/v2/product/{str(i)}/'
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; Ctrl-C still interrupts.
            print(f'network interupted: {retry_delay}s')
            time.sleep(retry_delay)
            continue

        if not r.ok:
            # Error pages are not guaranteed to be JSON, so fall back to the
            # HTTP status code when the body cannot be read.
            try:
                status = r.json()['status']
            except (ValueError, KeyError, TypeError):
                status = r.status_code
            print(f'err: {status} for id:{i} trying again!! in {retry_delay}s')
            time.sleep(retry_delay)
            continue

        c = r.json()
        if 'recommendations' not in c['data']:
            return c
        return None
            
def get_image(url, retry_delay=60, timeout=30):
    """Download *url* and return the raw response body, retrying on failure.

    Network errors and non-2xx responses are retried every ``retry_delay``
    seconds until a 2xx response arrives.

    Args:
        url: Address of the image to download.
        retry_delay: Seconds to sleep before retrying after a failure.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        bytes: The response content.
    """
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; Ctrl-C still interrupts.
            print(f'network interupted: {retry_delay}s')
            time.sleep(retry_delay)
            continue

        if r.ok:
            return r.content

        # The original message interpolated an undefined name `i`; report the
        # URL, which is the only identifier this function has.
        print(f'err: bad status for url:{url} trying again!! in {retry_delay}s')
        time.sleep(retry_delay)
            
# Sentinel distinguishing "key path absent" from a stored value of None.
_MISSING = object()


def _has_entries(value):
    """Return True unless *value* is an empty list."""
    return value != []


# (output column, key path inside a raw record, optional post-processing).
# The order here is the column order of the resulting feature table.
_FEATURES = (
    ('id', ('data', 'product', 'id'), None),
    ('title', ('data', 'product', 'title_fa'), None),
    ('category', ('data', 'product', 'category', 'id'), None),
    ('brand', ('data', 'product', 'brand', 'id'), None),
    ('is_premium', ('data', 'product', 'brand', 'is_premium'), None),
    ('is_miscellaneous', ('data', 'product', 'brand', 'is_miscellaneous'), None),
    ('r_rate', ('data', 'product', 'rating', 'rate'), None),
    ('r_count', ('data', 'product', 'rating', 'count'), None),
    ('is_fake', ('data', 'product', 'properties', 'is_fake'), None),
    ('is_jet', ('data', 'product', 'properties', 'is_jet_eligible'), None),
    ('is_med', ('data', 'product', 'properties', 'is_medical_supplement'), None),
    ('has_size_guide', ('data', 'product', 'size_guide'), _has_entries),
    ('price', ('data', 'intrack', 'eventData', 'unitPrice'), None),
)


def _dig(record, path):
    """Follow *path* through nested containers; return ``_MISSING`` if any step fails."""
    node = record
    for key in path:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            return _MISSING
    return node


def _ensure_image(product_id, record):
    """Download the product image to ``DATA/Images/<id>.png`` unless a readable one exists."""
    image_path = f'DATA/Images/{product_id}.png'
    try:
        # Opening is only a validity probe; the context manager releases the
        # file handle again immediately.
        with Image.open(image_path):
            return
    except Exception:
        pass  # missing or unreadable file -> (re)download below

    url = _dig(record, ('data', 'seo', 'twitter_card', 'image'))
    if url is _MISSING or not url:
        # One record without an image URL must not abort a multi-hour run.
        print(f'no image url for id:{product_id}, skipping image download')
        return

    img = get_image(url)
    with open(image_path, 'wb') as handler:
        handler.write(img)


def clean_data(data):
    """Flatten raw API responses into one feature dict per product.

    For every key of *data* the fields listed in ``_FEATURES`` are extracted;
    a field whose key path is absent becomes ``None``. As a side effect the
    product image is downloaded to ``DATA/Images/<key>.png`` when no readable
    file is there yet.

    Args:
        data: ``{product_id: raw_response}`` mapping.

    Returns:
        dict: ``{product_id: {column: value}}`` with the columns in
        ``_FEATURES`` order.
    """
    os.makedirs('DATA/Images', exist_ok=True)

    cleaned = {}
    for k in tqdm.tqdm(data.keys()):
        record = data[k]
        row = {}
        for column, path, transform in _FEATURES:
            value = _dig(record, path)
            if value is _MISSING:
                row[column] = None
            elif transform is None:
                row[column] = value
            else:
                row[column] = transform(value)
        cleaned[k] = row

        _ensure_image(k, record)
    return cleaned

In [4]:
# Presumably a manual sanity check: fetches one product from the live API and
# displays the returned value as the cell output (requires network access).
get_request(2110364)

In [5]:
# Populate `data_dict` from the source selected in the config cell.
if LOAD_JSON:
    data_dict = load_data_from_json()
elif LOAD_PICKLE:
    data_dict = load_pickle()
else:
    # Every later cell reads `data_dict`; fail here with a clear message
    # instead of a NameError further down.
    raise ValueError('set LOAD_JSON or LOAD_PICKLE to True in the config cell')

In [6]:
# Id tables shipped with the data set.
df_train = pd.read_csv('DATA/train.csv')
df_nominated = pd.read_csv('DATA/nominated_p_ids.csv', index_col=0)
df_test = pd.read_csv('DATA/test_ids.csv')

# Every product id mentioned by any of the three tables, each kept once.
unique_ids = pd.concat(
    [
        df_train['source_product_id'],
        df_train['rel_product_id'],
        df_nominated['rel_product_id'],
        df_test['source_product_id'],
    ]
).drop_duplicates()


In [7]:
# Optional refresh: fetch every referenced id that is not cached yet, then
# store the enlarged dictionary as a new snapshot.
if UPDATE:
    for k in tqdm.tqdm(unique_ids.values):
        if k in data_dict:
            continue
        r = get_request(k)
        if r is None:
            continue
        print(f'adding id:{k}')
        data_dict[k] = r
    pickle_data(data_dict)
    

In [8]:
# Number of products currently held in memory.
len(data_dict)

70206

In [9]:
# Flatten every raw response into a feature dict. clean_data also downloads
# missing product images, so this cell can be slow: the run recorded below
# took about 6h11m for 70206 records.
cleaned_data = clean_data(data_dict)

 16%|███████████▉                                                              | 11348/70206 [42:34<2:45:45,  5.92it/s]

network interupted: 60s


 35%|█████████████████████████▏                                              | 24551/70206 [1:34:54<2:23:13,  5.31it/s]

network interupted: 60s


 53%|██████████████████████████████████████▎                                 | 37297/70206 [2:28:04<1:41:09,  5.42it/s]

network interupted: 60s


 53%|██████████████████████████████████████▎                                 | 37325/70206 [2:29:32<3:43:19,  2.45it/s]

network interupted: 60s


 72%|███████████████████████████████████████████████████▌                    | 50308/70206 [3:22:27<1:10:20,  4.71it/s]

network interupted: 60s


 76%|██████████████████████████████████████████████████████▋                 | 53378/70206 [3:36:12<1:02:24,  4.49it/s]

network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupted: 60s
network interupt

 80%|██████████████████████████████████████████████████████████▊               | 55850/70206 [5:07:47<43:55,  5.45it/s]

network interupted: 60s
network interupted: 60s


 94%|█████████████████████████████████████████████████████████████████████▊    | 66264/70206 [5:54:05<21:57,  2.99it/s]

network interupted: 60s


100%|██████████████████████████████████████████████████████████████████████████| 70206/70206 [6:11:35<00:00,  3.15it/s]


In [10]:
# One row per product id, one column per cleaned feature. Column names come
# from the records themselves, so this no longer depends on a specific
# product id (350) being present in the data.
df_id2feat = pd.DataFrame(
    list(cleaned_data.values()),
    index=list(cleaned_data.keys()),
)

In [11]:
# Persist the feature table; the index (the keys of cleaned_data) is written
# as the first, unnamed CSV column.
df_id2feat.to_csv('DATA/id2feat3.csv')