In [1]:
import asyncio
import re
import time
import pandas as pd
import numpy as np
from playwright.async_api import async_playwright
from sklearn.feature_extraction.text import TfidfVectorizer
import aiohttp        
import aiofiles
import time
from tqdm.notebook import tqdm

SELECTING WORDS

In [2]:
# Folder that holds the input CSV and receives every scraped artefact.
# The commented alternative below is kept for runs started from another
# working directory.
base_path = '../data'
# base_path = 'data'
train_csv_path = base_path + '/train.csv'
train_df = pd.read_csv(train_csv_path)


In [3]:
# Keep only the rows whose label_group contains fewer than six titles,
# then list those label groups in order of first appearance.
groups = (
    train_df
    .groupby(['label_group'])
    .filter(lambda member_rows: member_rows['title'].size < 6)
)
labels = groups['label_group'].drop_duplicates().tolist()


In [4]:
# Step 1: interpret the backslash escape sequences stored literally in the
# titles (latin1 -> unicode-escape), then re-read the resulting bytes as UTF-8.
groups['title'] = [
    raw_title.encode('latin1').decode('unicode-escape').encode('latin1').decode('utf-8')
    for raw_title in groups['title']
]
# Step 2: lower-case, then collapse every run of characters outside
# [0-9a-zA-Z ,.] into a single space.
groups['title'] = (
    groups['title']
    .str.lower()
    .map(lambda lowered_title: re.sub('[^0-9a-zA-Z ,.]+', ' ', lowered_title))
)

In [5]:
# groups['len_words'] = [len(re.sub('[^0-9a-zA-Z ,.]+', ' ', x).split()) for x in groups['title']]

In [5]:
# Group the cleaned rows by label so each label's titles can be fetched by key.
grouped = groups.groupby(by='label_group')

In [6]:
def _build_product(label, titles, max_words=5):
    """Build the search query and the title-matching regex for one label group.

    Args:
        label: the label_group identifier stored in the returned record.
        titles: pandas Series of cleaned titles belonging to that label group.
        max_words: how many of the highest-scoring TF-IDF terms to keep.

    Returns:
        dict with keys ``label_group``, ``query`` (search URL with spaces
        replaced by ``%20``) and ``words`` (a regex of the form
        ``^(?=.*w1)(?=.*w2)...`` that requires every kept term to occur).
    """
    vectorizer = TfidfVectorizer()
    # Only the fitted vocabulary/idf is needed; the per-title matrix is not.
    vectorizer.fit(titles)
    # Score the terms against all titles of the group joined into one document.
    joined_scores = vectorizer.transform([' '.join(titles)])
    vocabulary = vectorizer.get_feature_names_out()

    if vocabulary.shape[0] > max_words:
        # Highest score first; keep the top `max_words` terms for both the
        # search keyword and the matching regex.
        ranking = np.argsort(joined_scores.toarray()).flatten()[::-1]
        required_words = vocabulary[ranking][:max_words].tolist()
        keyword = " ".join(required_words)
    else:
        # Small vocabulary: search with the first title verbatim and require
        # every vocabulary term in the regex.
        required_words = vocabulary.tolist()
        keyword = titles.iloc[0]

    lookaheads = ''.join('(?=.*{})'.format(word) for word in required_words)
    return {
        'label_group': label,
        'query': ('https://shopee.co.id/search?keyword=' + keyword).replace(" ", "%20"),
        'words': r'^{}'.format(lookaheads),
    }


# One record per label, in the same order as `labels` (later cells address
# products by position, so the order must not change).
products = [
    _build_product(label, grouped.get_group(label)['title'])
    for label in labels
]

SCRAPING

In [7]:
async def download_img(url,title,label_group):
    """Download ``url`` and store the response body in the file ``title``.

    Args:
        url: address of the resource to fetch.
        title: destination file path (despite its name, this is the path
            that gets written).
        label_group: not used by the download itself; kept so existing
            callers keep working.

    Returns:
        True when the server answered with status 200 and the body was
        written, False for any other status.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            if resp.status != 200:
                return False
            # Read the whole body first so a failed read does not leave an
            # empty file behind.
            payload = await resp.read()

    # The context manager closes the handle even when the write raises.
    async with aiofiles.open(title, mode='wb+') as out_file:
        await out_file.write(payload)
    return True


async def get_info(item,label_group,words):
    """Inspect one search-result image and download it when its title matches.

    Args:
        item: element handle exposing an awaitable ``get_attribute(name)``;
            its ``alt`` attribute is used as the title and ``src`` as the
            image link.
        label_group: label stored in the record and appended to the file name.
        words: regex (as produced for ``product['words']``) that the cleaned,
            lower-cased title must match from its start.

    Returns:
        A dict with ``url``, ``image``, ``title`` and ``label_group`` when the
        title matches, otherwise None. None is also returned when the element
        has no ``alt`` or no ``src`` attribute.
    """
    title = await item.get_attribute('alt')
    if title is None:
        # No alt text: nothing to match against (re.sub would raise on None).
        return None

    cleaned = re.sub('[^0-9a-zA-Z ,]+', ' ', title)
    if not re.match(words, cleaned.lower()):
        return None

    link = await item.get_attribute('src')
    if not link:
        # Image without a usable src cannot be downloaded.
        return None

    # Drops the last three characters of the src value
    # (presumably a thumbnail suffix -- TODO confirm).
    url = link[:-3]

    # File name: first six words of the cleaned title, a random number in
    # [0, 60) and the label group, with spaces turned into underscores.
    name = " ".join(cleaned.split()[:6])
    name = base_path + '/scrap_image/' + name + "_" + str(np.random.randint(0, 60)) + "_" + str(label_group)  + '.jpg'
    name = name.replace(' ', "_")

    # NOTE(review): the record is returned whether or not the download
    # succeeded, exactly as before; confirm whether failed downloads should
    # really be listed.
    await download_img(url,name,label_group)
    return {
        'url': url,
        'image': name,
        'title': title,
        'label_group': label_group,
    }

async def scrape_category(product,timeout=30000):
    """Open one product's search page and collect the matching result images.

    Args:
        product: dict with ``query`` (search URL), ``label_group`` and
            ``words`` (title regex), as built for the ``products`` list.
        timeout: milliseconds to wait for the result container to appear.

    Returns:
        List with one entry per result image: the dict returned by
        ``get_info`` for matching titles, None for the others.
    """
    item_selector = '._7DTxhh.vc8g9F'
    async with async_playwright() as pw:
        browser = await pw.chromium.launch( headless=True)
        try:
            page = await browser.new_page()
            await page.goto(product['query'])
            result_container = await page.wait_for_selector('.shopee-search-item-result', timeout=timeout)
            # Jump to the bottom of the page three times so further result
            # images get a chance to load. `press` sends key-down AND key-up;
            # a bare key-down would leave the End key held.
            for _ in range(3):
                await page.wait_for_selector(item_selector)
                await page.keyboard.press('End')
            all_items = await result_container.query_selector_all(item_selector)
            tasks = [
                asyncio.create_task(get_info(item,product['label_group'],product['words']))
                for item in all_items
            ]
            return await asyncio.gather(*tasks)
        finally:
            # Close the browser even when navigation or a wait raised.
            await browser.close()

async def scraper(products, timeout=30000):
    """Scrape several products concurrently.

    Starts one ``scrape_category`` task per entry of ``products`` (each with
    the given ``timeout``) and returns their results as a list, in the same
    order as ``products``.
    """
    pending = [
        asyncio.create_task(scrape_category(entry, timeout=timeout))
        for entry in products
    ]
    return await asyncio.gather(*pending)

In [8]:
# async def test_scrape(threads, timeout=30000):
#     start = time.time()
#     result = []
#     for i in range(0,21,threads):
#         print(f"CURR is {i}")
#         if i + threads < 21 -1:
#             res= await scraper(products[i:i+threads],timeout=timeout)
#         else:
#             res= await scraper(products[i:], timeout=timeout)
#         result = result + res
#     end = time.time()
#     print(end - start)
#     print(len(result))
#     return result




In [8]:
# await test_scrape(2)

In [10]:
# NOTE(review): presumably the product positions that an earlier scraping run
# gave up on (same shape as the `error_list` returned by the drivers) -- confirm.
old_error_list = [58, 71, 103, 104, 116, 131, 156]

In [9]:
async def scrape_many_threads(products,result=None,start=0,end=10088,max_threads=2,max_timeout=60000):
    """Scrape ``products[start:end]`` in batches, adapting batch size and timeout.

    Products are handed to ``scraper`` in batches of ``threads`` entries.
    After every 25 successful batches the batch size and the timeout are
    doubled (capped by ``max_threads`` / ``max_timeout``). When a batch
    fails, the timeout is raised first, then the batch size is halved, and
    after repeated failures the product at ``start`` is recorded in the error
    list and skipped.

    Args:
        products: list of product dicts as consumed by ``scraper``.
        result: optional list of records collected earlier; it is copied,
            never modified.
        start: position of the first product to scrape.
        end: position one past the last product to scrape; values beyond
            ``len(products)`` are clamped.
        max_threads: upper bound for the number of products per batch.
        max_timeout: upper bound for the timeout in milliseconds.

    Returns:
        ``(result, error_list)``: the collected record dicts and the
        positions that were skipped after repeated failures.

    Side effects:
        Writes intermediate CSV files under ``base_path + '/temp/'`` and the
        final table to ``base_path + '/scrap.csv'``.
    """
    def write_rows(rows, path):
        # Dump the records gathered so far as a CSV table.
        pd.DataFrame.from_dict(rows).to_csv(path)

    def checkpoint_path(position):
        return base_path + '/temp/scrap_temp_s_'+ str(first_start)+ "_" + str(position) + '.csv'

    # Avoid a shared mutable default and never touch the caller's list.
    result = [] if result is None else list(result)
    # Never iterate past the data that actually exists.
    end = min(end, len(products))

    first_start = start
    prev_start = 0
    ok_count = 0
    error_list = []
    error_count = 0
    threads = min_threads = 1
    timeout = min_timeout = 30000

    # `start < end` (not `end - 1`) so the last product is still attempted
    # when an earlier skip leaves `start` on it.
    while start < end:
        try:
            for i in tqdm(range(start,end,threads)):
                prev_start = start
                # Bound the batch by `end` so a partial range never spills
                # into the products that follow it.
                batch = products[i:min(i + threads, end)]
                res = await scraper(batch,timeout=timeout)
                result.extend(x for y in res for x in y if isinstance(x, dict))
                start = i + threads
                ok_count += 1
                if start % 2 == 0 or start % 3 == 0 or start % 5 == 0:
                    write_rows(result, checkpoint_path(start))
                if ok_count % 25 == 0:
                    # Things are going well: scale up and restart the range
                    # so the new step size takes effect.
                    error_count = 0
                    timeout = min(max_timeout,timeout * 2)
                    threads = min(max_threads, threads * 2)
                    break
        except KeyboardInterrupt:
            write_rows(result, checkpoint_path(start))
            print(start)
            print(error_list)
            return result, error_list
        except Exception as exc:
            # Only ordinary errors are retried; task cancellation and
            # interpreter shutdown are allowed to propagate.
            print(f'scrape failed: {exc!r}')
            # `start` unchanged means the batch itself failed (not the
            # checkpoint that follows a successful batch).
            if prev_start == start:
                ok_count = 0
                error_count += 1
            print(f'start = {start} threads = {threads}, timeout = {timeout}, error= {error_count}')
            if error_count >= 3 or (timeout == max_timeout and threads == min_threads):
                # Give up on this position and fall back to the most
                # conservative settings.
                error_count = 0
                error_list.append(start)
                start = start + 1
                timeout = min_timeout
                threads = min_threads
                print(error_list)
                continue
            if  error_count >= 2 or timeout == max_timeout:
                threads = max (threads // 2, min_threads)
                continue
            if timeout < max_timeout:
                timeout = min(timeout * 1.5, max_timeout)
                continue

    write_rows(result, base_path + '/scrap.csv')
    return result, error_list

In [None]:
results, error_list = await scrape_many_threads(products,start=161,max_threads=12,max_timeout=60000)

  0%|          | 0/9927 [00:00<?, ?it/s]

start = 161 threads = 1, timeout = 30000, error= 1


  0%|          | 0/9927 [00:00<?, ?it/s]

In [10]:
# async def scrape_many(products,result=[],start=0,end=10088):
#     prev_start = 0
#     error_count = 0
#     error_list = []
#     def_timeout  = timeout = 30000
#     while start < end - 1:
#         try:
#             for i in tqdm(range(start,end)):
#                 prev_start = start
#                 res = await scrape_category(products[i],timeout)
#                 result = result +  [x for x in res if type(x) is dict ]
#                 start = i + 1
#                 if i % 10 == 0 or i % 3 == 0 :
#                     test = pd.DataFrame.from_dict(result)
#                     test.to_csv(base_path + '/temp/scrap_temp_s_' + str(start) + "_" + str(time.time()) + '.csv')
#         except KeyboardInterrupt :
#             test = pd.DataFrame.from_dict(result)
#             test.to_csv(base_path + '/temp/scrap_temp_s_' + str(start) + "_" + str(time.time()) + '.csv')
#             print(start)
#             print(error_list)
#             return result, error_list
#         except :
#             if prev_start == start:
#                 error_count += 1
#             if timeout < 60000:
#                 timeout = min(timeout * 1.5, 60000)
#             if error_count > 3 or timeout == 60000:
#                 error_count = 0
#                 error_list.append(start)
#                 start = start + 1
#                 timeout = def_timeout
#             continue
#     test = pd.DataFrame.from_dict(result)
#     test.to_csv(base_path + '/scrap.csv')
#     return result, error_list

In [None]:
# results, error_list = await scrape_many(products,start=272)

In [None]:
results, error_list = await scrape_many(products,start=272)

In [11]:
# Show the product positions the scraping driver skipped after repeated
# failures; requires one of the driver cells above to have completed.
print(error_list)

NameError: name 'error_list' is not defined