In [4]:


from requests_html import AsyncHTMLSession
import requests
import concurrent
import asyncio
import random
from bs4 import BeautifulSoup
import pandas as pd
import json
import csv


In [5]:
#objects
class VgItem:
    """One product scraped from a listing page.

    All fields start as empty strings. `id`, `name`, `brand`, `price` and
    `detailsLink` are filled by `parseDataToItem`; the Arabic and image fields
    are filled later by `enrichItems`.
    """

    def __init__(self):
        self.id = ''
        self.name = ''
        self.brand = ''
        self.price = ''
        self.arName = ''
        self.arBrand = ''
        self.lowResImgUrl = ''
        self.mediumResImgUrl = ''
        self.highResImgUrl = ''
        self.arCategory = ''
        self.detailsLink = ''

    def parseDataToItem(self, data, dtLink):
        """Populate the basic fields from a JSON string.

        Args:
            data: JSON text holding an object with string values under the
                keys 'id', 'name', 'brand' and 'price'.
            dtLink: link to the product's details page; stored unchanged.

        Returns:
            True when every field was read, False otherwise. On failure the
            item is left untouched and a message is printed.
        """
        try:
            jData = json.loads(data)
            # Read everything into locals first so that a failure halfway
            # through cannot leave the item partially populated.
            itemId = jData['id'].strip()
            name = jData['name'].strip()
            brand = jData['brand'].strip()
            price = jData['price'].strip()
        except (ValueError, KeyError, TypeError, AttributeError):
            # ValueError: malformed JSON; KeyError: field missing;
            # TypeError: payload is None or not a JSON object;
            # AttributeError: a field is not a string.
            print('one item failed to be parsed ')
            return False

        self.id = itemId
        self.name = name
        self.brand = brand
        self.price = price
        self.detailsLink = dtLink
        return True
            
#helpers
def extractItemsFromPage(page):
    """Turn the product anchors of a listing page into VgItem objects.

    Every `a.js-gtmProdData` anchor is parsed from its `data-gtm-prod-data`
    attribute together with one of its absolute links. Anchors whose data
    cannot be parsed are left out of the result.

    Args:
        page: fetched page exposing `page.html.find(selector)`.

    Returns:
        List of the successfully parsed VgItem objects, in page order.
    """
    parsed_items = []
    for anchor in page.html.find('a.js-gtmProdData'):
        candidate = VgItem()
        raw_data = anchor.attrs['data-gtm-prod-data']
        details_link = anchor.absolute_links.pop()
        if not candidate.parseDataToItem(raw_data, details_link):
            continue
        parsed_items.append(candidate)
    return parsed_items

def SliceItems(items, sliceSize=190):
    """Split `items` into consecutive chunks of at most `sliceSize` elements.

    No empty chunk is ever produced: an empty input yields an empty list, and
    an input whose length is an exact multiple of `sliceSize` yields only
    full chunks.

    Args:
        items: sequence supporting `len()` and slicing.
        sliceSize: maximum chunk length; defaults to 190.

    Returns:
        List of slices of `items`, in order.

    Raises:
        ValueError: if `sliceSize` is not positive.
    """
    if sliceSize <= 0:
        raise ValueError('sliceSize must be a positive integer')
    return [items[start:start + sliceSize]
            for start in range(0, len(items), sliceSize)]

    
def stringIdsIlist(vgItems):
    """Return the ids of `vgItems` as integers.

    Each item's `id` string is stripped of surrounding whitespace and then
    converted with `int()`; an id that is not a valid integer raises
    ValueError.
    """
    return [int(entry.id.strip()) for entry in vgItems]

def constructAPIUrl(baseUrl, ids):
    """Build the products API URL for a list of ids.

    The ids are rendered as a comma-separated list inside square brackets,
    e.g. `<baseUrl>?ids=[1,2,3]`, with no trailing comma after the last id.

    Args:
        baseUrl: endpoint URL without a query string.
        ids: iterable of ids; each one is converted with `str()`.

    Returns:
        The URL string with the `ids` query parameter appended.
    """
    idList = ','.join(str(cId) for cId in ids)
    return f'{baseUrl}?ids=[{idList}]'


In [6]:
def serverHorder(idList, timeout=30):
    """Fetch product details for `idList` in the default language and in Arabic.

    Two GET requests are sent to the products API: one as-is and one with
    `&lang=ar` appended. Both carry the same `appId`/`storeId` headers.

    Args:
        idList: ids passed to `constructAPIUrl`.
        timeout: seconds to wait for each request before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        `[enJsonData, arJsonData]`, the decoded JSON bodies of both responses.

    Raises:
        requests.HTTPError: if either response has an error status code.
        requests.Timeout: if either request exceeds `timeout`.
    """
    requestHeaders = {'appId': "Reactweb", 'storeId': "mafegy"}
    ll = constructAPIUrl('https://www.carrefouruae.com/api/v1/solr/products', idList)

    fullEnData = requests.get(ll, headers=requestHeaders, timeout=timeout)
    # Fail here with a clear HTTP error instead of later on a missing 'data' key.
    fullEnData.raise_for_status()
    enJsonData = fullEnData.json()

    arLink = ll + "&lang=ar"
    print(arLink)
    fullArData = requests.get(arLink, headers=requestHeaders, timeout=timeout)
    fullArData.raise_for_status()
    arJsonData = fullArData.json()
    return [enJsonData, arJsonData]
  

def enrichItems(enJsonData, arJsonData, items, lastIndex):
    """Copy image URLs and Arabic fields from the API payloads onto `items`.

    Record `i` of `enJsonData['data']` and `arJsonData['data']` is applied to
    `items[lastIndex + i]`.
    NOTE(review): this positional pairing assumes the API returns one record
    per requested id, in request order — confirm against the API.

    The image URLs come from the English payload (`images[0]` low, `images[2]`
    medium, `images[1]` high resolution). The Arabic name, brand and category
    come from the Arabic payload, so the category matches the
    'Arabic Category' CSV column.

    A record with missing or malformed data is reported and skipped; fields
    assigned before the problem was hit keep their new values. Processing
    stops quietly once `items` is exhausted.

    Args:
        enJsonData: decoded default-language response with a 'data' list.
        arJsonData: decoded Arabic response with a 'data' list.
        items: objects to update in place.
        lastIndex: index in `items` of the object matching the first record.
    """
    for i, enRecord in enumerate(enJsonData['data']):
        if lastIndex >= len(items):
            # More records than items: nothing left to enrich.
            break
        item = items[lastIndex]
        lastIndex += 1
        try:
            images = enRecord['images']
            item.lowResImgUrl = images[0]['url']
            item.mediumResImgUrl = images[2]['url']
            item.highResImgUrl = images[1]['url']
            arRecord = arJsonData['data'][i]
            item.arCategory = arRecord['productCategoriesHearchi'].strip()
            item.arName = arRecord['name'].strip()
            item.arBrand = arRecord['brandName'].strip()
        except (KeyError, IndexError, TypeError, AttributeError):
            # KeyError: field absent; IndexError: fewer than three images or no
            # matching Arabic record; TypeError/AttributeError: null values.
            print(f'item {item.name} has missing key ')

def writeCsv(categoryName, items):
    """Write `items` to `<categoryName>.csv`, replacing any existing file.

    The file is opened with `newline=''`, as the csv module requires, so no
    blank line appears between rows on platforms that translate line endings.
    It is encoded as UTF-8 so the Arabic columns are written correctly
    whatever the platform's default encoding is.

    Args:
        categoryName: output path without the `.csv` extension.
        items: objects exposing name, brand, price, arName, arBrand,
            arCategory, highResImgUrl, mediumResImgUrl and lowResImgUrl.
    """
    with open(f'{categoryName}.csv', mode='w', newline='', encoding='utf-8') as data_file:
        data_writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        data_writer.writerow(['Name', 'Brand', 'Price', 'Ar Name', 'Ar Brand', 'Arabic Category',
                              'HighResImg', 'MediumResImg', 'LowResImg'])
        for vgItem in items:
            print(f"en NAme {vgItem.name} ar name {vgItem.arName}")
            # str() mirrors the original f-string formatting of every cell.
            data_writer.writerow([str(vgItem.name), str(vgItem.brand), str(vgItem.price),
                                  str(vgItem.arName), str(vgItem.arBrand), str(vgItem.arCategory),
                                  str(vgItem.highResImgUrl), str(vgItem.mediumResImgUrl),
                                  str(vgItem.lowResImgUrl)])
    

In [7]:

# Main scraping function
async def main(categoryIndex=4):
    """Crawl one category, enrich the items through the API and write a CSV.

    Steps:
      1. Follow the listing's "next page" link until there is none, collecting
         the items of every page.
      2. Split the items into slices and, per slice, fetch the product details
         and copy them onto the items.
      3. Write all items to `<category name>.csv`.

    Args:
        categoryIndex: index into the category tables below; the default, 4,
            selects 'fresh-food'.
    """
    base_urls = ['https://www.carrefouregypt.com/mafegy/en/bakery/c/FEGY1610000',
                 'https://www.carrefouregypt.com/mafegy/en/frozen-food/c/FEGY6000000',
                 'https://www.carrefouregypt.com/mafegy/en/beverages/c/FEGY1500000',
                 'https://www.carrefouregypt.com/mafegy/en/cleaning-household/c/NFEGY3000000',
                 'https://www.carrefouregypt.com/mafegy/en/fresh-food/c/FEGY1600000']
    category = ['bakery', 'frozen-food', 'beverages', 'cleaning-household', 'fresh-food']
    asession = AsyncHTMLSession()

    page_url = base_urls[categoryIndex]
    scrappedData = []
    while page_url is not None:
        page = await asession.get(page_url)
        scrappedData.extend(extractItemsFromPage(page))

        # The pagination anchor may be absent (None) or carry no link on the
        # last page; both cases end the crawl.
        link = page.html.find('a.plp-pagination__navnext', first=True)
        nextLinks = link.absolute_links if link is not None else set()
        if nextLinks:
            page_url = nextLinks.pop()
            print(page_url)
        else:
            page_url = None

        print("---------------")
    print(f"scrapped data size = {len(scrappedData)}")
    slicedItems = SliceItems(scrappedData)

    print(f"number of item slices = {len(slicedItems)}")
    for ii, itemsSlice in enumerate(slicedItems):
        print(f'ii = {ii}')
        if not itemsSlice:
            # Nothing to look up; avoid an API call with an empty id list.
            continue
        idList = stringIdsIlist(itemsSlice)
        enJsonData, arJsonData = serverHorder(idList)
        # Each slice is enriched on its own, so its first item is at index 0.
        enrichItems(enJsonData, arJsonData, itemsSlice, 0)

    spredList = [item for itemsSlice in slicedItems for item in itemsSlice]
    writeCsv(category[categoryIndex], spredList)



In [8]:
# Run the scraper; the category to crawl is selected inside main().
# Awaiting at cell scope relies on the notebook kernel's top-level await support.
await main()


https://www.carrefouregypt.com/mafegy/en/fresh-food/c/FEGY1600000?&qsort=relevance&pg=1
---------------
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
https://www.carrefouregypt.com/mafegy/en/fresh-food/c/FEGY1600000?&qsort=relevance&pg=2
---------------
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
one item failed to be parsed 
https://www.carrefouregypt.com/mafegy/en/fresh-food/c/FEGY1600000?&qsort=relevance&pg=3
---------------
https://www.carrefouregypt.com/mafegy/en/fresh-food/c/FEGY1600000?&qsort=relevance&pg=4
---------------
https://www.carrefouregypt.com/mafegy/en/fresh-food/c/FEGY1600000?&qsort=r

In [83]:
# NOTE(review): this cell fails with the NameError shown below because
# `scrappedData` is a local variable of main() and is never bound at notebook
# scope; the VgItem class in this notebook also defines no `imgUrl` attribute.
scrappedData[0].imgUrl

NameError: name 'scrappedData' is not defined

In [None]:
# Old functions (earlier versions)
def extractItemsFromPage(page):
    """Pair every product anchor's item with one of its absolute links.

    NOTE(review): this version passes the raw `data-gtm-prod-data` value to
    the `VgItem` constructor, while the `VgItem` class in this notebook takes
    no constructor arguments — presumably it targets an earlier `VgItem`;
    confirm before running this cell.

    Returns:
        List of `[VgItem, link]` pairs, one per `a.js-gtmProdData` anchor.
    """
    return [
        [VgItem(anchor.attrs['data-gtm-prod-data']), anchor.absolute_links.pop()]
        for anchor in page.html.find('a.js-gtmProdData')
    ]

def highResImageLinker(itemsWlinks):
    """Fetch each item's details page and store its image URL on the item.

    The image URL is extracted by calling `loadImg(page)` directly. The page
    download is already a blocking call, so handing the parsing to a thread
    pool only left an unresolved Future in `imgUrl` and created one executor
    per item that was never shut down.

    This expects the one-argument `loadImg(page)` that returns the URL.

    Args:
        itemsWlinks: iterable of `[vgItem, detailsLink]` pairs.

    Returns:
        List of the items, each with `imgUrl` set to the extracted URL.
    """
    items = []
    for vgItem, detailsLink in itemsWlinks:
        detailed_page = requests.get(detailsLink)
        vgItem.imgUrl = loadImg(detailed_page)
        items.append(vgItem)
        print(vgItem.imgUrl)
    return items
def loadImg(page ):
    """Return the `data-lazy` attribute of the product slider's image.

    The page text is parsed with BeautifulSoup's "html.parser"; the image is
    the `img` inside the first `div` with class `productinfo-slider`.
    """
    soup = BeautifulSoup(page.text, "html.parser")
    return soup.find("div", "productinfo-slider").img.attrs['data-lazy']

In [None]:

    # Save the page's HTML to disk for offline inspection.
    with open('frozen-food.html', 'w', encoding='utf-8') as file:
        # write() needs text. A non-str `page` is assumed to be a response
        # object exposing `.text`, as `page.text` is used elsewhere in this
        # notebook — TODO confirm.
        file.write(page if isinstance(page, str) else page.text)

In [None]:
# Scratch cell: collect [details link, raw product data] for every product
# anchor of `page`, then build the matching items.
items_list = page.html.find('a.js-gtmProdData')
items_details = []
for item in items_list:
    # Pop the link once and reuse it: the original stored the bound method
    # `absolute_links.pop` itself instead of the link it returns.
    detailsLink = item.absolute_links.pop()
    print(detailsLink)
    print("----------------")
    items_details.append([detailsLink, item.attrs['data-gtm-prod-data']])

items = []
for [baseLink, item] in items_details:
    # VgItem takes no constructor arguments; its data is loaded through
    # parseDataToItem, which returns False when the payload cannot be parsed.
    vgItem = VgItem()
    if vgItem.parseDataToItem(item, baseLink):
        items.append([vgItem, baseLink])

In [None]:
        

async def highResImageLinker(itemsWithDetailsLink, executor, minDelay=0.200, maxDelay=0.500):
    """Download each item's details page and set its image URL.

    This is a coroutine that returns a list, so it can be wrapped in
    `asyncio.create_task(...)` and awaited. For every pair it waits a random
    delay, awaits the page download, then runs `loadImg(page, item)` in
    `executor` and waits for it to finish.

    Args:
        itemsWithDetailsLink: iterable of `[vgItem, detailsLink]` pairs.
        executor: executor passed to `loop.run_in_executor`; None selects the
            event loop's default executor.
        minDelay: lower bound, in seconds, of the pause before each request.
        maxDelay: upper bound, in seconds, of that pause.

    Returns:
        List of the items, in input order, after `loadImg` ran on each.
    """
    loop = asyncio.get_running_loop()
    imageASession = AsyncHTMLSession()
    linkedItems = []
    for vgItem, detailsLink in itemsWithDetailsLink:
        # Random pause between requests to avoid hitting the site in bursts.
        await asyncio.sleep(random.uniform(minDelay, maxDelay))
        detailed_page = await imageASession.get(detailsLink)
        await loop.run_in_executor(executor, loadImg, detailed_page, vgItem)
        linkedItems.append(vgItem)
    return linkedItems
      
    
def loadImg(page, item ):
    """Store the product slider image's `data-lazy` value in `item.imgUrl`.

    The image is the first element matching `div.productinfo-slider img` in
    `page.html`. Nothing is returned; `item` is updated in place.
    """
    print("loadingImage")
    slider_image = page.html.find("div.productinfo-slider img", first=True)
    lazy_source = slider_image.attrs['data-lazy']
    item.imgUrl = lazy_source