In [25]:
import json
import re
from requests_html import AsyncHTMLSession
from utils import getStringBetweenTwoWords, extract_and_stringify_object , printJsonToFile,printSoupToHtml
from Product import ProductDetailDTO
#rewrite scrape_Jarir_full function with the object extraction 
def construct_arabic_name(data):
    """Assemble a name from the fields listed in ``data["naming_formula"]``.

    The formula is cut on the ``"#,*"`` separator and then on ``"*"``; every
    resulting token has its ``#`` characters removed and is stripped of
    surrounding whitespace. Tokens that are keys of ``data`` contribute their
    value, in formula order; all other tokens are ignored.

    Args:
        data: Mapping that may contain a ``naming_formula`` string plus the
            fields that formula refers to.

    Returns:
        The selected values joined by single spaces, or ``""`` when the
        formula is missing or none of its tokens is a key of ``data``.
    """
    formula = data.get("naming_formula", "")

    # Flatten the two-level split into one stream of cleaned field names.
    field_names = (
        token.replace("#", "").strip()
        for segment in formula.split("#,*")
        for token in segment.split("*")
    )

    return " ".join(data[name] for name in field_names if name in data)

async def _fetch_jarir_product(session, url):
    """Download ``url`` and return the product object embedded in the page as a dict.

    The text between the ``'original:'`` and ``',related'`` markers is taken
    from the decoded response body, minified JavaScript booleans are rewritten
    to JSON literals, and the result is passed through
    ``extract_and_stringify_object`` before ``json.loads``.
    """
    response = await session.get(url)
    page_source = response.content.decode()
    raw_object = getStringBetweenTwoWords(page_source, 'original:', ',related')
    # In JavaScript `!0` evaluates to true and `!1` to false; the previous
    # mapping had the two swapped, which inverted every boolean in the object.
    raw_object = raw_object.replace('!0', 'true').replace('!1', 'false')
    return json.loads(extract_and_stringify_object(raw_object))


def _jarir_spec_description(product):
    """Return the quoted spec values of ``product`` as one comma-separated string.

    Works on ``str(product)``: the text between ``'homedelivery_enable'`` and
    ``'check_availability_status'`` is split on commas, fragments containing a
    colon are kept, and every single-quoted value following ``": "`` is
    collected.
    """
    spec_section = getStringBetweenTwoWords(str(product), 'homedelivery_enable', 'check_availability_status')
    key_value_text = ', '.join(spec.strip() for spec in spec_section.split(',') if ":" in spec)
    values = re.findall(r": '([^']*)'", key_value_text)
    # The original code always prepends a literal "1" as the first value.
    # NOTE(review): presumably this stands in for the unquoted value right
    # after 'homedelivery_enable', which the regex cannot capture - confirm.
    values.insert(0, "1")
    return ', '.join(values)


async def scrape_Jarir_full(url):
    """Scrape a Jarir product page in English and Arabic.

    The English page supplies the global name, availability, price, rating,
    image URLs and the global description; the Arabic page supplies the local
    name and local description.

    Args:
        url: Product URL on jarir.com.

    Returns:
        A ``ProductDetailDTO`` populated from both pages. ``productlink1`` is
        the URL used for the second (Arabic) request.
    """
    # Point the URL at the English storefront: swap an "ar-sa" segment for
    # "en-sa", then add "/sa-en" after the domain when it is not present.
    if "ar-sa" in url:
        url = url.replace("ar-sa", "en-sa")
    if "sa-en" not in url:
        url = url.replace("jarir.com", "jarir.com/sa-en")

    session = AsyncHTMLSession()
    try:
        # --- English (global) extraction ---
        product_en = await _fetch_jarir_product(session, url)
        description = _jarir_spec_description(product_en)
        name_global = product_en.get("GTM_name", "")
        isAvailable = product_en.get("check_availability_status", 0) == 1
        PriceAfterDiscount = product_en.get("final_price_ex_tax", product_en.get("price", None))
        rating = product_en.get("starRatings", None)
        images = [
            "https://ak-asset.jarir.com/akeneo-prod/asset/" + image["image"]
            for image in product_en.get("media_gallery", [])
            if image.get("type") == "image"
        ]

        # --- Arabic (local) extraction ---
        # NOTE(review): this only rewrites URLs that contain "sa-en/en-sa". An
        # input that already had "sa-en" (and no "ar-sa") is left unchanged, so
        # the second request would fetch the English page again - confirm the
        # intended Arabic URL scheme for that case.
        url = url.replace("sa-en/en-sa", "ar-sa")
        product_ar = await _fetch_jarir_product(session, url)
        name_Local = product_ar.get("name", "")
        description_Local = _jarir_spec_description(product_ar)
    finally:
        # Release the session's connections even when a request or parse fails.
        await session.close()

    item_data = ProductDetailDTO(
        description_Local=description_Local,
        name_Global=name_global,
        name_Local=name_Local,
        price=PriceAfterDiscount,
        images=images,
        productlink1=url,
        description_Global=description,
        is_available=isAvailable,
        rating=rating,
    )
    return item_data
#Test
url = 'https://www.jarir.com/ar-sa/samsung-qa55s90dauxsa-smart-tv-632865.html'
x=await scrape_Jarir_full(url)
print("name_global:",x.name_Global)
print("name_local:",x.name_Local)
print("isAvailable:",x.is_available)    
print("price:",x.price)
print("rating:",x.rating)
print("description:",x.description_Global)
print("Description_Local:",x.description_Local)
print("images:",x.images)



https://www.jarir.com/sa-en/en-sa/samsung-qa55s90dauxsa-smart-tv-632865.html
name_global: Samsung 55" Smart TV, 4K OLED, OLED, Graphite Black, QA55S90DAUXSA
name_local: سامسونج 55 بوصة تلفزيون ذكي، 4‎K اوليد، او ال اي دي، أسود الجرافيت، دي أيه يو اكس أس أيه ‎90‎‎ اس ‎55‎‎ كيو ايه
isAvailable: True
price: 6433.913
rating: 5
description: 1, AV Cable/Sound Bar/Home Theater/Wireless/Screen Cleaning Kit/Cleaning Wipes, Black, NQ4 AI Gen2, Bluetooth/WiFi/Wireless, OLED, Graphite Black, 144 Hz, 4 Port HDMI, HDR+, 3840 X 2160p, Tizen 2.0, 100 - 240 Volts, Smart TV, 55", 4K OLED, Samsung Series 9, 2.1CH (40W), n/a, n/a
Description_Local: 1, AV Cable/Sound Bar/Home Theater/Wireless/Screen Cleaning Kit/Cleaning Wipes, أسود, الجيل الثاني من الذكاء الاصطناعي NQ4,  وايرلس/بلوتوث/واي فاي, او ال اي دي, أسود الجرافيت, 144 هرتز, 4 منافذ اتش دي ام آي, HDR+, 3840 X 2160p, تايزن 2.0, \u200e100\u200e \u200e-\u200e \u200e240\u200e فولت\u200e, \u200eتلفزيون ذكي\u200e, 55 بوصة, \u200e4\u200eK اوليد\u200e, 9 سا

In [2]:
# from requests_html import AsyncHTMLSession
# from utils import getStringBetweenTwoWords
# from Product import ProductDetailDTO

# async def scrape_Jarir_full(url):  
#     session = AsyncHTMLSession()
#     response = await session.get(url)
#     script_target_object=getStringBetweenTwoWords(response.content.decode(), 'window.__INITIAL_STATE__= ','</script>')
#     strat_index=response.content.decode().index('window.__INITIAL_STATE__')
#     end_index=response.content.decode().index('</script>',strat_index)
#     script_target_object=response.content.decode()[strat_index:end_index].replace('window.__INITIAL_STATE__=','')
#     price=getStringBetweenTwoWords(script_target_object, 'final_price_ex_tax:',',')
#     Title=getStringBetweenTwoWords(script_target_object, 'GTM_name:',',')
#     images_string = getStringBetweenTwoWords(script_target_object, 'media_gallery','tsk').split("},{")
#     images=[]
#     for i in range(len(images_string)):
#         images.append("https://ak-asset.jarir.com/akeneo-prod/asset/"+getStringBetweenTwoWords(images_string[i], 'image:"','",lab'))
#     item_data = ProductDetailDTO(name_Global=Title, price=price,images=images,productlink1=url)
#     return item_data

# #Test
# url = 'https://www.jarir.com/sa-en/default-category/msi-clae-a1m-gaming-consoles-and-handheld-631416.html'
# x=await scrape_Jarir_full(url)
# print(x.__dict__)
