In [None]:
import requests
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd

In [76]:
page_url = "https://www.carrefouregypt.com/mafegy/en/c/FEGY1720200?currentPage="
get_cvs_for_all_product_in_page(page_url)

Found 60 products
Processing product 1/60
✓ Done - Temmys Cereal Choco Pillow - 450 gm
Processing product 2/60
✓ Done - Kellogg&#39;s Plain Corn Flakes - 500 gram
Processing product 3/60
✓ Done - Nesquik Cereal Chocolate - 330 gm
Processing product 4/60
✓ Done - Temmy&#39;s Corn Flakes box - 500 grams
Processing product 5/60
✓ Done - Temmy&#39;s Choco Pops - 500 grams
Processing product 6/60
✓ Done - Temmy&#39;S Cereal Corn Flakes 1K
Processing product 7/60
✓ Done - Kelloggs Frosties - 470 gm
Processing product 8/60
✓ Done - Temmy&#39;s Choco Scoops Cereal box - 250 grams
Processing product 9/60
✓ Done - Fitness Honey and Almond Cereal - 355 Gram
Processing product 10/60
✓ Done - Nestle Fitness Chocolate Cereal Bar - 23.5 gram - 6 Pieces
Processing product 11/60
✓ Done - Temmy&#39;s Choco Pillow Cereal box - 300 gram
Processing product 12/60
✓ Done - Nestle Fitness Strawberry Breakfast Cereal Bar - 23.5 gram - 6 Pieces
Processing product 13/60
✓ Done - Temmy&#39;s Sweet Flakes - 500 gr

OSError: [Errno 22] Invalid argument: 'products.csv'

In [70]:
def get_cvs_for_all_product_in_page(page_url):
    all_products = []
    products_links = get_products_links_form_page(page_url)

    print(f"Found {len(products_links)} products")

    # Loop through all products
    for i, product_link in enumerate(products_links, 1):
        print(f"Processing product {i}/{len(products_links)}")
        
        try:
            product_data = extract_product_data(product_link)
            
            if product_data:
                # Add product data to the list (no need to modify images for now)
                all_products.append(product_data)
                print(f"✓ Done - {product_data.get('title', 'Undefined')}")
            else:
                print("✗ Failed")
                
        except Exception as e:
            print(f"Error: {e}")
            continue

    # Create DataFrame
    df = pd.DataFrame(all_products)

    # Save to CSV
    df.to_csv('products.csv', index=False, encoding='utf-8-sig')

    print(f"\nDone! {len(df)} products saved to products.csv")

In [75]:
def get_products_links_form_page(page_url):
    service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=service)

    browser.get(page_url)
    for _ in range(16):
        browser.execute_script("window.scrollBy(0, 500)")
        time.sleep(2)


    product_list = browser.find_elements("class name", "css-lzsise")
    html = product_list[0].get_attribute('outerHTML')
    soup = BeautifulSoup(html, 'html.parser')

    links = [a['href'] for a in soup.find_all('a') if a.has_attr('href')]
    links = links[::2]
    links = ["https://www.carrefouregypt.com" + link for link in links]
    return links

In [58]:
def extract_product_data(product_link):

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br'
    }

    response = requests.get(product_link, headers=HEADERS)
    soup = BeautifulSoup(response.content, 'html.parser')

    script_tag = soup.find("script", id="__NEXT_DATA__")
    data = json.loads(script_tag.string)
    
    try:
        product = data['props']['initialProps']['pageProps']['initialData']['products'][0]
        
        # Basic product information
        id = product.get('id', None)
        title = product.get('title', None)
        url = f"{"https://www.carrefouregypt.com/mafegy/en"}{product.get('url', '')}" if product.get('url') else None
        
        # Attributes
        attributes = product.get('attributes', {})
        productType = attributes.get('productType', None)
        brandName = attributes.get('brandName', None)
        ingredients = attributes.get('ingredients', None)
        size = attributes.get('size', None)
        name_ar = attributes.get('name_ar', None)
        description = attributes.get('description', None)
        marketingText = attributes.get('marketingText', None)
        ean = attributes.get('ean', None)
        productCategoriesHearchi = attributes.get('productCategoriesHearchi', None)
        storeId = attributes.get('storeId', None)
        productTypeDM51 = attributes.get('productTypeDM51', None)
        department = attributes.get('department', None)
        
        # Pricing information
        offers = product.get('offers', [])
        if offers and len(offers) > 0 and 'stores' in offers[0] and len(offers[0]['stores']) > 0:
            store = offers[0]['stores'][0]
            price_info = store.get('price', {})
            
            price_after_discount = price_info.get('value', None)
            
            original_info = price_info.get('original', {})
            price_before_discount = original_info.get('value', None)
            
            discount_info = price_info.get('discount', {})
            if discount_info and 'information' in discount_info and 'amount' in discount_info['information']:
                discount_percentage = discount_info['information']['amount'] / 100
            else:
                discount_percentage = None
        else:
            price_after_discount = None
            price_before_discount = None
            discount_percentage = None
        
        # Images - استخراج كل الصور
        media = product.get('media', [])
        image_urls = []
        if media:
            for item in media:
                if item.get('url'):
                    image_urls.append(item['url'])
        
        # إذا لم توجد صور، اجعل القائمة فارغة
        if not image_urls:
            image_urls = None
        
        # Availability and stock
        availability = product.get('availability', {})
        isAvailable = availability.get('isAvailable', None)
        stockLevel = availability.get('stockLevel', None)
        
        # Additional product details
        maxToOrder = attributes.get('maxToOrder', None)
        minOrderQuantity = attributes.get('minOrderQuantity', None)
        genuineStock = attributes.get('genuineStock', None)
        freeInstallation = attributes.get('freeInstallation', None)
        soldByWeight = attributes.get('soldByWeight', None)
        
        # Physical dimensions
        width = attributes.get('width', None)
        height = attributes.get('height', None)
        depth = attributes.get('depth', None)
        weight = attributes.get('weight', None)
        
        # Status
        status = attributes.get('status', None)
        
        return {
            'id': id,
            'title': title,
            'url': url,
            'productType': productType,
            'brandName': brandName,
            'ingredients': ingredients,
            'size': size,
            'name_ar': name_ar,
            'description': description,
            'marketingText': marketingText,
            'ean': ean,
            'productCategoriesHearchi': productCategoriesHearchi,
            'storeId': storeId,
            'productTypeDM51': productTypeDM51,
            'department': department,
            'price_after_discount': price_after_discount,
            'price_before_discount': price_before_discount,
            'discount_percentage': discount_percentage,
            'image_urls': image_urls,  # قائمة بكل الصور
            'isAvailable': isAvailable,
            'stockLevel': stockLevel,
            'maxToOrder': maxToOrder,
            'minOrderQuantity': minOrderQuantity,
            'genuineStock': genuineStock,
            'freeInstallation': freeInstallation,
            'soldByWeight': soldByWeight,
            'width': width,
            'height': height,
            'depth': depth,
            'weight': weight,
            'status': status
        }
    
    except Exception as e:
        print(f"Error extracting data: {e}")
        return None

# Paths :
"""
Basic Information:
- id = data['props']['initialProps']['pageProps']['initialData']['products'][0]['id']
- title = data['props']['initialProps']['pageProps']['initialData']['products'][0]['title']
- url = data['props']['initialProps']['pageProps']['initialData']['products'][0]['url']

Attributes:
- productType = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['productType']
- brandName = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['brandName']
- ingredients = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['ingredients']
- size = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['size']
- name_ar = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['name_ar']
- description = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['description']
- marketingText = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['marketingText']
- ean = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['ean']
- productCategoriesHearchi = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['productCategoriesHearchi']
- storeId = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['storeId']
- productTypeDM51 = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['productTypeDM51']
- department = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['department']
- maxToOrder = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['maxToOrder']
- minOrderQuantity = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['minOrderQuantity']
- genuineStock = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['genuineStock']
- freeInstallation = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['freeInstallation']
- soldByWeight = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['soldByWeight']
- width = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['width']
- height = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['height']
- depth = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['depth']
- weight = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['weight']
- status = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['status']

Pricing:
- price_after_discount = data['props']['initialProps']['pageProps']['initialData']['products'][0]['offers'][0]['stores'][0]['price']['value']
- price_before_discount = data['props']['initialProps']['pageProps']['initialData']['products'][0]['offers'][0]['stores'][0]['price']['original']['value']
- discount_percentage = data['props']['initialProps']['pageProps']['initialData']['products'][0]['offers'][0]['stores'][0]['price']['discount']['information']['amount']

Images (All images):
- image_urls = [item['url'] for item in data['props']['initialProps']['pageProps']['initialData']['products'][0]['media'] if 'url' in item]

Availability:
- isAvailable = data['props']['initialProps']['pageProps']['initialData']['products'][0]['availability']['isAvailable']
- stockLevel = data['props']['initialProps']['pageProps']['initialData']['products'][0]['availability']['stockLevel']
"""


"\nBasic Information:\n- id = data['props']['initialProps']['pageProps']['initialData']['products'][0]['id']\n- title = data['props']['initialProps']['pageProps']['initialData']['products'][0]['title']\n- url = data['props']['initialProps']['pageProps']['initialData']['products'][0]['url']\n\nAttributes:\n- productType = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['productType']\n- brandName = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['brandName']\n- ingredients = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['ingredients']\n- size = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['size']\n- name_ar = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['name_ar']\n- description = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['description']\n- mar

In [51]:
# Set the maximum number of columns to display
pd.options.display.max_columns = None

# Set the maximum number of rows to display
pd.options.display.max_rows = 50

df

Unnamed: 0,id,title,url,productType,brandName,ingredients,size,name_ar,description,marketingText,ean,productCategoriesHearchi,storeId,productTypeDM51,department,price_after_discount,price_before_discount,discount_percentage,image_urls,isAvailable,stockLevel,maxToOrder,minOrderQuantity,genuineStock,freeInstallation,soldByWeight,width,height,depth,weight,status
0,622319,تيميز حبوب شوكولاتة للافطار - 450 جرام,https://www.carrefouregypt.com/mafegy/en/corn-...,FOOD,Temmy's,,,تيميز حبوب شوكولاتة للافطار - 450 جرام,<ul><li>وجبة متوازنة مثالية</li><li>فطار يصلح ...,<ul><li>وجبة متوازنة مثالية</li><li>فطار يصلح ...,6221012003504,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,120.95,141.80,0.15,[https://cdn.mafrservices.com/sys-master-root/...,,,10,0,False,False,False,0.07,0.30,0.20,0,APPROVED
1,19321,كيلوجز كورن فليكس سادة - 500 جرام,https://www.carrefouregypt.com/mafegy/en/corn-...,FOOD,Kellogg's,الذرة ، السكر ، نكهة الشعير الشعير ، فيتامينات...,,كيلوجز كورن فليكس سادة - 500 جرام,<p>ابدأ يومك مع حبوب الإفطار الرائعة والمفعمة ...,<ul><li>وجبة متوازنة مثالية</li><li>مناسبة لوج...,4003994111901,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,246.95,246.95,,[https://cdn.mafrservices.com/sys-master-root/...,,,8,0,False,False,False,0.05,0.28,0.20,0,APPROVED
2,492792,نسكويك حبوب الشوكولاتة - 330 جم,https://www.carrefouregypt.com/mafegy/en/corn-...,FOOD,Nesquik,يحتوي على الحليب وفول الصويا والقمح,,نسكويك حبوب الشوكولاتة - 330 جم,<p>تم صنع حبوب الإفطار من نسكويك باستخدام الحب...,<ul><li>للحصول على وعاء لذيذ من الحبوب</li><li...,5900020030719,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,115.25,115.25,,[https://cdn.mafrservices.com/pim-content/EGY/...,,,6,0,False,False,False,0.05,0.22,0.15,0,APPROVED
3,93247,كورن فليكس من تيميز - 500جم,https://www.carrefouregypt.com/mafegy/en/corn-...,FOOD,Temmy's,السكر ، الحليب ، سميد الذرة ، ذرة الحبوب الكاملة.,,كورن فليكس من تيميز - 500جم,<p>إن رقائق الذرة الذرة من تيميز غنية بالفيتام...,<ul><li>يساعد محتواه من الكالسيوم على تقوية ال...,6221012505084,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,95.95,111.20,0.14,[https://cdn.mafrservices.com/sys-master-root/...,,,6,0,False,False,False,0.07,0.30,0.20,0,APPROVED
4,538291,تيميز شوكو بوبس - 500 جرام,https://www.carrefouregypt.com/mafegy/en/corn-...,FOOD,Temmy's,دقيق ذرة، سكر، دقيق الأرز، مسحوق الكاكاو، دقيق...,,تيميز شوكو بوبس - 500 جرام,<p>تعتبر هذه الرقائق مغذية.</p><p>كل قطعة من ه...,<ul> <li> هذه الحبوب متموجة لذيذة من تلقاء نفس...,6221012000213,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,116.85,127.45,0.08,[https://cdn.mafrservices.com/sys-master-root/...,,,6,0,False,False,False,0.07,0.30,0.20,0,APPROVED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,494462,جرانولا بار لينو بالتفاح والقرفة - 40 جم,https://www.carrefouregypt.com/mafegy/en/prote...,FOOD,Lino,شراب الذرة ، السكر ، الحليب الخالي من الدسم ، ...,,جرانولا بار لينو بالتفاح والقرفة - 40 جم,<ul><li><p>بار لينو بالقرفة وجرانولا التفاح عب...,<ul><li>املأ وعاء بهذه الجرانولا، واسكب بعض ال...,6224008245131,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,41.75,41.75,,[https://cdn.mafrservices.com/sys-master-root/...,,,6,0,False,False,False,0.13,0.10,0.13,0,APPROVED
56,594860,ذا جود هابيتس بار بروتين بالكابتشينو - 60 جرام,https://www.carrefouregypt.com/mafegy/en/prote...,FOOD,,بروتين واي مركز، مركز بروتين الحليب، السوربيتو...,70 جرام,ذا جود هابيتس بار بروتين بالكابتشينو - 60 جرام,<p>سناكس البروتين الخفيف مكوناته عالية الجودة ...,<ul><li>سناك بروتين صحي</li><li>غني بالألياف</...,6223012520128,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,64.65,64.65,,[https://cdn.mafrservices.com/pim-content/EGY/...,,,10,0,False,False,False,0.02,0.05,0.15,0,APPROVED
57,528594,حبوب إفطار لينو بشوكولاتة بافس والشوفان - 375 جم,https://www.carrefouregypt.com/mafegy/en/corn-...,FOOD,Lino,,,حبوب إفطار لينو بشوكولاتة بافس والشوفان - 375 جم,<ul><li>حبوب الفطور المحلاة مع حبوب الشوفان ال...,<ul><li>احصل على حلاوة الفاكهة ومقرمشة المكسرا...,6224008245681,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,136.00,136.00,,[https://cdn.mafrservices.com/sys-master-root/...,,,6,0,False,False,False,0.06,0.30,0.28,0,APPROVED
58,442952,بروتين بار كاكاومن سانتي، 50 جرام,https://www.carrefouregypt.com/mafegy/en/prote...,FOOD,Sante,,,بروتين بار كاكاومن سانتي، 50 جرام,<p>سانتي جو أون هو بار بروتين لذيذًا مثاليًا أ...,<ul><li>مثالي لأولئك الذين يحبون الرياضة</li><...,5900617013064,Food Cupboard/Breakfast Cereals &amp; Bars/Cer...,mafegy,express,01,66.75,66.75,,[https://cdn.mafrservices.com/sys-master-root/...,,,6,0,False,False,False,0.13,0.10,0.13,0,APPROVED


In [None]:
'''
id = data['props']['initialProps']['pageProps']['initialData']['products'][0]['id']
title = data['props']['initialProps']['pageProps']['initialData']['products'][0]['title']
url = f"{site_url}{data['props']['initialProps']['pageProps']['initialData']['products'][0]['url']}"
productType = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['productType']
brandName = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['brandName']
ingredients = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['ingredients']
size = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['size']
name_ar = data['props']['initialProps']['pageProps']['initialData']['products'][0]['attributes']['name_ar']

price_after_discount = data['props']['initialProps']['pageProps']['initialData']['products'][0]['offers'][0]['stores'][0]['price']['value']
price_before_discount = data['props']['initialProps']['pageProps']['initialData']['products'][0]['offers'][0]['stores'][0]['price']['original']['value']
discount_percentage = data['props']['initialProps']['pageProps']['initialData']['products'][0]['offers'][0]['stores'][0]['price']['discount']['information']['amount']/100
image_url = data['props']['initialProps']['pageProps']['initialData']['products'][0]['media'][0]['url']

print(f"ID: {id}")
print(f"Title: {title}")
print(f"URL: {url}")
print(f"Product Type: {productType}")
print(f"Brand Name: {brandName}")
print(f"Ingredients: {ingredients}")
print(f"Size: {size}")
print(f"Name (Arabic): {name_ar}")
print(f"Price After Discount: {price_after_discount}")
print(f"Price Before Discount: {price_before_discount}")
print(f"Discount Percentage: {discount_percentage}")
print(f"Image URL: {image_url}")
'''





"""
title = soup.find("h1", class_="css-106scfp").text.strip()

# Extract price, discount, old price, and tax info from the soup
price_text = soup.find("h2", class_="css-1i90gmp").text.strip()

# Extract all numbers (prices)
prices = re.findall(r"\d+\.\d+", price_text)
current_price = float(prices[0]) if len(prices) > 0 else None
old_price = float(prices[1]) if len(prices) > 1 else None

# Check if tax info is present
tax_included = "شامل قيمة الضريبة" in price_text

# Extract discount percentage
discount_match = re.search(r"(\d+)%", price_text)
discount = float(discount_match.group(1)) / 100 if discount_match else None

# If discount is None, try to extract from <span class="css-aewju">
if discount is None:
    discount_span = soup.find("span", class_="css-aewju")
    if discount_span:
        discount_match = re.search(r"(\d+)%", discount_span.text)
        discount = float(discount_match.group(1)) / 100 if discount_match else None

print(title)
print(current_price)
print(old_price)
print(tax_included)
print(discount)
"""
