In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from selenium import webdriver
import time
import os
import requests

## Skroutz furniture

In [None]:
# Landing page of the Skroutz living-room furniture category
main_link = 'https://www.skroutz.gr/c/1114/living_room.html?o=furniture'

# Load the page in a Selenium-driven Chrome session
driver = webdriver.Chrome()
try:
    driver.get(main_link)

    # Get the page source from Selenium
    page_source = driver.page_source
finally:
    # Always close the browser, even when loading the page fails, so that
    # no orphaned Chrome process is left behind
    driver.quit()

# Parse the page source using Beautiful Soup
soup = BeautifulSoup(page_source, "html.parser")

In [None]:
# Anchors whose href carries the image-driven sub-category marker; each one
# points at a different kind of living-room furniture
a_elements = soup.find_all(
    'a',
    href=lambda target: target and ".html?from=image_driven_subcats&o=furniture" in target,
)

# Prefix every href with the site origin so the links can be visited later
sub_links = ['https://www.skroutz.gr' + anchor['href'] for anchor in a_elements]

In [None]:
# Display the collected sub-category URLs to check them before crawling
sub_links

## Loop over each sub-link

In [None]:
# Build the list of every paginated listing URL, one entry per page of each
# sub-category. The final scraping loop iterates over this list and opens
# each URL with the webdriver to get its source.
link_list = []
for i in sub_links:
    main_link = i
    # Category slug = file name of the last path segment without extension,
    # e.g. '.../kanapedes.html?...' -> 'kanapedes'
    category = i.split('/')[-1].split('.')[0]

    # Load the first page of the sub-category in its own browser session
    driver = webdriver.Chrome()
    try:
        driver.get(main_link)

        # Get the page source from Selenium
        page_source = driver.page_source
    finally:
        # Close the browser even if the page failed to load
        driver.quit()

    # Parse the page source using Beautiful Soup
    soup = BeautifulSoup(page_source, "html.parser")

    # Pagination anchors of this category: their href contains the page number
    a_elements = soup.find_all('a', href=lambda href: href and f"{category}.html?o=furniture&page=" in href)

    # Read the value of the `page` parameter from each non-empty anchor.
    # Cutting at the next '&' keeps this working when further query
    # parameters follow the page number; non-numeric values are ignored.
    page_values = (
        a['href'].split('&page=', 1)[1].split('&')[0]
        for a in a_elements
        if len(a) > 0
    )
    text_list = [int(value) for value in page_values if value.isdigit()]

    # A category without pagination links consists of a single page
    max_pages = max(text_list, default=1)
    print(max_pages)

    # Replicate the category link once per page
    for j in range(1, max_pages + 1):
        link_list.append(main_link + '&page=' + str(j))

In [None]:
def get_data(x):
    """Extract the product name and image URL from each product card.

    Parameters
    ----------
    x : iterable
        Parsed product-card elements. Each element must offer a
        BeautifulSoup-style ``find`` method.

    Returns
    -------
    pandas.DataFrame
        One row per card with the columns ``product_name``,
        ``product_image`` and ``category``. A value that cannot be
        extracted is stored as ``np.nan``. An empty input gives an empty
        DataFrame.

    Notes
    -----
    ``category`` is not a parameter: it is read from the module-level
    variable of that name, which must be set before the function is called
    with a non-empty input.
    """
    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all rows every time.
    records = []

    for i in x:
        try:
            product_name = i.find('a', class_='js-sku-link')['title']
        except (AttributeError, KeyError, TypeError):
            # No matching link was found, or it has no title attribute
            product_name = np.nan

        img = np.nan
        # The image is looked up through its alt text, which has to equal
        # the product name, so without a name there is nothing to search for
        if isinstance(product_name, str):
            try:
                img = 'https:' + i.find('img', {'alt': product_name})['src']
            except (AttributeError, KeyError, TypeError):
                # No matching image was found, or it has no usable src
                img = np.nan

        records.append({
            'product_name': product_name,
            'product_image': img,
            'category': category
        })

    return pd.DataFrame(records)

In [None]:
# Scrape every paginated listing page. The per-page frames are collected in a
# list and concatenated once at the end, instead of re-copying the growing
# result on every iteration.
page_frames = []
scraped_rows = 0
for i in tqdm(link_list):
    main_link = i
    # get_data reads this module-level name to label the rows it returns
    category = i.split('/')[-1].split('.')[0]

    # Load the listing page in its own browser session
    driver = webdriver.Chrome()
    try:
        driver.get(main_link)

        # Get the page source from Selenium
        page_source = driver.page_source
    finally:
        # Close the browser even if the page failed to load
        driver.quit()

    # Parse the page source using Beautiful Soup
    soup = BeautifulSoup(page_source, "html.parser")

    # Each product card of the listing is one row element
    results = soup.select('li.cf.card')

    temp = get_data(results)
    page_frames.append(temp)

    # Running total of scraped rows, printed after every page
    scraped_rows += temp.shape[0]
    print(scraped_rows)

# pd.concat raises on an empty list, so fall back to an empty frame
furns = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

In [None]:
# Category slug as stored in the scraped data -> English category label
cat_map = dict([
    ('living_room_small_tables', 'small_table'),
    ('kanapedes', 'sofa'),
    ('vivlothikes', 'bookcase'),
    ('polithrones', 'armchair'),
    ('epiplo-tv', 'tv-set'),
    ('Suntheta-Saloniou', 'composites'),
    ('Vitrines-Saloniou', 'showcases'),
    ('living_room_sets', 'living_room_sets'),
    ('anaklintra', 'recliners'),
])

In [None]:
# Translate the category slugs into English labels. A value without an entry
# in cat_map is kept as it is instead of turning into NaN, so running this
# cell again on already translated labels does not wipe the column.
furns['category'] = furns['category'].map(lambda slug: cat_map.get(slug, slug))

In [None]:
# Write the de-duplicated products, re-indexed from zero, to furns.csv
# without the index column
(
    furns
    .drop_duplicates()
    .reset_index(drop=True)
    .to_csv('furns.csv', index=False)
)

## Download the images

In [None]:
# Define the target number of images per category
target_per_category = 6000

# Only the categories listed here are downloaded. Each counter holds the
# number of images saved so far and is also used as the next file index.
category_counters = {
    "sofa": 0,
    "small_table": 0
}

# Load the scraped catalogue from the CSV written earlier in the notebook, so
# this cell does not rely on a variable left over from a previous session
c = pd.read_csv('furns.csv')

# Loop through the dataset
for image_url, category in c[['product_image', 'category']].itertuples(index=False, name=None):
    # Skip every category that is not "sofa" or "small_table"
    if category not in category_counters:
        continue

    # Check if the category counter has reached the target
    if category_counters[category] >= target_per_category:
        continue

    # Products whose image could not be scraped have NaN instead of a URL
    if not isinstance(image_url, str):
        continue

    # Download the image. The timeout prevents an unresponsive server from
    # blocking the loop forever, and raise_for_status keeps error pages from
    # being saved as image files.
    try:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {image_url}: {exc}")
        continue

    # Create a folder for the category if it doesn't exist
    category_folder = os.path.join("images", category)
    os.makedirs(category_folder, exist_ok=True)

    # Build the filename from the category and its running index
    filename = f"{category}_{category_counters[category]}.jpeg"

    # Save the image to the category folder
    image_path = os.path.join(category_folder, filename)
    with open(image_path, "wb") as f:
        f.write(response.content)

    # Increment the category counter
    category_counters[category] += 1

    print(f"Image downloaded: {image_path}")
    # Pause for a random 0-4 seconds between downloads
    time.sleep(np.random.randint(0,5))

    # Check if the target has been reached for all categories
    if all(count >= target_per_category for count in category_counters.values()):
        break