In [38]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

## Scraping Product URLs From Pages

In [96]:
no_of_pages = 107  # Set the number of pages you want to scrape
base_url = "https://www.farfetch.com/pk/shopping/women/jackets-1/items.aspx?page="

product_urls = []


# Iterate over each page
for page in range(1, no_of_pages + 1):
    url = f"{base_url}{page}"
    try:
        print(f"Scraping page {page}...")
        # # Load the page
        # driver = webdriver.Chrome()
        # driver.set_page_load_timeout(8)  # Set page load timeout to 8 seconds

        # try:
        #     driver.get(url)
        # except TimeoutException:
        #     pass  # Handle timeout exception, page will not load fully

        # # Get the page source after it's loaded
        # html = driver.page_source

        # # Quit the driver after scraping
        # driver.quit()

        # Parse the HTML using BeautifulSoup
        # soup = BeautifulSoup(html, 'html.parser')
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0'})
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all product links
        lis = soup.find_all('li', attrs={'data-testid': 'productCard'})

        # Append the product URLs to the list
        urls_count = 0
        for li in lis:
            try:
                product_urls.append(li.find('a')['href'])
                urls_count += 1
            except:
                pass
        print(f"Found {urls_count} product URLs on page {page}")
    except Exception as e:
        print(f"An error occurred while scraping page {page}: {e}")

# Create a DataFrame of the product URLs
products_df = pd.DataFrame(product_urls, columns=['Product Links'])
print(f"Total Products Found: {len(products_df)}")

# Saving product links to csv
products_df.to_csv('farfetch_women_jackets_product_pages_urls.csv', index=False)
products_df = pd.read_csv('farfetch_women_jackets_product_pages_urls.csv')

Scraping page 1...
Found 11 product URLs on page 1
Scraping page 2...
Found 12 product URLs on page 2
Scraping page 3...
Found 12 product URLs on page 3
Scraping page 4...
Found 12 product URLs on page 4
Scraping page 5...
Found 12 product URLs on page 5
Scraping page 6...
Found 12 product URLs on page 6
Scraping page 7...
Found 12 product URLs on page 7
Scraping page 8...
Found 12 product URLs on page 8
Scraping page 9...
Found 12 product URLs on page 9
Scraping page 10...
Found 12 product URLs on page 10
Scraping page 11...
Found 12 product URLs on page 11
Scraping page 12...
Found 12 product URLs on page 12
Scraping page 13...
Found 12 product URLs on page 13
Scraping page 14...
Found 12 product URLs on page 14
Scraping page 15...
Found 12 product URLs on page 15
Scraping page 16...
Found 12 product URLs on page 16
Scraping page 17...
Found 12 product URLs on page 17
Scraping page 18...
Found 12 product URLs on page 18
Scraping page 19...
Found 12 product URLs on page 19
Scraping pa

In [97]:
products_df.head()

Unnamed: 0,Product Links
0,/pk/shopping/women/dries-van-noten-cropped-woo...
1,/pk/shopping/women/brunello-cucinelli-sequin-e...
2,/pk/shopping/women/versace-double-breasted-twe...
3,/pk/shopping/women/prada-single-breasted-pinst...
4,/pk/shopping/women/moncler-lampusa-denim-hoode...


In [98]:
products_df.nunique()

Product Links    971
dtype: int64

## Scraping Images From Each Product

In [20]:
proxies_df = pd.read_csv('proxies.csv')
proxies_df.head()

Unnamed: 0,IP,Port,Protocol
0,184.168.121.153,57421.0,SOCKS5
1,200.143.99.122,3128.0,HTTP
2,125.16.181.179,9988.0,HTTP
3,125.16.181.180,9988.0,HTTP
4,129.153.42.81,3128.0,HTTP


In [22]:
proxiess = {
   'http': 'http://184.168.121.153:57421',
   'https': 'http://200.143.99.122:3128',
}

In [16]:
proxies_dict = {}
for index, row in proxies_df.iterrows():
    if row['Protocol '] == 'HTTP ':
        proxies_dict['http'] = f'http://{row["IP "]}:{int(row["Port "])}'
        proxies_dict['https'] = f'http://{row["IP "]}:{int(row["Port "])}'
    elif row['Protocol '] == 'SOCKS5 ':
        proxies_dict['http'] = f'socks5://{row["IP "]}:{int(row["Port "])}'
        proxies_dict['https'] = f'socks5://{row["IP "]}:{int(row["Port "])}'

print(proxies_dict)


{'http': 'http://72.10.160.92 :1451', 'https': 'http://72.10.160.92 :1451'}


In [23]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

products_df = pd.read_csv('farfetch_women_jackets_product_pages_urls.csv')

images_urls = []

for i, url in enumerate(list(products_df['Product Links'][:10])):
    full_url = f"https://www.farfetch.com{url}"
    print(full_url)
    response = requests.get(full_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; Storebot-Google/1.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Safari/537.36'}, proxies=proxiess)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup.prettify())
    divs = soup.find_all('div', class_='ltr-bjn8wh ed0fyxo0')
    j = 0
    for div in divs:
        try:
            images_urls.append([i+1, j+1, div.find('img')['src']])
            j += 1
            print(f"Fetched product no# {i+1} image no# {j+1} url.")
        except Exception as e:
            print(f"An error occurred while fetching product no# {i+1} image no# {j+1} url: {e}")
    divs = None

images_df = pd.DataFrame(images_urls, columns=['Product', 'Image No#', 'Product URL'])
images_df.to_csv('farfetch_women_jackets_product_images_urls.csv', index=False)

images_df.head(20)

https://www.farfetch.com/pk/shopping/women/dries-van-noten-cropped-wool-blend-bomber-jacket-item-22924009.aspx?storeid=15268


ProxyError: HTTPSConnectionPool(host='www.farfetch.com', port=443): Max retries exceeded with url: /pk/shopping/women/dries-van-noten-cropped-wool-blend-bomber-jacket-item-22924009.aspx?storeid=15268 (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))

In [92]:
#download images
import requests

with open("image.jpg", "wb") as f:
    f.write(requests.get(images[0]).content)

In [43]:
# urlhm = "https://www2.hm.com/en_us/productpage.1130141010.html"
# response = requests.get(urlhm, headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0'})
# soup = BeautifulSoup(response.text, 'html.parser')
# find everything that has class pdp-image
imgs = soup.find_all('figure')
imgs

[<figure class="pdp-image product-detail-images product-detail-main-image"><div class="product-detail-main-image-container">
 <img alt="Relaxed Fit Denim Jacket - Dark gray denim - Men | H&amp;M US" height="1152" sizes="(max-width: 767px) 100vw, 50vw" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F40%2F83%2F40831c5db0080e1c83b46cdc71c8f9156681ce90.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BLOOKBOOK%5D%2Cres%5Bm%5D%2Chmver%5B1%5D&amp;call=url[file:/product/main]" srcset="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F40%2F83%2F40831c5db0080e1c83b46cdc71c8f9156681ce90.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BLOOKBOOK%5D%2Cres%5Bm%5D%2Chmver%5B1%5D&amp;call=url[file:/product/main] 396w,
 		//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F40%2F83%2F40831c5db0080e1c83b46cdc71c8f9156681ce90.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BLOOKBOOK%5D%2Cres%5Bm%5D%2Chmver%5B1%5D&amp;call=url[file:/product/main] 564w, 
         //lp2.hm.c