# Lab Exercise 1. Scraping Static Websites


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This is the warmup task for the first laboratory exercise. It consists of scraping static Websites with BeautifulSoap.

 It should be completed at home and presented at the laboratory.

**Total points: 2**

### Task Description

Scrape the information about the products on the following page:
https://clevershop.mk/product-category/mobilni-laptopi-i-tableti/

For each product scrape:


*   Product title (selector `'.wd-entities-title'`)
*   Product regular price (selector `'.woocommerce-Price-amount'`)
*   Product discount price (if available), same selector as regular price
*   URL to the product page
*   Add to cart button URL

***Help: There are multiple product pages, for each page you need to send a separate request***


Save the results as a DataFrame object

You can add as many code cells as you need.

________________________________________________________________

### Requirements

Import libraries and modules that you are going to use

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

### Send HTTP request to the target Website

In [3]:
url = "https://clevershop.mk/product-category/mobilni-laptopi-i-tableti/"
response = requests.get(url)

check the response status code

In [4]:
if response.status_code == 200:
    print("It is working")
else:
    print("Not working try again code:",response.status_code)

It is working


### Parse the response content with BeautifulSoap

In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

### Extract data from the BeautifulSoap object using any selectors, attribute identifiers, etc.

* Product title (selector '.wd-entities-title')
* Product regular price (selector '.woocommerce-Price-amount')
* Product discount price (if available), same selector as regular price
* URL to the product page
* Add to cart button URL

In [6]:
titles = []
regular_prices = []
discount_prices = []
product_urls = []
add_to_cart_urls = []

Repeat the extraction process for each page of products

In [7]:
products = soup.select(' .site-content')

for product in products:
    title = product.select_one(' .wd-entities-title')
    titles.append(title)

    price_elements = product.select(' .woocommerce-Price-amount')
    regular_price = price_elements[0].text.strip() if price_elements else None
    discount_price = price_elements[1].text.strip() if len(price_elements) > 1 else None
    regular_prices.append(regular_price)
    discount_prices.append(discount_price)

    product_url = product.select_one('.wd-entities-title a')['href']
    product_urls.append(product_url)

    add_to_cart_button = product.select_one('.add_to_cart_button')
    add_to_cart_url = add_to_cart_button['href'] if add_to_cart_button else None
    add_to_cart_urls.append(add_to_cart_url)


In [8]:
 print("Title:", title)
 print("Regular Price:", regular_price)
 print("Discount Price:", discount_price)
 print("Product URL:", product_url)
 print("Add to Cart URL:", add_to_cart_url)

Title: <h3 class="wd-entities-title"><a href="https://clevershop.mk/product/acer-a315-23-a7kd/">Acer A315-23-A7KD</a></h3>
Regular Price: 17.590 ден
Discount Price: 27.490 ден
Product URL: https://clevershop.mk/product/acer-a315-23-a7kd/
Add to Cart URL: ?add-to-cart=21494


In [None]:
page_number = 1
while True:
    page_url = f"https://clevershop.mk/product-category/mobilni-laptopi-i-tableti/page/{page_number}/"
    response = requests.get(page_url)

    if response.status_code != 200:
        print("No more pages found.")
        break

    soup = BeautifulSoup(response.content, 'html.parser')

    products = soup.select('.site-content')
    for product in products:
        title = product.select_one('.wd-entities-title').text.strip()
        titles.append(title)

        price_elements = product.select('.woocommerce-Price-amount')
        if price_elements:
            regular_price = price_elements[0].text.strip()
            regular_prices.append(regular_price)
            discount_price = price_elements[1].text.strip() if len(price_elements) > 1 else None
            discount_prices.append(discount_price)
        else:
            regular_prices.append(None)
            discount_prices.append(None)

        product_url = product.select_one('.wd-entities-title a')['href']
        product_urls.append(product_url)

        add_to_cart_button = product.select_one('.add_to_cart_button')
        add_to_cart_url = add_to_cart_button['href'] if add_to_cart_button else None
        add_to_cart_urls.append(add_to_cart_url)

    page_number += 1
    time.sleep(2)


No more pages found.


In [None]:
 print("Ttiles:",len(titles))
 print("Regular Prices:", len(regular_prices))
 print("Discount Price:", len(discount_prices))
 print("Product URL:", len(product_urls))
 print("Add to Cart URL:", len(add_to_cart_urls))

Ttiles: 30
Regular Prices: 29
Discount Price: 29
Product URL: 29
Add to Cart URL: 29


### Create a pandas DataFrame with the scraped products

In [None]:

missing_data_titles= []
for i in range(len(titles)):
    try:
        regular_price = regular_prices[i]
    except IndexError:
        regular_price = None

    try:
        product_url = product_urls[i]
    except IndexError:
        product_url = None

    if regular_price is None or product_url is None:
        missing_data_titles.append(titles[i])
        print(f"Missing data for Title: '{titles[i]}' | Regular Price: {regular_price} | Product URL: {product_url}")

if missing_data_titles:
    print("\nTitles with Missing Data:")
    for title in missing_data_titles:
        print(title)
else:
    print("No missing data found for any titles.")

Missing data for Title: 'MON 27 LG 27GN850-B QHD IPS 1MS 144HZ HDMI' | Regular Price: None | Product URL: None

Titles with Missing Data:
MON 27 LG 27GN850-B QHD IPS 1MS 144HZ HDMI


In [None]:
valid_titles = []
valid_regular_prices = []
valid_discount_prices = []
valid_product_urls = []
valid_add_to_cart_urls = []

for i in range(len(titles)):
    if (i < len(regular_prices) and regular_prices[i] is not None) and \
       (i < len(product_urls) and product_urls[i] is not None):

        valid_titles.append(titles[i])
        valid_regular_prices.append(regular_prices[i])
        valid_discount_prices.append(discount_prices[i] if i < len(discount_prices) else None)
        valid_product_urls.append(product_urls[i])
        valid_add_to_cart_urls.append(add_to_cart_urls[i] if i < len(add_to_cart_urls) else None)


data = {
    "Product Title": valid_titles,
    "Regular Price": valid_regular_prices,
    "Discount Price": valid_discount_prices,
    "Product URL" : valid_product_urls,
    "Add to Cart URL": valid_add_to_cart_urls
}

df = pd.DataFrame(data)
print(df.head())

                   Product Title Regular Price Discount Price  \
0          [[Acer A315-23-A7KD]]    17.590 ден     27.490 ден   
1          [[Acer A315-23-A7KD]]    17.590 ден     27.490 ден   
2              Acer A315-23-A7KD    66.990 ден     73.990 ден   
3   Acer Nitro 5 Gaming AN515-45    49.990 ден     39.990 ден   
4  Apple iPhone 12 128GB 4GB RAM    16.590 ден     20.490 ден   

                                         Product URL     Add to Cart URL  
0   https://clevershop.mk/product/acer-a315-23-a7kd/  ?add-to-cart=21494  
1   https://clevershop.mk/product/acer-a315-23-a7kd/  ?add-to-cart=21494  
2  https://clevershop.mk/product/acer-nitro-5-gam...  ?add-to-cart=21410  
3  https://clevershop.mk/product/apple-iphone-12-...  ?add-to-cart=20023  
4  https://clevershop.mk/product/dell-238-p2421d-...  ?add-to-cart=17338  


Save the dataframe as `.csv`

In [None]:
df.to_csv("clevershop.csv", index=False)
print("It is DONE FINALLY")

It is DONE FINALLY
