In [None]:
pip install bs4

In [5]:
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd

def scrape_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    products = soup.find_all('div', class_='GRID_ITEM')
    page_results = []

    for product in products:
        # Extracting item name and link
        product_title_div = product.find('div', class_='product-title')
        if product_title_div and product_title_div.find('a'):
            name = product_title_div.find('a').get_text(strip=True)
            link = 'https://www.varle.lt' + product_title_div.find('a')['href']
        else:
            name, link = None, None

        # Extracting storage space in TB
        storage_space = re.search(r'(\d+)\s*TB', name)
        storage_space = storage_space.group(1) if storage_space else None

        # Extracting price
        price_div = product.find('div', class_='price-value')
        if price_div:
            price_main = price_div.find('span').get_text(strip=True) if price_div.find('span') else ''
            price_decimal = price_div.find('sup').get_text(strip=True) if price_div.find('sup') else ''
            price = price_main + price_decimal
        else:
            price = None

        page_results.append({
            'Item Name': name,
            'Direct Link': link,
            'Storage Space (TB)': storage_space,
            'Price': price
        })

    return page_results

def scrape_varle_store(base_url, pages):
    results = []
    for page in range(1, pages + 1):
        # Constructing the URL for each page
        url = f'{base_url}&p={page}'
        results.extend(scrape_page(url))

        # Delay for 1 second before scraping the next page
        if page < pages:
            time.sleep(1)

    return results

# Base URL (without the page parameter)
base_url = 'https://www.varle.lt/isoriniai-kietieji-diskai-hdd/?f.s-bendra_disko_talpa=4%20TB&f.s-bendra_disko_talpa=8%20TB&f.s-bendra_disko_talpa=5%20TB&f.s-bendra_disko_talpa=18%20TB&f.s-bendra_disko_talpa=12%20TB&f.s-bendra_disko_talpa=14%20TB&f.s-bendra_disko_talpa=16%20TB&f.s-bendra_disko_talpa=20%20TB&f.s-bendra_disko_talpa=3%20TB&f.s-bendra_disko_talpa=6%20TB&f.s-bendra_disko_talpa=10%20TB&f.s-bendra_disko_talpa=36%20TB&sort=-discount_sum'

# Number of pages to scrape
num_pages = 3


# Scraping the data
data = scrape_varle_store(base_url, num_pages)

# Creating DataFrame
df = pd.DataFrame(data)

# Convert 'Price' and 'Storage Space (TB)' to numeric for calculation
df['Price'] = df['Price'].replace(r'[^\d.]', '', regex=True).astype(float, errors='ignore')
df['Storage Space (TB)'] = pd.to_numeric(df['Storage Space (TB)'], errors='coerce')

# Calculate 'Price per TB'
df['Price per TB'] = df['Price'] / df['Storage Space (TB)']

# Remove entries with NaN or Infinite values in 'Price per TB'
df = df[df['Price per TB'].notna() & (df['Price per TB'] != float('inf'))]

# Sorting by 'Price per TB'
df_sorted = df.sort_values(by='Price per TB')

# Print the sorted DataFrame
print(df_sorted)

KeyError: 'Price'