# Scraping tests

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Send a browser-like User-Agent with the request.
# NOTE(review): presumably the site rejects the default python-requests agent — confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
url = "https://www.pascalcoste-shopping.com/esthetique/fond-de-teint.html"
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on HTTP errors so the cells below never parse an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')


In [3]:
# Text of the first divider grid on the page, with surrounding whitespace removed.
brand = (
    soup.find(name='div', class_='uk-grid uk-grid-small small-label uk-grid-divider uk-flex-center')
    .get_text()
    .strip()
)

In [4]:
# Echo the scraped value; note that this selector also captures the size label.
brand

'Parisax Pro 30 ml'

In [5]:
# Title of the first product heading found on the page.
name = (
    soup.find(name='h3', class_='product-name uk-margin-top')
    .get_text()
    .strip()
)


In [6]:
# Drop the euro sign and switch the decimal comma to a dot before converting.
price = float(
    soup.find(name='span', class_='uk-price')
    .get_text()
    .replace('€', '')
    .strip()
    .replace(',', '.')
)

In [7]:
# The cover link of the first product holds the product page URL in its href.
prod_url = soup.find(
    name='a',
    class_='uk-position-cover uk-cover-link-product',
)['href']



In [8]:
# Product thumbnails are <img class="product-image-photo"> elements and expose
# their URL through `src`; the cover-link class and `href` belong to the <a> tag,
# which is why the previous lookup returned None.
img_url = soup.find('img', class_='product-image-photo')['src']

In [None]:
# Alternative brand lookup: keep the tag itself (not its text) for inspection.
brand = soup.find(
    name='div',
    class_='uk-width-expand uk-first-column',
)

In [None]:
# Show the stripped title of the first product heading as the cell output.
(
    soup.find(name='h3', class_='product-name uk-margin-top')
    .get_text()
    .strip()
)

In [None]:
# Collect every <div> matching the uk-grid class and report how many were found.
product_elements = soup.find_all(name='div', class_='uk-grid')
print(
    f"Found {len(product_elements)} product(s) on the page."
)

# SCRAPING FUNCTION

In [9]:
import requests
from bs4 import BeautifulSoup

def _parse_price(text):
    """Turn a price label such as '18,40 €' into a float (18.4)."""
    # str.split() with no argument removes every kind of whitespace, including
    # non-breaking spaces that may appear inside a formatted price.
    digits = ''.join(text.replace('€', '').split())
    return float(digits.replace(',', '.'))


def _image_url(img_element):
    """Return the best available URL for a product <img> tag, or None."""
    if img_element is None:
        return None
    # The `src` of these images is a 1x1 base64 placeholder (lazy loading).
    # NOTE(review): presumably the real URL lives in a data-* attribute —
    # verify the attribute name against the page markup. Falls back to `src`.
    for attr in ('data-src', 'data-original', 'data-lazy', 'src'):
        value = img_element.get(attr)
        if value and not value.startswith('data:'):
            return value
    return img_element.get('src')


def scrape_product_info(url, timeout=10, max_pages=None):
    """Scrape every product listed under a paginated category URL.

    Pages are requested with the query parameter ``p=1``, ``p=2``, ... until a
    page contains no product cards, a request fails, or ``max_pages`` is hit.

    Args:
        url: Category listing URL (without the ``p`` page parameter).
        timeout: Seconds to wait for each HTTP response.
        max_pages: Optional upper bound on the number of pages fetched;
            ``None`` keeps paginating until an empty page is returned.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``brand``,
        ``prod_url`` and ``img_url``. ``brand``, ``prod_url`` and ``img_url``
        are ``None`` when the corresponding element is missing from a card.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # Define user-agent headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }
    products = []
    page = 1
    while max_pages is None or page <= max_pages:
        # Let requests build the query string so URLs that already carry
        # parameters are still handled correctly.
        response = requests.get(url, params={'p': page}, headers=headers, timeout=timeout)
        if not response.ok:
            print(f"Request for page {page} failed with HTTP status {response.status_code}.")
            break

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # A missing container is treated like an empty page instead of crashing.
        container = soup.find("div", id='uk-product-list-container')
        if container is None:
            product_elements = []
        else:
            product_elements = container.find_all("div", class_="uk-panel uk-position-relative")

        print(f"Found {len(product_elements)} product(s) on the page.")
        if not product_elements:
            break

        for product in product_elements:
            name_element = product.find('h3', class_='product-name uk-margin-top')
            price_element = product.find('span', class_='uk-price')
            if name_element is None or price_element is None:
                print("Could not find name or price element for a product.")
                continue

            name = name_element.text.strip()
            try:
                price = _parse_price(price_element.text)
            except ValueError:
                print(f"Could not parse price {price_element.text!r} for product {name!r}.")
                continue

            # Optional fields: look each one up defensively so one incomplete
            # card cannot abort the whole crawl.
            brand_wrapper = product.find("div", class_="uk-position-relative")
            brand_element = (
                brand_wrapper.find("div", class_="uk-width-expand")
                if brand_wrapper is not None else None
            )
            link_element = product.find('a', class_='uk-position-cover uk-cover-link-product')
            # Search inside the card (not the whole page) so each product gets
            # its own image rather than the first image of the page.
            img_element = product.find('img', class_='product-image-photo')

            products.append({
                'name': name,
                'price': round(price, 2),
                'brand': brand_element.text.strip() if brand_element is not None else None,
                'prod_url': link_element.get('href') if link_element is not None else None,
                'img_url': _image_url(img_element),
            })
        page += 1

    print(len(products))
    return products





## Execution

In [10]:
# Category listing to crawl; the scraper adds the page parameter itself.
url = 'https://www.pascalcoste-shopping.com/esthetique/fond-de-teint.html'
products_info = scrape_product_info(url=url)
print(products_info)

Found 36 product(s) on the page.
Found 36 product(s) on the page.
Found 36 product(s) on the page.
Found 4 product(s) on the page.
Found 0 product(s) on the page.
112
[{'name': 'Fond de teint Matifiant Amande', 'price': 18.4, 'brand': 'Parisax Pro', 'prod_url': 'https://www.pascalcoste-shopping.com/fond-de-teint-matifiant-amande.html', 'img_url': 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP6zwAAAgcBApocMXEAAAAASUVORK5CYII='}, {'name': 'Fond de teint Matifiant Beige Naturel', 'price': 18.4, 'brand': 'Parisax Pro', 'prod_url': 'https://www.pascalcoste-shopping.com/fond-de-teint-matifiant-beige-naturel.html', 'img_url': 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP6zwAAAgcBApocMXEAAAAASUVORK5CYII='}, {'name': 'Fond de Teint Correcteur Fluide 10.5 Toleriane Maquillage La Roche Posay', 'price': 21.5, 'brand': 'La Roche-Posay', 'prod_url': 'https://www.pascalcoste-shopping.com/fond-de-teint-correcteur-fluide-10-5-tole

In [12]:
# Function to save extracted product information to a JSON file
import json
def save_to_json(data, filename):
    """Serialise ``data`` as 4-space-indented JSON into ``filename``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, mode='w') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

if __name__ == "__main__":
    # Persist the scraped products so the database cell can load them later.
    save_to_json(data=products_info, filename='products.json')

## Database connection

In [1]:
import sqlite3
import json

# Function to create database tables
def create_tables(db_path='products.db'):
    """Create the ``products`` table if it does not exist yet.

    Args:
        db_path: Path of the SQLite database file; created when missing.
            Defaults to ``'products.db'``.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if the statement
        # raises; it does not close the connection, hence the finally below.
        with conn:
            # Create products table
            conn.execute('''
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY,
            name TEXT,
            price REAL,
            brand TEXT,
            imageUrl TEXT,
            productUrl TEXT
        )
    ''')
    finally:
        # Always release the file handle, even when table creation fails.
        conn.close()

In [4]:
# Function to insert product data into the database
def insert_data(products, db_path='products.db'):
    """Insert scraped product records into the ``products`` table.

    All rows are written in a single transaction: either every record is
    stored or, if anything fails, none are.

    Args:
        products: Iterable of dicts with the keys ``name``, ``price``,
            ``brand``, ``img_url`` and ``prod_url``.
        db_path: Path of the SQLite database file. Defaults to
            ``'products.db'``.

    Raises:
        KeyError: If a record lacks one of the expected keys.
        sqlite3.Error: If the insert fails (e.g. the table does not exist).
    """
    # Build the rows first so a malformed record fails before the database
    # is opened.
    rows = [
        (item['name'], item['price'], item['brand'], item['img_url'], item['prod_url'])
        for item in products
    ]

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on error.
        with conn:
            conn.executemany('''
            INSERT INTO products (name, price, brand, imageUrl, productUrl)
            VALUES (?, ?, ?, ?, ?)
        ''', rows)
    finally:
        # Close explicitly; the connection context manager does not do it.
        conn.close()

if __name__ == "__main__":
    # Make sure the schema exists, then load the JSON export and store it.
    create_tables()
    with open('products.json') as f:
        products = json.load(fp=f)
    insert_data(products=products)


## DOCKER
