In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import requests
from urllib.parse import urljoin


## Web scraping of the website www.pieces-euro.tv

In [2]:
# Open the connection to the website. The timeout keeps the cell from hanging
# forever when the server never answers (requests has no default timeout).
url = 'https://www.pieces-euro.tv/'
r = requests.get(url, timeout=30)
print(r.status_code)

200


### Gets the list of all countries and their respective euro coins

In [3]:
# Parse the landing page
soup = BeautifulSoup(r.text, 'html.parser')
# The element with id 'usermenu' contains the <a> tags we are interested in
usermenu = soup.find(id='usermenu')
# Keep the href of every <a> tag, with "\n" characters removed.
# href=True skips anchors that have no href attribute: for those,
# .get('href') would return None and the .replace call would raise.
links = [anchor['href'].replace('\n', '') for anchor in usermenu.find_all('a', href=True)]
print(links)

['https://www.pieces-euro.tv/allemagne/pieces-euro-berlin-2023', 'https://www.pieces-euro.tv/andorre/pieces-euro-2022', 'https://www.pieces-euro.tv/autriche/pieces-euro-2023', 'https://www.pieces-euro.tv/belgique/pieces-euro-2022', 'https://www.pieces-euro.tv/chypre/pieces-euro-2022', 'https://www.pieces-euro.tv/croatie/pieces-euro-2023', 'https://www.pieces-euro.tv/espagne/pieces-euro-2023', 'https://www.pieces-euro.tv/estonie/pieces-euro-2022', 'https://www.pieces-euro.tv/finlande/pieces-euro-2023', 'https://www.pieces-euro.tv/france/pieces-euro-2023', 'https://www.pieces-euro.tv/grece/pieces-euro-2022', 'https://www.pieces-euro.tv/irlande/pieces-euro-2022', 'https://www.pieces-euro.tv/italie/pieces-euro-2023', 'https://www.pieces-euro.tv/lettonie/pieces-euro-2022', 'https://www.pieces-euro.tv/lituanie/pieces-euro-2023', 'https://www.pieces-euro.tv/luxembourg/pieces-euro-2023', 'https://www.pieces-euro.tv/malte/pieces-euro-2022', 'https://www.pieces-euro.tv/monaco/pieces-euro-2022', 

### Iterates over the list of countries and gets the urls of coins by years

In [4]:
# One sub-list of page URLs per entry of `links`
full_list = []

for url in links:
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    menuyeartree = soup.find_all(class_='menuyeartree')
    menutree = soup.find_all(class_='menutree')
    menuactive = soup.find_all(class_='navi_dropdown-content') # get 2023 coins

    href = []
    # Set once `href` has been stored, so the year check below cannot store
    # the very same sub-list a second time.
    already_added = False

    if menuyeartree:
        href = [link.get('href') for menu in menuyeartree for link in menu.find_all('a')]
    elif menutree:
        href = [link.get('href') for menu in menutree for link in menu.find_all('a')]
    elif menuactive:
        href = [link.get('href') for menu in menuactive for link in menu.find_all('a')]

        # Resolve the menuactive hrefs against the page URL
        href = [urljoin(url, link) for link in href]

        # Add href to full_list
        full_list.append(href)
        already_added = True

    if href:
        # The year is expected to be the last '-'-separated part of the URL
        try:
            year = int(url.split('-')[-1])
        except ValueError:
            print(f"Invalid URL format: {url}")
            continue

        if year >= 2001 and not already_added:
            full_list.append(href)
    else:
        print('No href found')

# print(full_list)


Invalid URL format: https://www.pieces-euro.tv/euro-starter-kit
Invalid URL format: https://www.pieces-euro.tv/pieces-2-euros/2023


### Once the list of URLs is obtained, iterate over it and get the info of every coin (price, title, tirage, picture_url)

In [5]:
def _extract_item_info(element, base_url):
    """Build the info dictionary of a single `cat_itembox` element.

    Args:
        element: Parsed `cat_itembox` element to read from.
        base_url (str): URL the picture path is resolved against.

    Returns:
        dict | None: Dictionary with the keys 'price', 'title', 'tirage' and
        'picture_url', or None when the price, title or tirage element is
        missing.
    """
    price_element = element.find('p', class_='cat_preis')
    title_element = element.find('p', class_='cat_titel')
    tirage_element = element.find('div', class_='cat_info')

    if not (price_element and title_element and tirage_element):
        return None

    picture_div = element.find('div', class_='cat_preuser')
    if picture_div:
        img = picture_div.find('img')
        if img and 'data-original' in img.attrs:
            # The picture path is read from the 'data-original' attribute
            picture_url = urljoin(base_url, img['data-original'])
        else:
            picture_url = 'No picture found'
    else:
        picture_url = 'No picture_div found'

    return {
        'price': price_element.text.strip(),
        'title': title_element.text.strip(),
        'tirage': tirage_element.text.strip(),
        'picture_url': picture_url
    }


def scrape_items(full_list: list, base_url: str = 'https://www.pieces-euro.tv/', timeout: float = 30):
    """Function to scrape the items from the website

    Every URL is fetched and each of its `cat_itembox` elements is turned
    into a dictionary. A page that cannot be fetched is reported and skipped,
    so one failing request does not discard the items already collected.

    Args:
        full_list (list): Lists containing the urls to scrape
        base_url (str): URL the picture paths are resolved against
        timeout (float): Seconds to wait for each request before giving up

    Returns:
        list: List of dictionaries containing the information of each item
    """
    results = []

    for sublist in full_list:
        for url in sublist:
            try:
                r = requests.get(url, timeout=timeout)
                r.raise_for_status()
            except requests.exceptions.RequestException as error:
                print(f"Failed to fetch {url}: {error}")
                continue

            soup = BeautifulSoup(r.text, 'html.parser')

            for element in soup.find_all(class_='cat_itembox'):
                item_info = _extract_item_info(element, base_url)
                if item_info is not None:
                    results.append(item_info)
    return results

### Script to extract data

In [6]:
# The page URLs were already collected into `full_list` by the crawl cell
# above. Crawling the country pages a second time here would only append the
# same sub-lists to `full_list` again, so every page would be scraped (and
# every item stored) more than once.
# Flatten `full_list` and drop repeated URLs while keeping first-seen order.
# A list display is used instead of list(...) because other cells of this
# notebook rebind the name `list`.
unique_urls = [*dict.fromkeys(page_url for url_group in full_list for page_url in url_group)]

scraped_items = scrape_items([unique_urls])

In [7]:
# Sanity check: list every collected URL that contains "2023"
urls_with_2023 = [
    page_url
    for url_group in full_list
    for page_url in url_group
    if "2023" in page_url
]
print(urls_with_2023)

['https://www.pieces-euro.tv/allemagne/pieces-euro-berlin-2023', 'https://www.pieces-euro.tv/autriche/pieces-euro-2023', 'https://www.pieces-euro.tv/croatie/pieces-euro-2023', 'https://www.pieces-euro.tv/espagne/pieces-euro-2023', 'https://www.pieces-euro.tv/finlande/pieces-euro-2023', 'https://www.pieces-euro.tv/france/pieces-euro-2023', 'https://www.pieces-euro.tv/italie/pieces-euro-2023', 'https://www.pieces-euro.tv/lituanie/pieces-euro-2023', 'https://www.pieces-euro.tv/luxembourg/pieces-euro-2023', 'https://www.pieces-euro.tv/pays-bas/pieces-euro-2023', 'https://www.pieces-euro.tv/portugal/pieces-euro-2023', 'https://www.pieces-euro.tv/saint-marin/pieces-euro-2023', 'https://www.pieces-euro.tv/slovaquie/pieces-euro-2023', 'https://www.pieces-euro.tv/vatican/pieces-euro-2023', 'https://www.pieces-euro.tv/pieces-2-euros/2023', 'https://www.pieces-euro.tv/allemagne/pieces-euro-berlin-2023', 'https://www.pieces-euro.tv/autriche/pieces-euro-2023', 'https://www.pieces-euro.tv/croatie/pi

In [8]:
# Put the scraped items in a DataFrame
results_df = pd.DataFrame(scraped_items)

# Save the results in a parquet file for later use
results_df.to_parquet('results.parquet')

# Preview the first rows. Only the last expression of a cell is displayed,
# so the preview has to come after the save.
results_df.head()

In [20]:
# Gather the 'picture_url' value of every scraped item into one list
picture_urls = [scraped['picture_url'] for scraped in scraped_items]


### Function to download coins pictures locally

In [11]:
def download_images(picture_urls, folder_name):
    """
    Downloads images from a list of picture URLs and saves them to a specified folder.

    The folder is created when it does not exist. Entries that are not
    http(s) URLs (empty values, placeholders such as 'No picture found') are
    skipped with a message. Each image is saved under the last path segment
    of its URL, so two URLs ending in the same file name overwrite each other.
    A request that fails is reported and the loop moves on to the next URL.

    Args:
        picture_urls (list): A list of picture URLs.
        folder_name (str): The name of the folder to save the images in.

    Returns:
        None
    """
    os.makedirs(folder_name, exist_ok=True)

    for picture_url in picture_urls:
        # Anything that is not an http(s) URL can never be fetched, so it is
        # rejected up front instead of relying on a length threshold.
        if not isinstance(picture_url, str) or not picture_url.lower().startswith(('http://', 'https://')):
            print("Invalid URL. Skipping to the next one.")
            continue

        filename = picture_url.split('/')[-1]
        if not filename:
            # The URL ends with '/', so there is no file name to save under
            print("Failed to download the image:", picture_url)
            continue

        try:
            response = requests.get(picture_url, timeout=30)
        except requests.exceptions.RequestException:
            # Covers connection errors, timeouts and malformed URLs alike
            print("Failed to download the image:", picture_url)
            continue

        if response.status_code == 200:
            file_path = os.path.join(folder_name, filename)
            with open(file_path, 'wb') as file:
                file.write(response.content)

            #print("Image downloaded and saved:", file_path)
        else:
            print("Failed to download the image:", picture_url)

# Fetch every scraped coin picture into the local 'images' folder
download_images(picture_urls=picture_urls, folder_name='images')

Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.


## Similar web scraping process for valuable 2 euro coins on the same website

* The code loops through each element in the HTML with the class `cat_itembox`, extracts information about the coins, and stores it in a dictionary. 
It appends each dictionary to a list, and then creates a pandas dataframe from the list. 

* The resulting dataframe contains information about the price, title, tirage, and picture URL of each valuable 2 euro coin.

In [12]:
url_valuables = 'https://www.pieces-euro.tv/pieces-2-euros-valorisees'
r = requests.get(url_valuables, timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')

# One dictionary per coin offer
two_euro_items = []

for element in soup.find_all(class_='cat_itembox'):
    tirage = element.find('div', class_='cat_info')
    tirage_text = tirage.text.strip() if tirage else 'No tirage found'

    # Start from the placeholder for every item box, so a box without a
    # picture never inherits the URL found for the previous box.
    picture_url = 'No picture found'
    picture_div = element.find('div', class_='cat_preuser')
    img = picture_div.find('img') if picture_div else None
    if img and 'data-original' in img.attrs:
        candidate_url = urljoin('https://www.pieces-euro.tv/', img['data-original'])
        # A URL ending in "noimage.png" is treated as "no picture"
        if not candidate_url.endswith('noimage.png'):
            picture_url = candidate_url

    for cat_catbox in element.find_all(class_='cat_catbox'):
        # Keep only the part of the price text that precedes the first comma
        price_tag = cat_catbox.find('p', class_='cat_preis')
        price = price_tag.get_text().split(',')[0] if price_tag else 'No price found'

        title = cat_catbox.find('p', class_='cat_titel')
        title = title.text.strip() if title else 'No title found'

        # Store the information in a dictionary
        two_euro_items.append({
            'price': price,
            'title': title,
            'tirage': tirage_text,
            'picture_url': picture_url,
        })

# Create a pandas dataframe from the results list
two_euros_df = pd.DataFrame(two_euro_items)

# NOTE(review): `list` shadows the builtin. The alias is kept only so that
# cells still reading the name `list` keep working; drop it once none does.
list = two_euro_items

two_euros_df.head()

Unnamed: 0,price,title,tirage,picture_url
0,4179,Monaco 2 Euro commémorative 2007 - 25e anniver...,Tirage: 20.001 BU dans coffret original | On ...,https://www.pieces-euro.tv/img05/thumb/Monaco-...
1,3837,Monaco 2 Euro commémorative 2015 - 800e annive...,Tirage: 10.000 | BE dans coffret original,https://www.pieces-euro.tv/img01/thumb/Monaco-...
2,2076,Lituanie 2 Euro - UNESCO - Réserve biosphériqu...,Tirage: 500 (approx.) | BU en coincard | Au l...,https://www.pieces-euro.tv/img02/thumb/Lituani...
3,1483,Luxembourg Série 2 Euro commémoratives 2008 - ...,Tirage: 2.500 BE | contient les 6 x 2 Euro co...,https://www.pieces-euro.tv/img03/thumb/Luxembo...
4,1176,Pays-Bas 2 Euro commémorative 2015 - 30 ans du...,Tirage: 1.000 | contient 4 x 2 euro commémorat...,https://www.pieces-euro.tv/img05/thumb/Pays-Ba...


In [13]:
# Save the valuable 2 euro coins dataframe to a parquet file for later use

two_euros_df.to_parquet('two_euros_df.parquet')


In [14]:
# Read the picture URLs back from the dataframe rather than from the
# module-level name `list`: that name shadows the builtin and is rebound to an
# empty list by a later cell, so it is unreliable when cells are re-run.
picture_urls = [row['picture_url'] for row in two_euros_df.to_dict('records')]
download_images(picture_urls, 'images')

Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to the next one.
Invalid URL. Skipping to 

### Similar process for another data source: https://www.florinus.lt/numismatics/euro-coins/?sort_by=time_created
TODO: Use Selenium to scrape the website

In [15]:
# Work in progress: fetch the florinus.lt euro coins page and parse it.
# NOTE(review): `url_valuables`, `r` and `soup` are rebound here and no longer
# refer to the pieces-euro.tv page used by the earlier cells.
url_valuables ='https://www.florinus.lt/numismatics/euro-coins'
r = requests.get(url_valuables)
soup = BeautifulSoup(r.text, 'html.parser')

# NOTE(review): `list` shadows the builtin and nothing in the active code of
# this cell fills or reads it — consider renaming or removing it.
list = []

# Look up the element carrying the class 'list_products'

main_dic = soup.find(class_='list_products')
# 

# find all the class name 'name' inside the class 'list_products_slide active'

#name = show.find(class_='name')

#show = show.find_all(class_='name')
    
    #names = element.find('a', class_='name')
    #name = names.text.strip() if names else 'No name found'
    # store name in a list
    

In [16]:
# Display the result of the 'list_products' lookup to inspect what was parsed

main_dic

In [17]:
#     if picture_div:
#         img = picture_div.find('img')
#         if img:
#             picture_url = img['data-original']
#             base = 'https://www.pieces-euro.tv'
#             picture_url = base + picture_url
#         # check if the picture_url is valid and does not finish by "noimage.png"
#             if picture_url.endswith('noimage.png'):
#                 picture_url = 'No picture found'
#         else:
#             picture_url = 'No picture found'


    
#     for cat_catbox in element.find_all(class_='cat_catbox'):
#         # get the elements inside price
#         price_tag = cat_catbox.find('p', class_='cat_preis')
#         price = price_tag.get_text().split(',')[0]
        
#         title = cat_catbox.find('p', class_='cat_titel')
#         title = title.text.strip() if title else 'No title found'
        
        
#         # Extract the text from the elements
#         tirage_text = tirage.text.strip() if tirage else 'No tirage found'

        
#         # Store the information in a dictionary
#         item_info = {
#             'price': price,
#             'title': title,
#             'tirage': tirage_text,
#             'picture_url': picture_url,
#         }
        
#         list.append(item_info)

# # Create a pandas dataframe from the results list
# two_euros_df = pd.DataFrame(list)
# two_euros_df.head()