In [1]:
# libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import re

In [2]:
# Listing page of the .22 LR ammunition category; result pages are addressed with ?PAGE=<n>
base_url = 'https://www.naturabuy.fr/Munitions-Balles-22LR-cat-884.html'
page_number = 1

# Scrape limits and network settings, grouped here so they are easy to tune
MAX_PAGES = 2          # number of listing pages to walk
ITEMS_PER_PAGE = 5     # number of item cards followed on each listing page
REQUEST_TIMEOUT = 10   # seconds to wait before giving up on a request
REQUEST_DELAY = 1      # seconds to pause after each product page
MISSING = 'N/A'        # placeholder stored when a field cannot be scraped

# CSS selectors used on the product page
MANUFACTURER_SELECTOR = "html:-soup-contains('Marque :') body div#contall div#body_container div#body_container_in div#PAGE div#Columns div#mainProduct div#productWrapper div#blocGallery div#productCriteres div.critere div.criterevalue"
DESCRIPTION_SELECTOR = 'div#contall div#body_container div#body_container_in div#PAGE div#Columns div#Description'


def _scrape_field(extract):
    """Call ``extract`` and return its result, or 'N/A' when the lookup fails.

    When an element is absent BeautifulSoup returns None, so the chained
    attribute access inside ``extract`` raises; that is treated as "field not
    present". Only ``Exception`` is caught (not a bare ``except``), so
    KeyboardInterrupt still stops the cell.
    """
    try:
        return extract()
    except Exception:
        return MISSING


# One dict per scraped product is accumulated here
data = []

# Walk the first MAX_PAGES listing pages
while page_number <= MAX_PAGES:

    # Build the URL of the current listing page and fetch it
    url = base_url + f'?PAGE={page_number}'
    listing_response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page as an empty listing
    listing_response.raise_for_status()
    listing_soup = BeautifulSoup(listing_response.content, 'html.parser')

    # Every product on the listing is an <a class="itemcard"> element
    cards = listing_soup.find_all('a', class_='itemcard')

    for card in cards[:ITEMS_PER_PAGE]:

        # Skip cards that carry no link rather than raising KeyError
        href = card.get('href')
        if not href:
            continue
        product_url = 'https://www.naturabuy.fr/' + href.lstrip('/')

        # Fetch and parse the product page; distinct names keep the listing page objects intact
        product_response = requests.get(product_url, timeout=REQUEST_TIMEOUT)
        product_soup = BeautifulSoup(product_response.content, 'html.parser')

        # Product name comes from the <title> tag
        product_name = _scrape_field(lambda: product_soup.find('title').text.strip())

        # Manufacturer: drop the "Marque :" label if it is part of the value, then trim again
        manufacturer = _scrape_field(
            lambda: product_soup.select_one(MANUFACTURER_SELECTOR).text.strip().replace("Marque :", "").strip()
        )

        # Condition text of the item (new or used)
        item_is_new = _scrape_field(
            lambda: product_soup.find('span', id='availabilityCondition').text.strip()
        )

        # Price, kept as raw text here
        price = _scrape_field(lambda: product_soup.find('div', id='priceContainer').text.strip())

        # Shipping cost: first <b> inside the shippings container
        shipping_cost = _scrape_field(
            lambda: product_soup.find('div', id='shippingsContainer').find('b').text.strip()
        )

        # Description, with newlines and non-breaking spaces flattened to plain spaces
        product_description = _scrape_field(
            lambda: product_soup.select_one(DESCRIPTION_SELECTOR).text.strip().replace('\n', ' ').replace('\xa0', ' ')
        )

        # Add the scraped data to the list
        data.append({
            'product_name': product_name,
            'product_link': product_url,
            'manufacturer': manufacturer,
            'is_new': item_is_new,
            'price': price,
            'shipping_cost': shipping_cost,
            'product_description': product_description
        })

        # Wait for a short time to avoid getting blocked
        time.sleep(REQUEST_DELAY)

    # Move on to the next listing page
    page_number += 1

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,product_name,product_link,manufacturer,is_new,price,shipping_cost,product_description
0,Tir avec Cartouches 22LR ELEY SPORT boite de...,https://www.naturabuy.fr/Tir-avec-Cartouches-2...,Eley,Neuf,"12,90 €","4,95 €",Partager Vendre le même ...
1,Cartouches 22LR ELEY SPORT boite de 500 - Mu...,https://www.naturabuy.fr/Cartouches-22LR-ELEY-...,Eley,Neuf,"115,50 €","13,75 €",Partager Vendre le même ...
2,Cartouches 22LR ELEY SPORT boite de 50 - Mun...,https://www.naturabuy.fr/Cartouches-22LR-ELEY-...,Eley,Neuf,"12,90 €","4,95 €",Partager Vendre le même ...
3,Tirez avec Boites de 500 Cartouches 22LR ELEY ...,https://www.naturabuy.fr/Tir-avec-Boites-500-C...,Eley,Neuf,"115,50 €","13,95 €",Partager Vendre le même ...
4,Cartouches 22LR ELEY MATCH boite de 50 - Mun...,https://www.naturabuy.fr/Magnifique-HORLOGE-mo...,Eley,Neuf,"18,30 €","4,95 €",Partager Vendre le même ...
5,Munitions RWS R50 Ultra - Cal. 22 LR - 2.6 g /...,https://www.naturabuy.fr/Munitions-RWS-Cal-22-...,RWS,Neuf,"25,70 €","12,90 €",Partager Vendre le même ...
6,Cartouches RWS Biathlon Compétition - Cal. 22 ...,https://www.naturabuy.fr/Cartouches-RWS-Cal-22...,RWS,Neuf,"28,00 €","9,90 €",Partager Vendre le même ...
7,Cartouches RWS Biathlon Special Match - Cal. 2...,https://www.naturabuy.fr/Cartouches-RWS-Cal-22...,RWS,Neuf,"16,80 €","9,90 €",Partager Vendre le même ...
8,Cartouches RWS Green - Cal. 22 LR HV - 1.6 g /...,https://www.naturabuy.fr/Cartouches-RWS-Green-...,RWS,Neuf,"13,40 €","9,90 €",Partager Vendre le même ...
9,Cartouches RWS Green Fragm - Cal. 22 LR HV - 1...,https://www.naturabuy.fr/Cartouches-RWS-Green-...,RWS,Neuf,"14,50 €","9,90 €",Partager Vendre le même ...


In [None]:
# Inspect the link stored for the first scraped row
df.loc[0, 'product_link']

In [None]:
# Inspect the description text stored for the first scraped row
df.loc[0, 'product_description']

In [None]:
# Column dtypes as they are before inference
dtypes_before = df.dtypes
print(dtypes_before)

# Let pandas pick a more specific dtype for object columns where it can
df = df.infer_objects()

# Column dtypes after inference, for comparison with the first printout
dtypes_after = df.dtypes
print(dtypes_after)

In [3]:
# Cast the scraped text columns to str so the .str accessors below behave uniformly
for column in ['product_name', 'manufacturer', 'is_new', 'price', 'shipping_cost']:
    df[column] = df[column].astype(str)

# Decimal amount such as "12.90". Raw string, so \d and \. reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.
AMOUNT_PATTERN = r'(\d+\.\d+)'

# Prices use a decimal comma ("12,90 €"): switch to a decimal point, pull out the
# number and convert to float. Text without a decimal amount becomes NaN.
df['price'] = (
    df['price']
    .str.replace(',', '.', regex=False)
    .str.extract(AMOUNT_PATTERN, expand=False)
    .astype(float)
)

# Same parsing for shipping, except that an unparseable value counts as 0
df['shipping_cost'] = (
    df['shipping_cost']
    .str.replace(',', '.', regex=False)
    .str.extract(AMOUNT_PATTERN, expand=False)
    .fillna(0)
    .astype(float)
)

df['total_price'] = df['price'] + df['shipping_cost']

# Text that precedes the item's own description on the page (share buttons, shop category)
DESCRIPTION_MARKER = "Flobert > Munitions - Balles 22LR"


def _strip_page_chrome(text):
    """Return the part of ``text`` that follows the shop-category marker.

    The text is returned unchanged when the marker is absent (for example an
    'N/A' description, or a description already cleaned by an earlier run of
    this cell), so the cell can be re-run without raising IndexError.
    """
    if not isinstance(text, str):
        return text
    parts = text.split(DESCRIPTION_MARKER)
    return parts[1].strip() if len(parts) > 1 else text


# remove text from description that doesn't belong to the item itself
df['product_description'] = df['product_description'].apply(_strip_page_chrome)

In [None]:
# df["bullet_qtty"] -> extract with a regex over the product_name column; fall back to product_description
# df["price_per_bullet"] = df["total_price"] / df["bullet_qtty"]

In [None]:
# to do
# add scrape target - qtty of rounds. DONE
# cost per shot DONE
# product link DONE

In [7]:
# Inspect the link stored for the row labelled 3
df.loc[3, 'product_link']

'https://www.naturabuy.fr/Tir-avec-Boites-500-Cartouches-22LR-ELEY-CLUB-item-2649734.html'

In [13]:
# Find the advertised round count: a number placed directly before an ammunition keyword
def extract_bullet_qtty(text, box_size=50):
    """Return the round count advertised in ``text``, floored to a multiple of ``box_size``.

    The text is searched, case-insensitively, for an integer immediately
    followed by "cartouches", "balles" or "munitions" (optionally with
    "boites de" in between), e.g. "500 Cartouches" or "250 balles".

    Parameters
    ----------
    text : str
        Product title or description to search.
    box_size : int, default 50
        Smallest box size; the matched number is floored to a multiple of it.

    Returns
    -------
    int or None
        The floored quantity of the first match that is at least ``box_size``,
        or None when ``text`` is not a string or holds no such match.
    """
    # Missing values (None / NaN) carry no quantity
    if not isinstance(text, str):
        return None

    # Any integer is accepted, so sizes such as 150, 200, 250 or 1000 are recognised too
    regex = r"\b(\d+)\s*(?:boites de\s*)?(?:cartouches|balles|munitions)\b"
    for match in re.finditer(regex, text, re.IGNORECASE):
        # floor the matched number to a multiple of the box size
        qtty = (int(match.group(1)) // box_size) * box_size
        # Skip counts below one box (e.g. "0 cartouches" or "22 munitions"): returning 0
        # would cause a division by zero when the per-round price is computed
        if qtty > 0:
            return qtty
    return None

# Round count assumed when neither the title nor the description states one
# (50 is the default minimum number of rounds per box)
DEFAULT_BULLET_QTTY = 50

# First pass: look for a quantity in the product titles
qtty_from_name = pd.to_numeric(df['product_name'].apply(extract_bullet_qtty))
# A zero count is unusable (it would make the per-round price infinite): treat it as missing
qtty_from_name = qtty_from_name.where(qtty_from_name > 0)

# Second pass: the descriptions, used only for rows the title pass left empty
qtty_from_description = pd.to_numeric(df['product_description'].apply(extract_bullet_qtty))
qtty_from_description = qtty_from_description.where(qtty_from_description > 0)

# Title first, then description, then the default box size.
# The result is assigned back to the column: a chained
# df['bullet_qtty'].fillna(..., inplace=True) acts on a temporary object and does
# not update df when pandas Copy-on-Write is active.
df['bullet_qtty'] = (
    qtty_from_name
    .fillna(qtty_from_description)
    .fillna(DEFAULT_BULLET_QTTY)
)

In [14]:
# Show the rows ordered from the largest round count to the smallest
df.sort_values(by='bullet_qtty', ascending=False)

Unnamed: 0,product_name,product_link,manufacturer,is_new,price,shipping_cost,product_description,total_price,bullet_qtty
0,Tir avec Cartouches 22LR ELEY SPORT boite de...,https://www.naturabuy.fr/Tir-avec-Cartouches-2...,Eley,Neuf,12.9,4.95,Marque : EleyEtat de l'objet : NeufType : Stan...,17.85,50.0
1,Cartouches 22LR ELEY SPORT boite de 500 - Mu...,https://www.naturabuy.fr/Cartouches-22LR-ELEY-...,Eley,Neuf,115.5,13.75,Marque : EleyEtat de l'objet : Neuf Car...,129.25,500.0
2,Cartouches 22LR ELEY SPORT boite de 50 - Mun...,https://www.naturabuy.fr/Cartouches-22LR-ELEY-...,Eley,Neuf,12.9,4.95,Marque : EleyEtat de l'objet : Neuf - Ca...,17.85,50.0
3,Tirez avec Boites de 500 Cartouches 22LR ELEY ...,https://www.naturabuy.fr/Tir-avec-Boites-500-C...,Eley,Neuf,115.5,13.95,Marque : EleyEtat de l'objet : NeufType : Stan...,129.45,500.0
4,Cartouches 22LR ELEY MATCH boite de 50 - Mun...,https://www.naturabuy.fr/Magnifique-HORLOGE-mo...,Eley,Neuf,18.3,4.95,Marque : EleyEtat de l'objet : NeufType : Matc...,23.25,50.0
5,Munitions RWS R50 Ultra - Cal. 22 LR - 2.6 g /...,https://www.naturabuy.fr/Munitions-RWS-Cal-22-...,RWS,Neuf,25.7,12.9,Marque : RWSEtat de l'objet : NeufType : Stand...,38.6,50.0
6,Cartouches RWS Biathlon Compétition - Cal. 22 ...,https://www.naturabuy.fr/Cartouches-RWS-Cal-22...,RWS,Neuf,28.0,9.9,Marque : RWSEtat de l'objet : NeufType : Match...,37.9,50.0
7,Cartouches RWS Biathlon Special Match - Cal. 2...,https://www.naturabuy.fr/Cartouches-RWS-Cal-22...,RWS,Neuf,16.8,9.9,Marque : RWSEtat de l'objet : NeufType : Match...,26.7,50.0
8,Cartouches RWS Green - Cal. 22 LR HV - 1.6 g /...,https://www.naturabuy.fr/Cartouches-RWS-Green-...,RWS,Neuf,13.4,9.9,Marque : RWSEtat de l'objet : NeufType : Stand...,23.3,50.0
9,Cartouches RWS Green Fragm - Cal. 22 LR HV - 1...,https://www.naturabuy.fr/Cartouches-RWS-Green-...,RWS,Neuf,14.5,9.9,Marque : RWSEtat de l'objet : NeufType : Stand...,24.4,50.0


In [17]:
# Cost of a single round: total price (item + shipping) divided by the round count
cost_per_round = df["total_price"] / df["bullet_qtty"]
df["price_per_bullet"] = cost_per_round

In [18]:
# Display the whole frame to inspect the price_per_bullet column added in the previous cell
df

Unnamed: 0,product_name,product_link,manufacturer,is_new,price,shipping_cost,product_description,total_price,bullet_qtty,Price_per_bullet
0,Tir avec Cartouches 22LR ELEY SPORT boite de...,https://www.naturabuy.fr/Tir-avec-Cartouches-2...,Eley,Neuf,12.9,4.95,Marque : EleyEtat de l'objet : NeufType : Stan...,17.85,50.0,0.357
1,Cartouches 22LR ELEY SPORT boite de 500 - Mu...,https://www.naturabuy.fr/Cartouches-22LR-ELEY-...,Eley,Neuf,115.5,13.75,Marque : EleyEtat de l'objet : Neuf Car...,129.25,500.0,0.2585
2,Cartouches 22LR ELEY SPORT boite de 50 - Mun...,https://www.naturabuy.fr/Cartouches-22LR-ELEY-...,Eley,Neuf,12.9,4.95,Marque : EleyEtat de l'objet : Neuf - Ca...,17.85,50.0,0.357
3,Tirez avec Boites de 500 Cartouches 22LR ELEY ...,https://www.naturabuy.fr/Tir-avec-Boites-500-C...,Eley,Neuf,115.5,13.95,Marque : EleyEtat de l'objet : NeufType : Stan...,129.45,500.0,0.2589
4,Cartouches 22LR ELEY MATCH boite de 50 - Mun...,https://www.naturabuy.fr/Magnifique-HORLOGE-mo...,Eley,Neuf,18.3,4.95,Marque : EleyEtat de l'objet : NeufType : Matc...,23.25,50.0,0.465
5,Munitions RWS R50 Ultra - Cal. 22 LR - 2.6 g /...,https://www.naturabuy.fr/Munitions-RWS-Cal-22-...,RWS,Neuf,25.7,12.9,Marque : RWSEtat de l'objet : NeufType : Stand...,38.6,50.0,0.772
6,Cartouches RWS Biathlon Compétition - Cal. 22 ...,https://www.naturabuy.fr/Cartouches-RWS-Cal-22...,RWS,Neuf,28.0,9.9,Marque : RWSEtat de l'objet : NeufType : Match...,37.9,50.0,0.758
7,Cartouches RWS Biathlon Special Match - Cal. 2...,https://www.naturabuy.fr/Cartouches-RWS-Cal-22...,RWS,Neuf,16.8,9.9,Marque : RWSEtat de l'objet : NeufType : Match...,26.7,50.0,0.534
8,Cartouches RWS Green - Cal. 22 LR HV - 1.6 g /...,https://www.naturabuy.fr/Cartouches-RWS-Green-...,RWS,Neuf,13.4,9.9,Marque : RWSEtat de l'objet : NeufType : Stand...,23.3,50.0,0.466
9,Cartouches RWS Green Fragm - Cal. 22 LR HV - 1...,https://www.naturabuy.fr/Cartouches-RWS-Green-...,RWS,Neuf,14.5,9.9,Marque : RWSEtat de l'objet : NeufType : Stand...,24.4,50.0,0.488


In [19]:
# Write the enriched table to disk, leaving the row index out of the file
df.to_csv(path_or_buf='naturabuy_scraped_data.csv', index=False)

In [None]:
# to do
# change is_new col data to 0 and 1
# order of cols