In [1]:
# libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import re

from tqdm import tqdm
import time

import sqlite3

In [2]:
# PANDAS OPTIONS
# Display settings: None removes the row/column truncation limit, and the
# large column width keeps long text cells (e.g. descriptions) readable.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Define the URL to scrape
base_url = 'https://www.naturabuy.fr/Munitions-Balles-22LR-cat-884.html'
page_number = 1

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# A listing page with fewer cards than this is treated as the last page.
FULL_PAGE_CARD_COUNT = 60

# CSS selectors for the product page.
MANUFACTURER_SELECTOR = "html:-soup-contains('Marque :') body div#contall div#body_container div#body_container_in div#PAGE div#Columns div#mainProduct div#productWrapper div#blocGallery div#productCriteres div.critere div.criterevalue"
DESCRIPTION_SELECTOR = 'div#contall div#body_container div#body_container_in div#PAGE div#Columns div#Description'


def parse_product_page(product_soup):
    """Extract the raw text fields of one product from its parsed page.

    Parameters
    ----------
    product_soup : BeautifulSoup
        Parsed HTML of a single product page.

    Returns
    -------
    dict
        Keys: product_name, manufacturer, is_new, price, shipping_cost,
        product_description. Every value is a string; a field whose element
        is not present on the page is reported as 'N/A'.
    """
    def text_or_na(element):
        # BeautifulSoup lookups return None when nothing matches.
        return element.text.strip() if element is not None else 'N/A'

    # The brand selector relies on the :-soup-contains pseudo-class; a selector
    # failure is treated as "brand unknown" instead of aborting the scrape.
    try:
        manufacturer_element = product_soup.select_one(MANUFACTURER_SELECTOR)
    except Exception:
        manufacturer_element = None
    if manufacturer_element is not None:
        # Remove the label first, then strip, so no stray whitespace is left.
        manufacturer = manufacturer_element.text.replace("Marque :", "").strip()
    else:
        manufacturer = 'N/A'

    shipping_container = product_soup.find('div', id='shippingsContainer')
    shipping_element = shipping_container.find('b') if shipping_container is not None else None

    # Flatten the description onto one line ('\n' and '\xa0' become spaces).
    product_description = text_or_na(product_soup.select_one(DESCRIPTION_SELECTOR))
    product_description = product_description.replace('\n', ' ').replace('\xa0', ' ')

    return {
        'product_name': text_or_na(product_soup.find('title')),
        'manufacturer': manufacturer,
        'is_new': text_or_na(product_soup.find('span', id='availabilityCondition')),
        'price': text_or_na(product_soup.find('div', id='priceContainer')),
        'shipping_cost': text_or_na(shipping_element),
        'product_description': product_description,
    }


# Connect to the SQLite database
conn = sqlite3.connect('scraped_data.db')
try:
    cursor = conn.cursor()

    # Create a table to store the scraped data.
    # NOTE: the values inserted below are the raw scraped strings (e.g. price
    # text including the currency); numeric conversion happens later in pandas.
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS products (
        product_name TEXT,
        product_link TEXT,
        manufacturer TEXT,
        is_new BOOLEAN,
        price FLOAT,
        shipping_cost FLOAT,
        product_description TEXT
    )
    ''')
    conn.commit()

    # Loop through all pages of the website
    while True:

        # Construct the URL for the current page and download it
        url = base_url + f'?PAGE={page_number}'
        listing_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an error page instead of silently ending the scrape early.
        listing_response.raise_for_status()
        listing_soup = BeautifulSoup(listing_response.content, 'html.parser')

        # Find all the item cards on the page
        cards = listing_soup.find_all('a', class_='itemcard')

        # If no cards are found, break out of the loop
        if not cards:
            break

        # Loop through the item cards and scrape the information
        for card in tqdm(cards):

            href = card.get('href')
            if not href:
                # An anchor without a link cannot be followed.
                continue
            product_url = 'https://www.naturabuy.fr/' + href.lstrip('/')

            # A single unreachable product should not abort the whole run.
            try:
                product_response = requests.get(product_url, timeout=REQUEST_TIMEOUT)
                product_response.raise_for_status()
            except requests.RequestException as error:
                tqdm.write(f'Skipping {product_url}: {error}')
                time.sleep(1)
                continue

            product = parse_product_page(BeautifulSoup(product_response.content, 'html.parser'))

            # Insert the scraped data into the database
            cursor.execute('''
            INSERT INTO products (product_name, product_link, manufacturer, is_new, price, shipping_cost, product_description)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (
                product['product_name'],
                product_url,
                product['manufacturer'],
                product['is_new'],
                product['price'],
                product['shipping_cost'],
                product['product_description'],
            ))
            # Commit per row so an interrupted run keeps everything scraped so far.
            conn.commit()

            # Wait for a short time to avoid getting blocked
            time.sleep(1)

        # The last page holds fewer cards than a full page, which ends the loop
        # without requesting one more (empty) page.
        if len(cards) < FULL_PAGE_CARD_COUNT:
            break

        # Increment the page number
        page_number += 1
finally:
    # Close the database connection even if a request or insert failed
    conn.close()

# Read the data from the database into a pandas DataFrame and save it to a CSV file
conn = sqlite3.connect('scraped_data.db')
try:
    df = pd.read_sql_query('SELECT * FROM products', conn)
finally:
    conn.close()
df.to_csv('scraped_data.csv', index=False)

100%|██████████| 61/61 [01:18<00:00,  1.29s/it]
100%|██████████| 61/61 [01:18<00:00,  1.29s/it]
100%|██████████| 61/61 [01:19<00:00,  1.31s/it]
100%|██████████| 61/61 [01:18<00:00,  1.29s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:20<00:00,  1.31s/it]
100%|██████████| 61/61 [01:20<00:00,  1.31s/it]
100%|██████████| 61/61 [01:19<00:00,  1.31s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:20<00:00,  1.32s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:20<00:00,  1.31s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:19<00:00,  1.31s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 61/61 [01:19<00:00,  1.30s/it]
100%|██████████| 38/38 [00:50<00:00,  1.32s/it]


In [None]:
# Re-export the stored products to CSV without re-running the scrape.
conn = sqlite3.connect('scraped_data.db')
try:
    df = pd.read_sql_query('SELECT * FROM products', conn)
finally:
    # Release the database handle even if the query fails (e.g. missing table).
    conn.close()
df.to_csv('scraped_data.csv', index=False)

In [None]:
# Define the URL to scrape
base_url = 'https://www.naturabuy.fr/Munitions-Balles-22LR-cat-884.html'
page_number = 1

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# CSS selectors for the product page.
MANUFACTURER_SELECTOR = "html:-soup-contains('Marque :') body div#contall div#body_container div#body_container_in div#PAGE div#Columns div#mainProduct div#productWrapper div#blocGallery div#productCriteres div.critere div.criterevalue"
DESCRIPTION_SELECTOR = 'div#contall div#body_container div#body_container_in div#PAGE div#Columns div#Description'


def parse_product_page(product_soup):
    """Extract the raw text fields of one product from its parsed page.

    Parameters
    ----------
    product_soup : BeautifulSoup
        Parsed HTML of a single product page.

    Returns
    -------
    dict
        Keys: product_name, manufacturer, is_new, price, shipping_cost,
        product_description. Every value is a string; a field whose element
        is not present on the page is reported as 'N/A'.
    """
    def text_or_na(element):
        # BeautifulSoup lookups return None when nothing matches.
        return element.text.strip() if element is not None else 'N/A'

    # The brand selector relies on the :-soup-contains pseudo-class; a selector
    # failure is treated as "brand unknown" instead of aborting the scrape.
    try:
        manufacturer_element = product_soup.select_one(MANUFACTURER_SELECTOR)
    except Exception:
        manufacturer_element = None
    if manufacturer_element is not None:
        # Remove the label first, then strip, so no stray whitespace is left.
        manufacturer = manufacturer_element.text.replace("Marque :", "").strip()
    else:
        manufacturer = 'N/A'

    shipping_container = product_soup.find('div', id='shippingsContainer')
    shipping_element = shipping_container.find('b') if shipping_container is not None else None

    # Flatten the description onto one line ('\n' and '\xa0' become spaces).
    product_description = text_or_na(product_soup.select_one(DESCRIPTION_SELECTOR))
    product_description = product_description.replace('\n', ' ').replace('\xa0', ' ')

    return {
        'product_name': text_or_na(product_soup.find('title')),
        'manufacturer': manufacturer,
        'is_new': text_or_na(product_soup.find('span', id='availabilityCondition')),
        'price': text_or_na(product_soup.find('div', id='priceContainer')),
        'shipping_cost': text_or_na(shipping_element),
        'product_description': product_description,
    }


# Create an empty list to store the scraped data
data = []

# Loop through all pages of the website; an empty listing page ends the loop
while True:

    # Construct the URL for the current page and download it
    url = base_url + f'?PAGE={page_number}'
    listing_response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of silently ending the scrape early.
    listing_response.raise_for_status()
    listing_soup = BeautifulSoup(listing_response.content, 'html.parser')

    # Find all the item cards on the page
    cards = listing_soup.find_all('a', class_='itemcard')

    # If no cards are found, break out of the loop
    if not cards:
        break

    # Loop through the item cards and scrape the information
    for card in cards:

        href = card.get('href')
        if not href:
            # An anchor without a link cannot be followed.
            continue
        product_url = 'https://www.naturabuy.fr/' + href.lstrip('/')

        # A single unreachable product should not abort the whole run.
        try:
            product_response = requests.get(product_url, timeout=REQUEST_TIMEOUT)
            product_response.raise_for_status()
        except requests.RequestException as error:
            print(f'Skipping {product_url}: {error}')
            time.sleep(1)
            continue

        product = parse_product_page(BeautifulSoup(product_response.content, 'html.parser'))

        # Add the scraped data to the list (same column order as the SQLite table)
        data.append({
            'product_name': product['product_name'],
            'product_link': product_url,
            'manufacturer': product['manufacturer'],
            'is_new': product['is_new'],
            'price': product['price'],
            'shipping_cost': product['shipping_cost'],
            'product_description': product['product_description']
        })

        # Wait for a short time to avoid getting blocked
        time.sleep(1)

    # Increment the page number
    page_number += 1

# Convert the list of dictionaries to a pandas DataFrame (nothing is written to disk here)
df = pd.DataFrame(data)

#df

In [None]:
# Define the URL to scrape
base_url = 'https://www.naturabuy.fr/Munitions-Balles-22LR-cat-884.html'
page_number = 1

# Size of the quick sample scrape: how many listing pages to visit and how
# many products to follow on each of them.
MAX_PAGES = 2
MAX_CARDS_PER_PAGE = 5

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# CSS selectors for the product page.
MANUFACTURER_SELECTOR = "html:-soup-contains('Marque :') body div#contall div#body_container div#body_container_in div#PAGE div#Columns div#mainProduct div#productWrapper div#blocGallery div#productCriteres div.critere div.criterevalue"
DESCRIPTION_SELECTOR = 'div#contall div#body_container div#body_container_in div#PAGE div#Columns div#Description'


def parse_product_page(product_soup):
    """Extract the raw text fields of one product from its parsed page.

    Parameters
    ----------
    product_soup : BeautifulSoup
        Parsed HTML of a single product page.

    Returns
    -------
    dict
        Keys: product_name, manufacturer, is_new, price, shipping_cost,
        product_description. Every value is a string; a field whose element
        is not present on the page is reported as 'N/A'.
    """
    def text_or_na(element):
        # BeautifulSoup lookups return None when nothing matches.
        return element.text.strip() if element is not None else 'N/A'

    # The brand selector relies on the :-soup-contains pseudo-class; a selector
    # failure is treated as "brand unknown" instead of aborting the scrape.
    try:
        manufacturer_element = product_soup.select_one(MANUFACTURER_SELECTOR)
    except Exception:
        manufacturer_element = None
    if manufacturer_element is not None:
        # Remove the label first, then strip, so no stray whitespace is left.
        manufacturer = manufacturer_element.text.replace("Marque :", "").strip()
    else:
        manufacturer = 'N/A'

    shipping_container = product_soup.find('div', id='shippingsContainer')
    shipping_element = shipping_container.find('b') if shipping_container is not None else None

    # Flatten the description onto one line ('\n' and '\xa0' become spaces).
    product_description = text_or_na(product_soup.select_one(DESCRIPTION_SELECTOR))
    product_description = product_description.replace('\n', ' ').replace('\xa0', ' ')

    return {
        'product_name': text_or_na(product_soup.find('title')),
        'manufacturer': manufacturer,
        'is_new': text_or_na(product_soup.find('span', id='availabilityCondition')),
        'price': text_or_na(product_soup.find('div', id='priceContainer')),
        'shipping_cost': text_or_na(shipping_element),
        'product_description': product_description,
    }


# Create an empty list to store the scraped data
data = []

# Loop through the first MAX_PAGES pages of the website
while page_number <= MAX_PAGES:

    # Construct the URL for the current page and download it
    url = base_url + f'?PAGE={page_number}'
    listing_response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of silently collecting nothing.
    listing_response.raise_for_status()
    listing_soup = BeautifulSoup(listing_response.content, 'html.parser')

    # Find all the item cards on the page
    cards = listing_soup.find_all('a', class_='itemcard')

    # Only follow the first few cards of each page to keep the sample run short
    for card in cards[:MAX_CARDS_PER_PAGE]:

        href = card.get('href')
        if not href:
            # An anchor without a link cannot be followed.
            continue
        product_url = 'https://www.naturabuy.fr/' + href.lstrip('/')

        # A single unreachable product should not abort the whole run.
        try:
            product_response = requests.get(product_url, timeout=REQUEST_TIMEOUT)
            product_response.raise_for_status()
        except requests.RequestException as error:
            print(f'Skipping {product_url}: {error}')
            time.sleep(1)
            continue

        product = parse_product_page(BeautifulSoup(product_response.content, 'html.parser'))

        # Add the scraped data to the list (same column order as the SQLite table)
        data.append({
            'product_name': product['product_name'],
            'product_link': product_url,
            'manufacturer': product['manufacturer'],
            'is_new': product['is_new'],
            'price': product['price'],
            'shipping_cost': product['shipping_cost'],
            'product_description': product['product_description']
        })

        # Wait for a short time to avoid getting blocked
        time.sleep(1)

    # Increment the page number
    page_number += 1

# Convert the list of dictionaries to a pandas DataFrame (nothing is written to disk here)
df = pd.DataFrame(data)

#df

In [None]:
# Text that separates page chrome (share buttons, shop category breadcrumb)
# from the seller's own description.
CATEGORY_MARKER = "Flobert > Munitions - Balles 22LR"


def _extract_amount(raw_amounts):
    """Parse the first monetary amount out of a Series of scraped price strings.

    Decimal commas are converted to dots. An amount with a decimal part is
    preferred; when a string has none (e.g. a whole-number price), its first
    integer is used instead. Strings without any number yield NaN.
    """
    normalized = raw_amounts.str.replace(',', '.', regex=False)
    decimal_amount = normalized.str.extract(r'(\d+\.\d+)', expand=False)
    whole_amount = normalized.str.extract(r'(\d+)', expand=False)
    return decimal_amount.fillna(whole_amount).astype(float)


def _strip_page_chrome(description):
    """Return the description text that follows the category breadcrumb.

    Descriptions without the breadcrumb (e.g. 'N/A') and non-string values
    are returned unchanged instead of raising.
    """
    if not isinstance(description, str):
        return description
    parts = description.split(CATEGORY_MARKER)
    return parts[1].strip() if len(parts) > 1 else description


# change dtypes of columns for easier manipulation
df['product_name'] = df['product_name'].astype(str)
df['manufacturer'] = df['manufacturer'].astype(str)
df['is_new'] = df['is_new'].astype(str)
df['price'] = df['price'].astype(str)
df['shipping_cost'] = df['shipping_cost'].astype(str)

# change formatting of prices, remove currency, set as float
df['price'] = _extract_amount(df['price'])
# shipping text without any amount is counted as 0
df['shipping_cost'] = _extract_amount(df['shipping_cost']).fillna(0)

# change string values for new-used to binary (any other value becomes NaN)
df["is_new"] = df["is_new"].map({"Neuf": 1, "Occasion": 0})

# add new column for Total price
df['total_price'] = df['price'] + df['shipping_cost']

# remove text from description that doesnt belong to the item itself, eg share buttons and shop category
df['product_description'] = df['product_description'].apply(_strip_page_chrome)

#df

In [None]:
# build a list of 22LR ammo manufacturers

# manually built list instead of dynamically scraping each site.
# Website-agnostic approach. Increase in speed and decrease in scraping load.
# Missing brands can be found in df.manufacturer and entered here.

list_manufacturers = [
    'Aguila Ammunition',  # aguila is same as aquila
    'Aquila',  # aguila is same as aquila
    'American Eagle',
    'Armscor',
    'Australian Outback Ammo',
    'Barnaul',
    'Blaser',
    'Blazer',
    'Browning',
    'Cartoucherie Française',
    'CCI',
    'CBC',
    'Divers',
    'Eley',
    'ELD Performance',
    'Federal',  # Federal Premium and Federal are the same
    'Fiocchi',
    'Flobert',
    'Geco',
    'Gemtech',
    'Gevelot',
    'Golden Eagle',
    'Hornady',
    'Lapua',
    'Les Baer Custom',
    'Lot Diverses Marques',
    'Magtech',
    'Manufrance',
    'Mauser',
    'MaxxTech',
    'NCS',
    'Norma',
    'PMC',
    'PPU',
    'Rangemaster',
    'Remington',
    'RWS',
    'Sellier and Bellot',  # Sellier & Bellot and Sellier and Bellot are the same
    'SFM',
    'SK',
    'Solognac',
    'Spartan',
    'Speer',
    'Topshot',
    'Victory',
    'Winchester',
    'Wolf'
]

# Compiled once instead of on every call. The look-arounds require the brand
# to stand alone as a word, so short names such as 'SK' or 'CCI' are not
# found inside unrelated words ("basket"). Longer names are tried first so
# the most specific brand wins at any given position.
_MANUFACTURER_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(name) for name in sorted(list_manufacturers, key=len, reverse=True))
    + r')(?!\w)',
    re.IGNORECASE,
)


# function to search for manufacturer name in text using regex
def search_manufacturer(text):
    """Find the first known manufacturer name mentioned in `text`.

    Parameters
    ----------
    text : str
        Free text such as a product title. Non-string values (None, NaN)
        are treated as containing no manufacturer.

    Returns
    -------
    str or None
        The matched name exactly as it is written in `text` (matching is
        case-insensitive), or None when no listed manufacturer appears as
        a whole word.
    """
    if not isinstance(text, str):
        return None
    match = _MANUFACTURER_PATTERN.search(text)
    return match.group() if match else None

# Keep the scraped manufacturer when there is one; otherwise (NaN or 'N/A')
# look for a known brand name in the product title.
def _resolve_manufacturer(row):
    """Return the row's manufacturer, searching the product name when it is missing."""
    scraped = row['manufacturer']
    if pd.isna(scraped) or scraped == 'N/A':
        return search_manufacturer(row['product_name'])
    return scraped


df['manufacturer'] = df.apply(_resolve_manufacturer, axis=1)

In [None]:
#### create a regex pattern to match manufacturer names from the list
###manufacturers_pattern = re.compile(r"\b(" + "|".join(list_manufacturers) + r")\b")
###
#### extract manufacturer from product name or description
###def extract_manufacturer(text):
###    # try to extract from product name
###    match = manufacturers_pattern.search(text)
###    if match:
###        return match.group(1)
###    # if not found, try to extract from product description
###    else:
###        match = manufacturers_pattern.search(df.loc[df['product_name']==text, 'product_description'].values[0])
###        if match:
###            return match.group(1)
###        # if still not found, return None
###        else:
###            return None
###
#### apply function to extract manufacturer from product name or description
###df['manufacturer'] = df['product_name'].apply(extract_manufacturer)
###
#### check for empty cells, if any do a pass of regex on product description
###df.loc[df['manufacturer'].isnull(), 'manufacturer'] = df['product_description'].apply(extract_manufacturer)
###
#### if still no data, we fill with N/A
###df['manufacturer'].fillna('N/A', inplace=True)

In [None]:
## regex to catch any number divisible by 50 (min qtty of rounds in a box of ammo)
#def extract_bullet_qtty(text):
#    # match any number that is divisible by 50 without remainder
#    regex = r"\b(0|[5-9]\d*[0]|100)\s*(?:boites de\s*)?(?:cartouches|balles|munitions)\b"
#    match = re.search(regex, text, re.IGNORECASE)
#    if match:
#        # extract the matched number and convert it to integer
#        qtty = int(match.group(1))
#        # round the quantity to the nearest 50
#        qtty = (qtty // 50) * 50
#        return qtty
#    else:
#        return None
#
## check titles with regex
#df['bullet_qtty'] = df['product_name'].apply(extract_bullet_qtty)
#
## check for empty cells, if any do a pass of regex on product description --- !!! DUPE avoidance !!!
#df.loc[df['bullet_qtty'].isnull(), 'bullet_qtty'] = df['product_description'].apply(extract_bullet_qtty)
#
## if still no data, we fill with 50 for default min number of ammo per box
#df['bullet_qtty'].fillna(50, inplace=True)

In [None]:
# Matches "<number> [boites de] cartouches|balles|munitions" or
# "Munition / boite : <number>". Any integer is captured; whether it is a
# plausible round count is decided in extract_bullet_qtty.
_BULLET_QTTY_PATTERN = re.compile(
    r"\b(\d+)\s*(?:boites de\s*)?(?:cartouches|balles|munitions)\b"
    r"|\bMunition \/ boite\s*:\s*(\d+)\b",
    re.IGNORECASE,
)


# regex to catch the number of rounds, in multiples of 50 (min qtty of rounds in a box of ammo)
def extract_bullet_qtty(text):
    """Extract the number of rounds advertised in `text`.

    Parameters
    ----------
    text : str
        Product title or description. Non-string values (None, NaN) are
        treated as containing no quantity.

    Returns
    -------
    int or None
        The first advertised quantity of at least 50, rounded down to a
        multiple of 50 (e.g. 60 -> 50, 1000 -> 1000). None when no such
        quantity is found; 0 is never returned, so the result is safe to
        divide by.
    """
    if not isinstance(text, str):
        return None
    for match in _BULLET_QTTY_PATTERN.finditer(text):
        # Exactly one of the two capture groups is set, depending on which
        # alternative of the pattern matched.
        qtty = int(match.group(1) or match.group(2))
        # round the quantity down to a multiple of 50
        qtty = (qtty // 50) * 50
        if qtty > 0:
            return qtty
    return None

# check titles with regex
df['bullet_qtty'] = df['product_name'].apply(extract_bullet_qtty)

# for rows where the title gave no quantity, try the product description;
# only those rows are parsed, and quantities found in titles are never overwritten
missing_qtty = df['bullet_qtty'].isnull()
df.loc[missing_qtty, 'bullet_qtty'] = (
    df.loc[missing_qtty, 'product_description'].apply(extract_bullet_qtty)
)

# if still no data, we fill with 50 for default min number of ammo per box.
# Assigning the result back (instead of an in-place fillna on the selected column)
# keeps working under pandas Copy-on-Write; to_numeric guarantees a numeric
# column even when no quantity was found at all.
df['bullet_qtty'] = pd.to_numeric(df['bullet_qtty'], errors='coerce').fillna(50)

#df.sort_values('bullet_qtty', ascending=False)

In [None]:
# Cost of a single round: total price (item + shipping) divided by the
# number of rounds found for the listing.
df["price_per_bullet"] = df["total_price"].div(df["bullet_qtty"])

df

In [None]:
# Write the cleaned and enriched DataFrame to disk, leaving out the index column
df.to_csv(path_or_buf='naturabuy_scraped_data.csv', index=False)

In [None]:

# to do
# add scrape target - qtty of rounds. DONE
# cost per shot DONE
# product link DONE
# change is_new col data to 0 and 1 DONE

# df["QttyAmmo"] - > regex function to run over ProductName col. Also check product_description
# df["Cost_per_round"] = df["TotalPrice"] / df["QttyAmmo"]

# order of cols

#add to price selector:
#REGEX pattern - Munition / boite : 50