In [1]:
# libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import re

from tqdm import tqdm
import time

import sqlite3

import os
import random
import string

In [2]:
# PANDAS OPTIONS
# Set maximum number of columns and rows to display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set the maximum column width to a high value
pd.set_option('display.max_colwidth', 1000)

In [3]:
# read back raw data from csv
df = pd.read_csv('scraped_data.csv')

# change dtypes of columns for easier manipulation
df['product_name'] = df['product_name'].astype(str)
df['manufacturer'] = df['manufacturer'].astype(str)
df['is_new'] = df['is_new'].astype(str)
df['price'] = df['price'].astype(str)
df['shipping_cost'] = df['shipping_cost'].astype(str)

# change formatting of prices, remove currency, set as float
df['price'] = df['price'].str.replace(',', '.').str.extract('(\d+\.\d+)', expand=False).astype(float)
df['shipping_cost'] = df['shipping_cost'].str.replace(',', '.').str.extract('(\d+\.\d+)', expand=False).fillna(0).astype(float)

# change string values for new-used to binary
df["is_new"] = df["is_new"].map({"Neuf": 1, "Occasion": 0})

# add new column for Total price
df['total_price'] = df['price'] + df['shipping_cost']

# remove text from description that doesnt belong to the item itself, eg share buttons and shop category
df['product_description'] = df['product_description'].apply(lambda x: x.split("Flobert > Munitions - Balles 22LR")[1].strip())

#df.head(3)

In [5]:
df.head(2)

Unnamed: 0,product_name,product_link,manufacturer,is_new,price,shipping_cost,product_description,total_price
0,22LR RIFLE MATCH - Munitions - Balles 22LR (6770650),https://www.naturabuy.fr/22LR-RIFLE-MATCH-item-6770650.html,RWS,1,7.95,10.99,Marque : RWSEtat de l'objet : NeufType : Match Détails techniques : Loi : CMARQUE : RWSCALIBRE : 22LRDESIGNATION : RIFLE MATCHPOIDS D'OGIVE (GRS) : 40TYPE D'OGIVE : LRNCONDITIONNEMENT (PAR BOITE) : 50VITESSE A LA BOUCHE V0 (M/S) : 330,18.94
1,500 balles de 22 LONG RIFLE - Munitions - Balles 22LR (9659831),https://www.naturabuy.fr/500-balles-22-LONG-RIFLE-item-9659831.html,Aguila,1,85.0,15.0,"Marque : AguilaEtat de l'objet : Neuf AGUILA AGUILA Ammunition est un fabricant mexicain de cartouches pour armes à feu. Son origine remonte à 1961, ce fabricant dispose donc d’une longue expérience de près de 60 années. Sa production porte principalement sur le calibre .22 avec plusieurs modèles, ainsi que les calibres .17 et 12 Il y a une incontestable qualité dans la fabrication et les matériaux constituant ces cartouches de .22LR, les spécialistes connaissent bien cette marque quelque peu exotique et l’apprécient autant que les plus grandes marques mondiales, pour des tirs sur cible à 50 mètres très bien positionnés. Description Ces cartouches .22LR standard sont des munitions à percussion annulaire, disposant d’une ogive ne plomb de 40 grains (2,59 grammes) au profil de tête ronde. Avec leur vélocité de 330 m/s en sortie de bouche, on les utilise couramment et en quantités pour des tirs en club à 50 mètres avec des résultats qui satisfont les tireurs les plu...",100.0


In [6]:
# qtty of ammo search in TITLE

# Regular expression pattern for finding numbers divisible by 50 without remainder
pattern1 = r'\b(?:[5-9]|[1-9]\d{1,})0\b|\b525\b'

# Function to find matching numbers and store in new columns
def find_ammo_qtty_title(df):
    # Find all matching numbers and store them in a list
    matches = df['product_name'].apply(lambda x: re.findall(pattern1, str(x)))
    
    # Create new columns for each match and store the results
    for i, match in enumerate(matches):
        for j in range(len(match)):
            column_name = f'ammo_qtty_title_{j+1}'
            df.at[i, column_name] = match[j]
    
    return df


In [8]:
df_new = find_ammo_qtty_title(df)
df_new.head(1)

Unnamed: 0,product_name,product_link,manufacturer,is_new,price,shipping_cost,product_description,total_price,ammo_qtty_title_1,ammo_qtty_title_2,ammo_qtty_title_3,ammo_qtty_title_4
0,22LR RIFLE MATCH - Munitions - Balles 22LR (6770650),https://www.naturabuy.fr/22LR-RIFLE-MATCH-item-6770650.html,RWS,1,7.95,10.99,Marque : RWSEtat de l'objet : NeufType : Match Détails techniques : Loi : CMARQUE : RWSCALIBRE : 22LRDESIGNATION : RIFLE MATCHPOIDS D'OGIVE (GRS) : 40TYPE D'OGIVE : LRNCONDITIONNEMENT (PAR BOITE) : 50VITESSE A LA BOUCHE V0 (M/S) : 330,18.94,6770650,,,


In [7]:
df.head(1)

Unnamed: 0,product_name,product_link,manufacturer,is_new,price,shipping_cost,product_description,total_price
0,22LR RIFLE MATCH - Munitions - Balles 22LR (6770650),https://www.naturabuy.fr/22LR-RIFLE-MATCH-item-6770650.html,RWS,1,7.95,10.99,Marque : RWSEtat de l'objet : NeufType : Match Détails techniques : Loi : CMARQUE : RWSCALIBRE : 22LRDESIGNATION : RIFLE MATCHPOIDS D'OGIVE (GRS) : 40TYPE D'OGIVE : LRNCONDITIONNEMENT (PAR BOITE) : 50VITESSE A LA BOUCHE V0 (M/S) : 330,18.94


In [None]:
# qtty of ammo search in DESCRIPTION

# Regular expression pattern for finding numbers divisible by 50 without remainder
pattern2 = r'\b(?:[5-9]|[1-9]\d{1,})0\b|\b525\b'

# Function to find matching numbers and store in new columns
def find_ammo_qtty_description(df):
    # Find all matching numbers and store them in a list
    matches = df['product_description'].apply(lambda x: re.findall(pattern2, str(x)))
    
    # Create new columns for each match and store the results
    for i, match in enumerate(matches):
        for j in range(len(match)):
            column_name = f'ammo_qtty_description_{j+1}'
            df.at[i, column_name] = match[j]
    
    return df


In [None]:
# qtty of boxes search

# Regular expression pattern for finding numbers before/after French words that mean "box" or "boxes"
pattern3 = r'\b(\d+)\s*(?:(?:LOT|PAQUET)\s+DE\s+)?(?:(\d+)\s*(?:BOITE|BOITES))\b|\b(?:PAR\s+(\d+))\b'

# Function to find matching numbers and store in new columns
def find_box_numbers(df):
    # Find all matching numbers and store them in a list
    matches_name = df['product_name'].apply(lambda x: re.findall(pattern3, str(x)))
    matches_desc = df['product_description'].apply(lambda x: re.findall(pattern3, str(x)))
    matches = matches_name + matches_desc
    
    # Create new columns for each match and store the results
    for i, match in enumerate(matches):
        for j in range(len(match)):
            column_name = f'box_qty_{j+1}'
            df.at[i, column_name] = ''.join(match[j])
    
    return df
