In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [2]:
url = "https://www.aldi.com.au/groceries/price-reductions/"

In [3]:
HEADERS = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

In [4]:
# get page data
page = requests.get(url, headers=HEADERS)

In [5]:
soup = BeautifulSoup(page.content, 'html.parser')


In [None]:
#print(soup.prettify())

In [93]:
# Find all items containing the data
items = soup.find_all('div', class_='box')

# Prepare lists to store data
descriptions = []
values = []
images = []

# Loop through items and extract data
for item in items:
    # Extract description
    description = item.find('div', class_='box--description--header')
    if description:
        description_text = description.get_text(separator=' ', strip=True)  # Add space between elements
    else:
        description_text = None

    # Extract value and decimal
    value = item.find('span', class_='box--value')
    decimal = item.find('span', class_='box--decimal')
    value_text = f"{value.get_text(strip=True)}.{decimal.get_text(strip=True)}" if value and decimal else None

    # Extract image URL
    image_tag = item.find('img')
    if image_tag and image_tag.has_attr('src'):
        image_url = image_tag['src']
    else:
        image_url = None

    # Append data to lists
    descriptions.append(description_text)
    values.append(value_text)
    images.append(image_url)

# Create a pandas DataFrame
data = pd.DataFrame({
    'Description': descriptions,
    'Price': values,
    'Image URL': images
})

# Print or save the DataFrame
print(data)

                                           Description    Price  \
0                                                 None     None   
1    Ready, Set… Cook! Marinated Whole RSPCA Approv...   $5..99   
2                 Ready, Set… Cook! Thin Sausages 500g   $3..69   
3                 Ready, Set… Cook! Pork Sausages 500g   $3..69   
4    Ready, Set… Cook! Beef Sausages Honey Flavour ...   $3..69   
..                                                 ...      ...   
157            Silvesters Cat Food Meat Varieties 400g   $1..49   
158            Silvesters Cat Food Fish Varieties 400g   $1..49   
159   Julius Dog Food Casserole with Beef & Gravy 700g   $2..29   
160         Julius Dry Dog Food Beef and Vegetable 8kg  $17..49   
161                         Supercoat Dry Dog Food 6kg  $24..99   

                                             Image URL  
0    https://www.aldi.com.au/fileadmin/_processed_/...  
1    https://www.aldi.com.au/fileadmin/_processed_/...  
2    https://www.aldi.co

In [94]:
# Function to extract 'amount' from the end of the 'Description'
def extract_amount(description):
    if description:  # Check if description is not None
        # Check if 'per kg' is in the description
        if 'per kg' in description.lower():
            return '1kg'  # If 'per kg' found, return '1kg'

        # Regex to match valid numeric amounts followed by valid units (g, kg, pk, L, ml)
        match = re.search(r'(\d+\s?(g|kg|pk|L|ml))(?:[-/x]?\s?\d*\s?(g|kg|pk|L|ml))*$', description)
        if match:
            return match.group(0)  # Return the matched amount
    return None  # Return None if no valid match is found

# Apply function to create 'Amount' column
data['Amount'] = data['Description'].apply(extract_amount)

# Function to remove Amount from Description
def remove_amount_from_description(row):
    if row['Amount']:
        # Regex to remove the amount (e.g., '500g', '1kg') from Description
        row['Description'] = row['Description'].replace(' per kg', '')
        return re.sub(r'\s?' + re.escape(row['Amount']), '', row['Description'])
    return row['Description']

# Apply function to remove Amount from Description
data['Description'] = data.apply(remove_amount_from_description, axis=1)

# Print or save the cleaned DataFrame
print(data)



                                           Description    Price  \
0                                                 None     None   
1    Ready, Set… Cook! Marinated Whole RSPCA Approv...   $5..99   
2                      Ready, Set… Cook! Thin Sausages   $3..69   
3                      Ready, Set… Cook! Pork Sausages   $3..69   
4        Ready, Set… Cook! Beef Sausages Honey Flavour   $3..69   
..                                                 ...      ...   
157                 Silvesters Cat Food Meat Varieties   $1..49   
158                 Silvesters Cat Food Fish Varieties   $1..49   
159        Julius Dog Food Casserole with Beef & Gravy   $2..29   
160             Julius Dry Dog Food Beef and Vegetable  $17..49   
161                             Supercoat Dry Dog Food  $24..99   

                                             Image URL Amount  
0    https://www.aldi.com.au/fileadmin/_processed_/...   None  
1    https://www.aldi.com.au/fileadmin/_processed_/...    1kg  
2  

In [95]:
# Function to clean the Price column
def clean_price(price):
    if price:  # Check if price is not None
        # Replace dot with 'c' if there's no number after the dot
        if re.search(r'\.\s*$', price):  # Check if the dot is at the end without numbers after it
            return price.replace('.', 'c')
        # Remove all dots except the first one
        return re.sub(r'\.(?=.*\.)', '', price)
    return price

# Apply the clean_price function to the Price column
data['Price'] = data['Price'].apply(clean_price)


In [99]:
# Remove rows where 'Description' is None
data = data.dropna(subset=['Description'])

# Reorder columns: Move 'Amount' before 'Price'
data = data[['Description', 'Amount', 'Price', 'Image URL']]

# Remove rows where all columns are None
data = data.dropna(how='all')

# Print or save the cleaned DataFrame
print(data)


                                           Description Amount   Price  \
1    Ready, Set… Cook! Marinated Whole RSPCA Approv...    1kg   $5.99   
2                      Ready, Set… Cook! Thin Sausages   500g   $3.69   
3                      Ready, Set… Cook! Pork Sausages   500g   $3.69   
4        Ready, Set… Cook! Beef Sausages Honey Flavour   500g   $3.69   
5                   Jindurra Station Beef Sizzle Steak    1kg  $22.99   
..                                                 ...    ...     ...   
157                 Silvesters Cat Food Meat Varieties   400g   $1.49   
158                 Silvesters Cat Food Fish Varieties   400g   $1.49   
159        Julius Dog Food Casserole with Beef & Gravy   700g   $2.29   
160             Julius Dry Dog Food Beef and Vegetable    8kg  $17.49   
161                             Supercoat Dry Dog Food    6kg  $24.99   

                                             Image URL  
1    https://www.aldi.com.au/fileadmin/_processed_/...  
2    http

In [102]:
# Define categories and keywords
categories = {
    'Fruit & Veg': [
        'Bananas', 'Apples', 'Oranges', 'Grapes', 'Carrots', 'Potatoes', 'Sweet Potatoes',
        'Tomatoes', 'Cucumber', 'Broccoli', 'Cauliflower', 'Spinach', 'Lettuce', 'Peppers',
        'Zucchini', 'Mushrooms', 'Onions', 'Garlic', 'Strawberries', 'Blueberries', 'Avocado', 'Coconut',
        'Olives', 'Beans', 'Bean', 'Chickpeas', 'Peas', 'Lentils', 'Mango'
    ],
    'Bakery': [
        'Bread', 'Wholemeal Bread', 'Croissant', 'Cake', 'Chocolate Cake', 'Muffin', 'Bagel',
        'Donut', 'Baguette', 'Roll', 'Flatbread', 'Pita', 'Brioche', 'Shortbread',
        'Sourdough', 'Focaccia', 'Ciabatta', 'Pastry'
    ],
    'Poultry, Meat & Seafood': [
        'Chicken', 'Chicken Breast', 'Chicken Thigh', 'Beef', 'Steak', 'Pork', 'Lamb',
        'Turkey', 'Bacon', 'Ham', 'Sausage', 'Salami', 'Duck', 'Fish', 'Salmon', 'Tuna',
        'Shrimp', 'Prawns', 'Crab', 'Lobster', 'Cod', 'Haddock', 'Mussels', 'Stock'
    ],
    'Deli & Chilled Meals': [
        'Ham', 'Salami', 'Prosciutto', 'Pasta Salad', 'Quiche', 'Soup', 'Coleslaw',
        'Sandwich', 'Wrap', 'Sausage Roll', 'Ready Meals', 'Lasagna', 'Curry', 'Pizza',
        'Deli Chicken', 'Meat Platter', 'Cheese Platter'
    ],
    'Dairy, Eggs & Fridge': [
        'Milk', 'Almond Milk', 'Oat Milk', 'Soy Milk', 'Cheese', 'Cheddar Cheese', 'Mozzarella',
        'Butter', 'Eggs', 'Free-Range Eggs', 'Greek Yogurt', 'Yogurt', 'Cream', 'Whipping Cream',
        'Sour Cream', 'Cream Cheese', 'Custard'
    ],
    'Lunch Box': [
        'Juice Box', 'Snack Bar', 'Muesli Bar', 'Granola Bar', 'Crackers', 'Rice Crackers',
        'Fruit Cup', 'Cheese Stick', 'String Cheese', 'Sandwich', 'Mini Sandwich', 'Wrap',
        'Chips', 'Popcorn', 'Dried Fruit', 'Nuts', 'Sultanas'
    ],
    'Pantry': [
        'Rice', 'Brown Rice', 'Basmati Rice', 'Pasta', 'Gnocchi', 'Spaghetti', 'Macaroni', 'Flour',
        'Sugar', 'Brown Sugar', 'Canned Food', 'Canned Tomatoes', 'Canned Beans',
        'Spices', 'Salt', 'Pepper', 'Paprika', 'Curry Powder', 'Oil', 'Olive Oil', 'Vegetable Oil',
        'Vinegar', 'Honey', 'Peanut Butter', 'Jam', 'Cereal', 'Oats', 'Granola', 'Coffee', 'Tea', 'Expressi',
        'Red Bull', 'Sauce', 'Kellogg\'s'
    ],
    'International Foods': [
        'Soy Sauce', 'Curry Paste', 'Tortilla', 'Noodles', 'Rice Noodles', 'Soba Noodles',
        'Sushi', 'Wasabi', 'Miso', 'Tikka Masala', 'Hoisin Sauce', 'Teriyaki Sauce',
        'Pita Bread', 'Falafel', 'Hummus', 'Pad Thai', 'Kimchi', 'Gyoza', 'Spring Roll', 'Dolmades'
    ],
    'Snacks & Confectionery': [
        'Chips', 'Potato Chips', 'Chocolate', 'Dark Chocolate', 'Milk Chocolate', 'Candy',
        'Biscuits', 'Cookies', 'Lollies', 'Marshmallows', 'Popcorn', 'Nuts', 'Trail Mix',
        'Pretzels', 'Chewing Gum', 'Mints', 'Allen\'s', 'Maltesers', 'M&M\'s', 'Biscuit'
    ],
    'Freezer': [
        'Frozen Pizza', 'Frozen Vegetables', 'Frozen Chips', 'Frozen Fish', 'Frozen Peas',
        'Frozen Corn', 'Ice Cream', 'Sorbet', 'Frozen Yogurt', 'Frozen Chicken',
        'Frozen Sausages', 'Frozen Meatballs', 'Frozen Prawns', 'Frozen Spring Rolls',
        'Frozen Dumplings', 'Frozen Fruit', 'Frozen Berries', 'Monarc Utopia'
    ]
}


# Function to classify items
def classify_item(item, category_dict):
    for category, keywords in category_dict.items():
        if any(keyword.lower() in item.lower() for keyword in keywords):
            return category
    return 'Miscellaneous'  # Default for unmatched items

# Define the cleanup function
def classify_item_cleanup(item, category):
    # Check if 'Dog Food' or 'Pet' is in the description
    if 'dog food' in item.lower() or 'pet' in item.lower():
        return 'Miscellaneous'
    return category  # Otherwise, return the original category

# Function to classify items with cleanup
def classify_item_with_cleanup(item, category_dict):
    # Classify the item using the existing classify_item function
    category = classify_item(item, category_dict)
    # Clean up the classification if necessary
    return classify_item_cleanup(item, category)

# Apply the classification function with cleanup
data['Category'] = data['Description'].apply(lambda x: classify_item_with_cleanup(x, categories))

# Display the categorized DataFrame
print(data)

                                           Description Amount   Price  \
1    Ready, Set… Cook! Marinated Whole RSPCA Approv...    1kg   $5.99   
2                      Ready, Set… Cook! Thin Sausages   500g   $3.69   
3                      Ready, Set… Cook! Pork Sausages   500g   $3.69   
4        Ready, Set… Cook! Beef Sausages Honey Flavour   500g   $3.69   
5                   Jindurra Station Beef Sizzle Steak    1kg  $22.99   
..                                                 ...    ...     ...   
157                 Silvesters Cat Food Meat Varieties   400g   $1.49   
158                 Silvesters Cat Food Fish Varieties   400g   $1.49   
159        Julius Dog Food Casserole with Beef & Gravy   700g   $2.29   
160             Julius Dry Dog Food Beef and Vegetable    8kg  $17.49   
161                             Supercoat Dry Dog Food    6kg  $24.99   

                                             Image URL  \
1    https://www.aldi.com.au/fileadmin/_processed_/...   
2    ht

In [103]:
data.to_excel('output.xlsx', index=False)
#data.to_csv('output.csv', index=False)

In [62]:
!pip install pymongo



In [66]:
# from pymongo import MongoClient

# client = MongoClient('mongodb+srv://discountmate_read_and_write:discountmate@discountmatecluster.u80y7ta.mongodb.net/?retryWrites=true&w=majority&appName=DiscountMateCluster')

# db = client['ScrappedData']
# collection = db['2024-13-11 Aldi Data']

# document = {"name":"sktech", "city":"pune"}
# insert_doc = collection.insert_one(document)

# print("Data inserted successfully.")
# client.close()

Data inserted successfully.


In [104]:
from pymongo import MongoClient

client = MongoClient('mongodb+srv://discountmate_read_and_write:discountmate@discountmatecluster.u80y7ta.mongodb.net/?retryWrites=true&w=majority&appName=DiscountMateCluster')

db = client['ScrappedData']
collection = db['2024-13-11 Aldi Data']

# 4. Convert DataFrame to dictionary (MongoDB expects data in this format)
data_dict = data.to_dict(orient='records')

# 5. Insert data into MongoDB collection
insert_doc = collection.insert_many(data_dict)

print("Data inserted successfully.")
client.close()

Data inserted successfully.
