In [41]:
import pandas as pd
import re
import numpy as np


In [52]:
df = pd.read_csv("Aldi.csv")

In [48]:
df.head()

Unnamed: 0,Name,Price,Price per Unit,Category,Subcategory,Date
0,Everyday Essentials Strawberries 227g,£1.69,£7.45 per kg,fresh_food,fruits,2024-11-19
1,Nature's Pick Bananas 5 Pack,£0.78,£0.16 each,fresh_food,fruits,2024-11-19
2,Specially Selected Easy Peelers 600g,£1.69,£2.82 per kg,fresh_food,fruits,2024-11-19
3,The Foodie Market Raspberry Dinos 20g,£0.33,£1.65 per 100g,fresh_food,fruits,2024-11-19
4,The Foodie Market Strawberry Dinos 20g,£0.33,£1.65 per 100g,fresh_food,fruits,2024-11-19


In [26]:
p = df[df["Price"].str.contains('p', na=False)]
p

Unnamed: 0,Name,Price,Price per Unit,Category,Subcategory,Date


In [27]:
# Convert 'Date' column to datetime format (this automatically handles the conversion to day, month, and year)
df['Date'] = pd.to_datetime(df['Date'])

# Split the 'Date' column into individual components: Year, Month, Day
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
# Drop the original 'Date' column if it's no longer needed
df = df.drop(columns=['Date'])

In [29]:
# Remove the '£' sign and convert to float
df['Price'] = df['Price'].replace('£', '', regex=True).astype(float)

In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def standardize_price_per_unit(price_per_unit):
    if isinstance(price_per_unit, str):  # Ensure the value is a string
        
        # Split into price and unit if 'per' is in the string
        if 'per' in price_per_unit:
            price_value, unit = price_per_unit.split(' per ')  # Split into price and unit
            
            # Check if the price has a 'p' and remove it (e.g., '0.70p' becomes '£0.70')
            if 'p' in price_value:
                # If there's no decimal point, we assume it's in whole pence (e.g., '70p' -> '0.70')
                if '.' not in price_value:
                    price_value = f"£{float(price_value.replace('p', '').strip()) / 100:.2f}"
                else:
                    price_value = price_value.replace('p', '£')  # Replace 'p' with '£'
            
            price_value = float(price_value.replace('£', '').strip())  # Convert price to float and remove '£'
            
            # Unit conversions based on the specific units
            if '100g' in unit:  # Convert 100g to kg
                price_value = price_value * 10  # 100g is 0.1kg, so we multiply price by 10
                unit = 'kg'
            elif 'kg' in unit:  # No conversion needed
                unit = 'kg'
            elif '100ml' in unit:  # Convert 100ml to litre
                price_value = price_value * 10  # 100ml is 0.1l, so we multiply price by 10
                unit = 'litre'
            elif 'litre' in unit:  # No conversion needed
                unit = 'litre'
            elif 'cl' in unit:  # Convert cl to litre (e.g., 75cl to 0.75l)
                price_value = price_value / 10  # 75cl = 0.75l, so divide by 10
                unit = 'litre'
            elif 'each' in unit:  # Handle 'each' (e.g., '5.20 each')
                unit = 'each'

            return price_value, unit

        else:
            return np.nan, np.nan  # Handle rows without valid format
    else:
        return np.nan, np.nan  # If the value is not a string, return NaN for both price and unit

# Apply the function to the dataframe
df[['Standardised price per unit', 'Unit']] = df['Price per Unit'].apply(
    lambda x: pd.Series(standardize_price_per_unit(x))
)

# Filter out invalid unit values (only keep valid units)
df = df[df['Unit'].isin(['kg', 'litre', 'each'])]

# One-hot encode the 'Unit' column
encoder = OneHotEncoder(sparse=False, drop='if_binary')  # Drop 'unit_nan' column if it exists
unit_encoded = encoder.fit_transform(df[['Unit']])

# Create new columns based on one-hot encoding
unit_columns = encoder.get_feature_names_out(['Unit'])
df[unit_columns] = unit_encoded

# Drop the original 'Unit' and 'Price per Unit' columns
df.drop(columns=['Unit', 'Price per Unit'], inplace=True)

# Display the cleaned dataframe
print(df.head())


                                       Name  Price    Category Subcategory  \
0     Everyday Essentials Strawberries 227g  £1.69  fresh_food      fruits   
2      Specially Selected Easy Peelers 600g  £1.69  fresh_food      fruits   
3     The Foodie Market Raspberry Dinos 20g  £0.33  fresh_food      fruits   
4    The Foodie Market Strawberry Dinos 20g  £0.33  fresh_food      fruits   
5  The Foodie Market Strawberry Dinos 5x20g  £1.69  fresh_food      fruits   

         Date  Standardised price per unit  Unit_litre  
0  2024-11-19                         7.45         0.0  
2  2024-11-19                         2.82         0.0  
3  2024-11-19                        16.50         0.0  
4  2024-11-19                        16.50         0.0  
5  2024-11-19                        16.90         0.0  


