In [3]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder


In [25]:
# Read the Aldi product listing from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Aldi.csv")

In [48]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Name,Price,Price per Unit,Category,Subcategory,Date
0,Everyday Essentials Strawberries 227g,£1.69,£7.45 per kg,fresh_food,fruits,2024-11-19
1,Nature's Pick Bananas 5 Pack,£0.78,£0.16 each,fresh_food,fruits,2024-11-19
2,Specially Selected Easy Peelers 600g,£1.69,£2.82 per kg,fresh_food,fruits,2024-11-19
3,The Foodie Market Raspberry Dinos 20g,£0.33,£1.65 per 100g,fresh_food,fruits,2024-11-19
4,The Foodie Market Strawberry Dinos 20g,£0.33,£1.65 per 100g,fresh_food,fruits,2024-11-19


In [26]:
# Sanity check: collect every row whose 'Price' text contains a 'p' (i.e. is quoted in
# pence). The later cleaning step only strips '£' before casting to float, so any such
# row would need separate handling. Missing prices are treated as "no match".
p = df.loc[df["Price"].str.contains('p', na=False)]
p

Unnamed: 0,Name,Price,Price per Unit,Category,Subcategory,Date


In [27]:
# Parse 'Date' into datetimes, derive numeric Year / Month / Day columns from it,
# and then discard the original 'Date' column.
df = (
    df.assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
        Day=lambda frame: frame['Date'].dt.day,
    )
    .drop(columns=['Date'])
)

In [29]:
# Strip every '£' from the price text, then cast the whole column to float.
df['Price'] = (
    df['Price']
    .replace(to_replace='£', value='', regex=True)
    .astype(float)
)

In [None]:
# Ordered (marker, multiplier, canonical unit) rules. They are tried top to bottom and
# the first marker found inside the unit text wins, mirroring the original if/elif order.
_UNIT_CONVERSIONS = (
    ('100g', 10, 'kg'),        # 100g is 0.1kg
    ('10g', 100, 'kg'),        # 10g is 0.01kg
    ('kg', 1, 'kg'),           # already per kg
    ('100ml', 10, 'litre'),    # 100ml is 0.1 litre
    ('75cl', 4 / 3, 'litre'),  # 75cl is 0.75 litre
    ('litre', 1, 'litre'),     # already per litre
    ('each', 1, 'each'),       # per item
)


def _price_in_pounds(price_text):
    """Parse a price such as '£1.65', '45p' or a bare '1.65' into pounds as a float.

    A value containing 'p' is read as pence and divided by 100; anything else is read
    as pounds with an optional '£' sign. Raises ValueError if no number can be parsed.
    """
    price_text = price_text.strip()
    if 'p' in price_text:
        return float(price_text.replace('p', '').strip()) / 100
    return float(price_text.replace('£', '').strip())


def standardize_price_per_unit(price_per_unit):
    """
    Convert a 'price per unit' label into a (price, unit) pair.

    Handles '<price> per <unit>' labels (e.g. '£5 per kg', '£5 per 100g', '45p per 100ml')
    and '<price> each' labels (e.g. '£0.16 each', '16p each').

    Parameters
    ----------
    price_per_unit : Any
        Raw label. Non-string values (including NaN/None) are treated as missing.

    Returns
    -------
    tuple
        (price, unit) where:
        - price is a float in pounds, rescaled to per-kg / per-litre when the unit is
          one of the recognised sub-multiples ('100g', '10g', '100ml', '75cl');
        - unit is 'kg', 'litre' or 'each' for recognised units, otherwise the unit text
          exactly as it appeared after ' per ' (price left unscaled).
        (nan, 'other') is returned when the label has the right shape but the price or
        the ' per ' split cannot be parsed, and (nan, nan) when the value is not a
        string or contains neither 'per' nor 'each'.
    """
    if not isinstance(price_per_unit, str):
        return np.nan, np.nan  # missing or non-text value

    label = price_per_unit.strip()
    try:
        if 'per' in label:
            # Exactly one ' per ' separator is expected; anything else fails the
            # unpacking with ValueError and is reported as 'other' below.
            price_text, unit = label.split(' per ')
            # Always parse to float, even without a currency symbol, so the unit
            # multipliers below scale a number rather than repeating a string.
            price_value = _price_in_pounds(price_text)
            unit = unit.strip()

            for marker, multiplier, canonical_unit in _UNIT_CONVERSIONS:
                if marker in unit:
                    return price_value * multiplier, canonical_unit

            # Unrecognised unit: hand it back untouched so the caller can inspect it.
            return price_value, unit

        if 'each' in label:
            # '<price> each' format; pence prices are supported here as well.
            return _price_in_pounds(label.replace('each', '')), 'each'
    except ValueError:
        # Malformed split or non-numeric price
        return np.nan, 'other'

    return np.nan, np.nan  # string without a recognised format




In [28]:
# Run the standardiser over every 'Price per Unit' entry. Wrapping each returned
# (price, unit) pair in a Series makes apply() expand the result into two columns.
standardised_pairs = df['Price per Unit'].apply(
    lambda raw_value: pd.Series(standardize_price_per_unit(raw_value))
)
df[['Standardised Price per Unit', 'Unit']] = standardised_pairs

# Show which unit labels the conversion produced
print("Distinct Units:", df['Unit'].unique())


Distinct Units: ['kg' 'each' 'litre']


In [14]:
# One-hot encode the 'Unit' column.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and the old
# name was later removed, so neither spelling works on every release. Leaving the output
# format at its default (sparse) and densifying explicitly behaves the same everywhere.
encoder = OneHotEncoder(drop='if_binary')
unit_encoded = encoder.fit_transform(df[['Unit']]).toarray()

# Add the encoded indicator columns (named 'Unit_<category>') to the dataframe
unit_columns = encoder.get_feature_names_out(['Unit'])
df[unit_columns] = unit_encoded

# Drop the original 'Unit' column now that it has been encoded
df = df.drop(columns=['Unit'])




In [15]:
# Preview the first rows; a bare expression uses the notebook's rich table display,
# matching the earlier df.head() cell.
df.head()


                                     Name  Price  Price per Unit    Category  \
0   Everyday Essentials Strawberries 227g  £1.69    £7.45 per kg  fresh_food   
1            Nature's Pick Bananas 5 Pack  £0.78      £0.16 each  fresh_food   
2    Specially Selected Easy Peelers 600g  £1.69    £2.82 per kg  fresh_food   
3   The Foodie Market Raspberry Dinos 20g  £0.33  £1.65 per 100g  fresh_food   
4  The Foodie Market Strawberry Dinos 20g  £0.33  £1.65 per 100g  fresh_food   

  Subcategory        Date  Standardised Price per Unit  Unit_each  Unit_kg  \
0      fruits  2024-11-19                         7.45        0.0      1.0   
1      fruits  2024-11-19                         0.16        1.0      0.0   
2      fruits  2024-11-19                         2.82        0.0      1.0   
3      fruits  2024-11-19                        16.50        0.0      1.0   
4      fruits  2024-11-19                        16.50        0.0      1.0   

   Unit_litre  Unit_other  
0         0.0         

In [10]:
# Apply the standardiser to the 'Price per Unit' column; each returned (price, unit)
# pair is expanded into the two target columns.
df[['Standardised Price per Unit', 'Unit']] = df['Price per Unit'].apply(
    lambda x: pd.Series(standardize_price_per_unit(x))
)

# Check for distinct units captured
print("Distinct Units:", df['Unit'].unique())

# Keep only rows whose unit is one of the units of interest. The explicit copy makes
# the result an independent frame, so adding columns below does not write into a
# slice of the original (which would trigger SettingWithCopyWarning).
valid_units = ['kg', 'litre', 'each']
df = df[df['Unit'].isin(valid_units)].copy()

# One-hot encode the 'Unit' column.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and the old
# name was later removed; using the default output and densifying explicitly works on
# every release.
encoder = OneHotEncoder(drop='if_binary')
unit_encoded = encoder.fit_transform(df[['Unit']]).toarray()

# Add the one-hot encoded columns to the dataframe
unit_columns = encoder.get_feature_names_out(['Unit'])
df[unit_columns] = unit_encoded

# Drop the columns that have now been replaced by their standardised/encoded forms
df = df.drop(columns=['Unit', 'Price per Unit'])

# Display the cleaned DataFrame
df.head()


Distinct Units: ['kg' nan 'litre' 'other']
                                       Name  Price    Category Subcategory  \
0     Everyday Essentials Strawberries 227g  £1.69  fresh_food      fruits   
2      Specially Selected Easy Peelers 600g  £1.69  fresh_food      fruits   
3     The Foodie Market Raspberry Dinos 20g  £0.33  fresh_food      fruits   
4    The Foodie Market Strawberry Dinos 20g  £0.33  fresh_food      fruits   
5  The Foodie Market Strawberry Dinos 5x20g  £1.69  fresh_food      fruits   

         Date  Standardised Price per Unit  Unit_litre  
0  2024-11-19                         7.45         0.0  
2  2024-11-19                         2.82         0.0  
3  2024-11-19                        16.50         0.0  
4  2024-11-19                        16.50         0.0  
5  2024-11-19                        16.90         0.0  


