In [None]:
# Unit spellings in the raw data: centilitres appear as '75c3' and litres as 'lt'; the other units are '100ml', '100g', '10g', 'kg' and 'each'.


In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the raw product listing; the path is relative to the notebook's working directory
df = pd.read_csv("Asda.csv")

In [4]:
# Preview the first rows to check which columns were read and how prices are formatted
df.head()

Unnamed: 0,Name,Price,Price per Unit,Category,Subcategory,Date
0,ASDA Sweet & Plump Raspberries,£2.00,(£13.33/kg),fresh_food,fruits,2024-11-19
1,ASDA Sweet & Juicy Rainbow Fruit Platter,£3.50,(£8.33/kg),fresh_food,fruits,2024-11-19
2,ASDA 6 Sweet & Creamy Bananas,£0.94,(15.7p/each),fresh_food,fruits,2024-11-19
3,JUST ESSENTIALS by ASDA Pears (Colour and Vari...,£0.99,(£1.98/kg),fresh_food,fruits,2024-11-19
4,JUST ESSENTIALS by ASDA Raspberries,£1.79,(£11.93/kg),fresh_food,fruits,2024-11-19


In [4]:
# Parse the 'Date' strings into datetimes so the calendar parts can be read off
df['Date'] = pd.to_datetime(df['Date'])

# Expose year / month / day as separate numeric columns
date_accessor = df['Date'].dt
for column_name, attribute in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    df[column_name] = getattr(date_accessor, attribute)

# The combined 'Date' column is not kept once it has been split
df = df.drop(columns='Date')

In [6]:
# Remove the '£' sign and any thousands separators, then convert to float.
# Commas are stripped as well (as the price-per-unit parser does) so that a value
# such as '£1,200.00' converts instead of raising in astype(float).
df['Price'] = df['Price'].replace('[£,]', '', regex=True).astype(float)

In [6]:
def standardize_price_per_unit(price_per_unit):
    """Parse a raw 'Price per Unit' label into ``(price_in_pounds, unit)``.

    The label is expected to look like ``'(£13.33/kg)'`` or ``'(15.7p/each)'``:
    optional parentheses, an optional ``'was'`` marker, a price, a ``'/'`` and
    a unit. Prices are returned in pounds and rescaled to a common base unit.

    Parameters
    ----------
    price_per_unit : Any
        Raw cell value. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    tuple
        ``(price, unit)`` where

        * ``price`` is a ``float`` in pounds per ``unit``;
        * ``unit`` is ``'kg'``, ``'litre'`` or ``'each'`` for recognised units,
          or the raw (lower-cased) unit text when it is not recognised;
        * ``(nan, nan)`` is returned for non-string input or when there is no
          ``'/'`` separator;
        * ``(nan, 'other')`` is returned when the label cannot be split into
          exactly one price and one unit, or the price is not numeric.
    """
    if not isinstance(price_per_unit, str):
        return np.nan, np.nan  # missing / non-text cell

    # Drop surrounding parentheses, normalise case, and remove the 'was' marker
    text = price_per_unit.replace('(', '').replace(')', '')
    text = text.lower().replace('was', '').strip()

    if '/' not in text:
        return np.nan, np.nan  # no "<price>/<unit>" structure to parse

    try:
        # More than one '/' makes the unpacking raise ValueError (handled below)
        price_value, unit = text.split('/')

        # Commas are treated as thousands separators (e.g. '12,000.00')
        price_value = price_value.replace(',', '')

        if 'p' in price_value:
            # Pence -> pounds, e.g. '15.7p' -> 0.157
            price_value = float(price_value.replace('p', '').strip()) / 100
        else:
            # Pounds, with or without an explicit '£'. Converting here (rather
            # than only when '£' is present) guarantees a float from this point
            # on, so the arithmetic and comparison below cannot hit a str.
            price_value = float(price_value.replace('£', '').strip())
    except ValueError:
        # Could not split into price/unit, or the price was not numeric
        return np.nan, 'other'

    # Rescale to a common base unit. Branch order matters: the substring
    # checks are evaluated top to bottom and the first match wins.
    if '100g' in unit:
        price_value = price_value * 10  # 100g is 0.1kg
        unit = 'kg'
    elif '10g' in unit:
        price_value = price_value * 100  # 10g is 0.01kg
        unit = 'kg'
    elif 'kg' in unit:
        unit = 'kg'
    elif '100ml' in unit:
        price_value = price_value * 10  # 100ml is 0.1 litre
        unit = 'litre'
    elif 'lt' in unit:
        unit = 'litre'
    elif '75c3' in unit or '75cl' in unit:
        # Price is quoted per 75cl (written '75c3' in the data); 75cl is
        # 0.75 litre, so the per-litre price is the quoted price / 0.75.
        price_value = price_value / 0.75
        unit = 'litre'
    elif 'each' in unit:
        unit = 'each'

    # Heuristic kept from the original: values above 1000 are assumed to be
    # mis-keyed (e.g. '12,000.00' meant as '12.00') and are scaled down.
    # NOTE(review): this runs after unit rescaling, so a genuine high per-kg
    # price (e.g. £15/10g -> £1500/kg) is also divided — confirm that is intended.
    if price_value > 1000:
        price_value = price_value / 1000

    return price_value, unit




In [7]:
# Apply the function to the 'Price per Unit' column.
# The price column is spelled 'Standardised price per unit', exactly as in the
# later cells, so that a top-to-bottom run does not leave two near-duplicate
# columns that differ only in capitalisation.
df[['Standardised price per unit', 'Unit']] = df['Price per Unit'].apply(
    lambda x: pd.Series(standardize_price_per_unit(x))
)

# Verify the distinct units captured
print("Distinct Units:", df['Unit'].unique())

Distinct Units: ['kg' 'each' nan 'litre']


In [8]:
# Inspect the parsed unit per row; NaN marks rows the parser returned no unit for
print(df['Unit'])

0          kg
1          kg
2        each
3          kg
4          kg
         ... 
11622     NaN
11623     NaN
11624     NaN
11625     NaN
11626     NaN
Name: Unit, Length: 11627, dtype: object


In [20]:
# Apply the function to the dataframe
df[['Standardised price per unit', 'Unit']] = df['Price per Unit'].apply(
    lambda x: pd.Series(standardize_price_per_unit(x))
)

# Filter out invalid unit values (only keep valid units).
# .copy() makes the result an independent frame, so later column assignments
# on `df` do not raise SettingWithCopyWarning.
df = df[df['Unit'].isin(['kg', 'litre', 'each'])].copy()


In [21]:
# Sanity check: count rows whose 'Unit' is still missing after the filter
print(df['Unit'].isna().sum())

0


In [35]:
# Preview the frame after standardising the unit prices
df.head()

Unnamed: 0,Name,Price,Price per Unit,Category,Subcategory,Date,Standardised price per unit,Unit


In [48]:
# Apply the function to the dataframe
df[['Standardised price per unit', 'Unit']] = df['Price per Unit'].apply(
    lambda x: pd.Series(standardize_price_per_unit(x))
)

# Filter out invalid unit values (only keep valid units).
# .copy() gives an independent frame so the column assignments below do not
# operate on a slice of the unfiltered data (SettingWithCopyWarning).
df = df[df['Unit'].isin(['kg', 'litre', 'each'])].copy()

# One-hot encode the 'Unit' column.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4; using the default output and densifying with .toarray()
# works regardless of the installed version.
encoder = OneHotEncoder(handle_unknown='ignore')  # Avoid errors for unknown units
unit_encoded = encoder.fit_transform(df[['Unit']]).toarray()

# Create new columns based on one-hot encoding (one 0.0/1.0 column per unit)
unit_columns = list(encoder.get_feature_names_out(['Unit']))
df[unit_columns] = unit_encoded

# Drop the original 'Unit' and 'Price per Unit' columns now that they are encoded
df = df.drop(columns=['Unit', 'Price per Unit'])

# Display the cleaned dataframe
print(df.head())


                                                Name  Price    Category  \
0                     ASDA Sweet & Plump Raspberries  £2.00  fresh_food   
1           ASDA Sweet & Juicy Rainbow Fruit Platter  £3.50  fresh_food   
2                      ASDA 6 Sweet & Creamy Bananas  £0.94  fresh_food   
3  JUST ESSENTIALS by ASDA Pears (Colour and Vari...  £0.99  fresh_food   
4                JUST ESSENTIALS by ASDA Raspberries  £1.79  fresh_food   

  Subcategory        Date  Standardised price per unit  Unit_each  Unit_kg  \
0      fruits  2024-11-19                       13.330        0.0      1.0   
1      fruits  2024-11-19                        8.330        0.0      1.0   
2      fruits  2024-11-19                        0.157        1.0      0.0   
3      fruits  2024-11-19                        1.980        0.0      1.0   
4      fruits  2024-11-19                       11.930        0.0      1.0   

   Unit_litre  
0         0.0  
1         0.0  
2         0.0  
3         0.0  


