In [1]:
import pandas as pd
import numpy as np

# Input parquet file. NOTE(review): this is an absolute, machine-specific path;
# anyone running the notebook elsewhere has to edit it.
PROCESSED_DATA_PATH = "C:/Project/UK store analysis/data/02_processed/canonical_products_e5.parquet"

# Read the file, order rows by supermarket, then product, then date, and
# renumber the index 0..n-1. The grouped features computed later rely on the
# rows of each (supermarket, canonical_name) group being in date order.
df = (
    pd.read_parquet(PROCESSED_DATA_PATH)
    .sort_values(by=["supermarket", "canonical_name", "date"])
    .reset_index(drop=True)
)

print("Data Loaded and sorted successfully")
df.head()

Data Loaded and sorted successfully


Unnamed: 0,supermarket,prices,prices_unit,unit,names,date,category,own_brand,normalized_name,canonical_name
0,ASDA,1.29,3.9,l,,2024-01-11,food_cupboard,False,,
1,ASDA,,,,,2024-01-12,fresh_food,False,,
2,ASDA,,,,,2024-01-13,fresh_food,False,,
3,ASDA,1.29,3.9,l,,2024-01-15,food_cupboard,False,,
4,ASDA,1.29,3.9,l,,2024-01-17,food_cupboard,False,,


## A. Rolling Price Statistics

In [2]:
# Rolling price statistics per (supermarket, canonical_name) group.
#
# For every window size this adds four columns:
#   price_rol_mean_{w}d  - rolling mean (price trend)
#   price_rol_std_{w}d   - rolling standard deviation (price volatility)
#   price_rol_min_{w}d / price_rol_max_{w}d - recent price range
#
# NOTE(review): an integer window counts ROWS (observations) inside each
# group, not calendar days. If a product is not observed every day, a "7d"
# window spans more than seven days - confirm that this is intended.
windows = [7, 14, 30]

grouped = df.groupby(["supermarket", "canonical_name"])

for window in windows:
    # Use pandas' native grouped rolling instead of transform(lambda ...):
    # the lambda form calls back into Python once per group and statistic.
    rolling_prices = grouped["prices"].rolling(window, min_periods=1)
    for stat in ("mean", "std", "min", "max"):
        rolled = getattr(rolling_prices, stat)()
        # The result is indexed by (supermarket, canonical_name, row label);
        # drop the two group levels so the assignment aligns on row labels.
        df[f"price_rol_{stat}_{window}d"] = rolled.droplevel([0, 1])

# The rolling std is NaN whenever a window holds fewer than two valid prices;
# replace those NaNs with 0.
# NOTE(review): this also zero-fills windows that contain no valid price at
# all, which then look like "no volatility" - confirm that this is acceptable.
std_cols = [col for col in df.columns if "rol_std" in col]
df[std_cols] = df[std_cols].fillna(0)

print("Created rolling price statistics features")

Created rolling price statistics features


## B. Price Momentum and Lag Features

In [3]:
# Lagged prices and the one-step price change per (supermarket, canonical_name).
#
# NOTE(review): lags and the diff are taken over ROWS within each group, so
# "1d"/"7d" equal calendar days only if every product has one row per day.
lags = [1, 7]

# Build the grouped price series here so this cell does not depend on a
# `grouped` variable left behind by a different cell.
prices_by_product = df.groupby(["supermarket", "canonical_name"])["prices"]

for lag in lags:
    # Native GroupBy.shift: previous price `lag` rows back within the group.
    df[f"price_lag_{lag}d"] = prices_by_product.shift(lag)

# Price difference versus the previous row of the same group
df["price_diff_1d"] = prices_by_product.diff(1)

# Fill NaNs created by shift/diff (this also fills NaNs that come from
# missing prices).
# NOTE(review): a filled lag of 0 reads as "previous price was 0" - confirm
# that downstream consumers expect this.
lag_diff_cols = [col for col in df.columns if "lag" in col or "diff" in col]
df[lag_diff_cols] = df[lag_diff_cols].fillna(0)

print("Created price momentum and lag features")

Created price momentum and lag features


# 3. Feature Creation: Capturing Competitive Landscape

## A. Daily Market Price Comparison

In [4]:
# Cross-supermarket price statistics for each (canonical_name, date) pair.
market_stat_names = {
    'mean': 'market_avg_price',
    'std': 'market_std_price',
    'min': 'market_min_price',
    'max': 'market_max_price',
}

daily_market_stats = (
    df.groupby(['canonical_name', 'date'])["prices"]
    .agg(list(market_stat_names))
    .rename(columns=market_stat_names)
    .reset_index()
)
# NOTE(review): market_std_price is NaN when fewer than two prices exist for a
# product/date and is left as NaN here - confirm that this is intended.

# Merge these market-wide stats back into the main dataframe.
# Any market_* columns left over from an earlier run of this cell are dropped
# first; otherwise the merge would emit *_x / *_y suffixed duplicates and the
# lookups below would raise KeyError.
df = pd.merge(
    df.drop(columns=list(market_stat_names.values()), errors="ignore"),
    daily_market_stats,
    on=["canonical_name", "date"],
    how="left",
)

# --- Create Competitiveness Scores ---
# Price vs. Market Average: a negative value means cheaper than average
df["price_vs_market_avg"] = df["prices"] - df["market_avg_price"]

# Ascending rank within each product/date: 1 = lowest price. Tied prices share
# the lowest rank of the tie (method="min"); a missing price gets a NaN rank.
df["price_rank"] = df.groupby(['canonical_name', 'date'])["prices"].rank(method="min")

# Binary feature: 1 if this row's price equals the lowest price for that
# product/date (ties are all flagged), 0 otherwise, including missing prices.
df["is_cheapest"] = (df["prices"] == df["market_min_price"]).astype(int)

print("Created market competitiveness features.")

Created market competitiveness features.


# 4. Feature Creation: Temporal Features

In [5]:
# Calendar features derived from the `date` column through the .dt accessor.
date_parts = df["date"].dt

df["day_of_week"] = date_parts.dayofweek  # Monday=0 ... Sunday=6
df["day_of_month"] = date_parts.day
# isocalendar() returns year/week/day columns; keep the ISO week number as int
df["week_of_year"] = date_parts.isocalendar()["week"].astype(int)
df["month"] = date_parts.month

print("Created temporal features")

Created temporal features


# 5. Inspect and Save Final Dataset

In [7]:
# Sanity check: show the last rows of one product, transposed so that each
# feature is on its own line.
print("Features inspection for one product:")
ketchup_rows = df["canonical_name"] == "heinz tomato ketchup bottle"
print(df.loc[ketchup_rows].tail().T)

# Write the feature-engineered frame to parquet.
# NOTE(review): absolute, machine-specific output path.
FINAL_DATA_PATH = "C:/Project/UK store analysis/data/02_processed/feature_engineered_data.parquet"
df.to_parquet(FINAL_DATA_PATH)

print(f"\nFeature engineering complete. Data saved to {FINAL_DATA_PATH}")

Features inspection for one product:
                                              8346682  \
supermarket                                     Tesco   
prices                                            2.5   
prices_unit                                       7.3   
unit                                               kg   
names                Heinz Tomato Ketchup Bottle 342G   
date                              2024-04-11 00:00:00   
category                                food_cupboard   
own_brand                                       False   
normalized_name           heinz tomato ketchup bottle   
canonical_name            heinz tomato ketchup bottle   
price_rol_mean_7d                            2.285714   
price_rol_std_7d                             0.267261   
price_rol_min_7d                                  2.0   
price_rol_max_7d                                  2.5   
price_rol_mean_14d                               2.25   
price_rol_std_14d                            0.2594