In [None]:
import os
import json
import pandas as pd
import zipfile
from datetime import datetime
from ruamel.yaml import YAML

# === Variables ===
# Absolute path to the YAML config file; the execution section passes it to
# load_config() to read the settings and to update_last_unzip_time() to rewrite them.
config_file_path = r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\3) Code\config.yaml"

# Shared YAML handler, used both for loading the config and for dumping it back
yaml_handler = YAML()
yaml_handler.preserve_quotes = True  # presumably keeps the file's original quoting when it is rewritten — confirm in ruamel.yaml docs
yaml_handler.width = 4096  # prevent wrapping of long strings

# === Functions ===

# Function to load config from YAML
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return the loaded data.

    The module-level ``yaml_handler`` does the parsing, so the returned object
    is whatever that handler produces for the document.
    """
    with open(config_path, 'r') as config_stream:
        parsed_config = yaml_handler.load(config_stream)
    return parsed_config

# Function to load last unzip timestamp
def load_last_unzip_time(config_data):
    """Return the time of the previous unzip run stored in the config.

    Args:
        config_data: Parsed config mapping. The value is read from
            ``variables -> opener_variables -> last_unzip``.

    Returns:
        datetime: The stored timestamp.

    Raises:
        KeyError: If one of the nested config keys is missing.
        ValueError: If the stored string is not a valid ISO 8601 timestamp.
    """
    stored_value = config_data['variables']['opener_variables']['last_unzip']
    # A YAML loader turns an unquoted ISO timestamp into a datetime object, which
    # fromisoformat() would reject with a TypeError; pass such values straight through.
    if isinstance(stored_value, datetime):
        return stored_value
    return datetime.fromisoformat(stored_value)

# Function to update the unzip timestamp in config
def update_last_unzip_time(config_path, config_data, updated_timestamp):
    """Store ``updated_timestamp`` in the config and write the config back to disk.

    The timestamp is saved as an ISO 8601 string under
    ``variables -> opener_variables -> last_unzip``. ``config_data`` is modified
    in place and then dumped to ``config_path`` with the shared ``yaml_handler``.
    """
    opener_variables = config_data['variables']['opener_variables']
    opener_variables['last_unzip'] = updated_timestamp.isoformat()

    with open(config_path, 'w') as config_stream:
        yaml_handler.dump(config_data, config_stream)

# Function to unzip new or modified files
def unzip_new_or_updated_files(zip_folder, unzipped_output_root_folder, last_unzip_timestamp):
    """Extract every ZIP in ``zip_folder`` modified after ``last_unzip_timestamp``.

    Each archive is extracted into its own sub-folder of
    ``unzipped_output_root_folder``, named after the archive without its extension.

    Returns:
        tuple: ``(latest_timestamp, folders)`` where ``latest_timestamp`` is the
        newest modification time among the extracted archives (or the unchanged
        ``last_unzip_timestamp`` when nothing was extracted) and ``folders`` lists
        the extraction folders in the order they were processed.
    """
    extracted_folders = []
    newest_seen = last_unzip_timestamp

    for entry in os.listdir(zip_folder):
        # Only ZIP archives are of interest (extension matched case-insensitively)
        if not entry.lower().endswith('.zip'):
            continue

        archive_path = os.path.join(zip_folder, entry)
        modified_at = datetime.fromtimestamp(os.path.getmtime(archive_path))

        # Archives untouched since the previous run were already extracted
        if modified_at <= last_unzip_timestamp:
            continue

        archive_stem = os.path.splitext(entry)[0]
        target_folder = os.path.join(unzipped_output_root_folder, archive_stem)
        extracted_folders.append(target_folder)

        os.makedirs(target_folder, exist_ok=True)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_folder)

        newest_seen = max(newest_seen, modified_at)

    return newest_seen, extracted_folders

def grab_UIDs(df_fact_transactions, df_all):
    """Attach a product UID to every transaction row, extending the master list as needed.

    Products are matched case-insensitively and ignoring surrounding whitespace:
    ``df_fact_transactions['name']`` against ``df_all['product name']``. Names
    that are not in the master list yet are appended to it with freshly
    numbered ``ID_<n>`` UIDs, continuing after the highest existing number.

    Args:
        df_fact_transactions: Transactions frame with a ``name`` column. A frame
            without that column (an export with no items) is returned with an
            empty ``UID`` column added.
        df_all: Master product frame with ``UID`` and ``product name`` columns.

    Returns:
        tuple: ``(transactions, master)`` — a copy of the transactions with a
        ``UID`` column added, and a copy of the master list including any newly
        registered products. Neither input frame is modified. Rows whose name
        is missing keep a missing ``UID``.
    """
    # Work on copies so the caller's frames are not left with a stray 'key' column.
    facts = df_fact_transactions.copy()
    master = df_all.copy()

    # An export without any items produces a frame that has no 'name' column.
    if 'name' not in facts.columns:
        facts['UID'] = pd.Series(dtype=object)
        return facts, master

    # Lower-case, stripped join key for case-insensitive matching
    facts['key'] = facts['name'].str.lower().str.strip()
    master['key'] = master['product name'].str.lower().str.strip()

    # One UID per key: duplicate keys in the master list would otherwise
    # multiply transaction rows in the join. Rows without a UID or a name
    # cannot identify a product and are left out of the lookup.
    lookup = (
        master.dropna(subset=['UID', 'key'])[['UID', 'key']]
        .drop_duplicates('key')
    )

    # Named products that have no UID yet
    needs_uid = facts['key'].notna() & ~facts['key'].isin(lookup['key'])
    if needs_uid.any():
        # Keep the first spelling seen for each new product (original capitalization)
        new_products = (
            facts.loc[needs_uid, ['name', 'key']]
            .drop_duplicates('key')
            .rename(columns={'name': 'product name'})
        )

        # Highest existing UID number; UIDs that are missing or do not follow
        # the ID_<n> pattern are ignored, and 0 is used when none qualify.
        id_numbers = master['UID'].astype(str).str.extract(r'ID_(\d+)')[0].dropna()
        max_id = int(id_numbers.astype(int).max()) if not id_numbers.empty else 0

        new_products['UID'] = [
            f'ID_{i}' for i in range(max_id + 1, max_id + 1 + len(new_products))
        ]

        master = pd.concat(
            [master, new_products[['UID', 'product name', 'key']]], ignore_index=True
        )
        lookup = pd.concat([lookup, new_products[['UID', 'key']]], ignore_index=True)

    # Left join keeps every transaction row, in its original order
    facts = pd.merge(facts, lookup, on='key', how='left').drop(columns='key')
    master = master.drop(columns='key')

    return facts, master

# === Execution ===
# Load the full config and the time of the previous unzip run
config_data = load_config(config_file_path)
last_unzip_timestamp = load_last_unzip_time(config_data)

# Resolve the folders and files this run works with
opener_paths = config_data['file_paths']['opener_paths']
zip_folder = os.path.join(opener_paths['input_root_folder'], "1) ZIPs")  # where the ZIPs are picked up
unzipped_output_root_folder = opener_paths['unzipped_output_folder']  # where archives are extracted to
final_output_root_folder = opener_paths['opened_output_folder']  # where the extracted data is written
all_items_file = config_data['file_paths']['main_paths']['all_item_input_file']  # master item list

# Read the master item list
df_all = pd.read_excel(all_items_file)

# Extract new or changed archives, then record the newest archive time in the config
latest_unzip_timestamp, newly_unzipped_list = unzip_new_or_updated_files(
    zip_folder,
    unzipped_output_root_folder,
    last_unzip_timestamp,
)
update_last_unzip_time(config_file_path, config_data, latest_unzip_timestamp)


In [None]:
# Column order of the two CSV outputs; passing it explicitly keeps the columns
# present even when an export contains no purchases or no items.
_FACT_COLUMNS = ['name', 'quantity', 'price', 'volume', 'timestamp']
_BASKET_COLUMNS = [
    'timestamp', 'type', 'says', 'basketValueGross', 'overallBasketSavings',
    'basketValueNet', 'numberOfItems', 'payment_type', 'payment_category',
    'payment_amount', 'customerId', 'requestId',
]


def _basket_record(purchase, customer_id, request_id):
    """Build the DIM_basket row for one entry of the export's 'purchases' list."""
    # Only the first entry of the payment list is recorded; a missing or empty
    # list yields None for all three payment fields.
    payments = purchase.get('payment') or [{}]
    first_payment = payments[0]
    return {
        'timestamp': purchase.get('timestamp'),
        'type': purchase.get('type'),
        'says': purchase.get('says'),
        'basketValueGross': purchase.get('basketValueGross'),
        'overallBasketSavings': purchase.get('overallBasketSavings'),
        'basketValueNet': purchase.get('basketValueNet'),
        'numberOfItems': purchase.get('numberOfItems'),
        'payment_type': first_payment.get('type'),
        'payment_category': first_payment.get('category'),
        'payment_amount': first_payment.get('amount'),
        'customerId': customer_id,
        'requestId': request_id,
    }


def _item_records(purchase):
    """Build the FACT_transactions rows for every entry of a purchase's 'items' list."""
    basket_timestamp = purchase.get('timestamp')  # items carry the basket-level timestamp
    return [
        {
            'name': item.get('name'),
            'quantity': item.get('quantity'),
            'price': item.get('price'),
            'volume': item.get('volume'),
            'timestamp': basket_timestamp,
        }
        for item in purchase.get('items') or []
    ]


def extract_data(newly_unzipped_list):
    """Convert the JSON export in each unzipped folder into two CSV files.

    For every folder, one ``.json`` file is loaded and flattened into item-level
    rows and basket-level rows. Product UIDs are attached via ``grab_UIDs``
    (which may extend the module-level master list ``df_all``). The results are
    written as ``FACT_transactions.csv`` and ``DIM_basket.csv`` into a sub-folder
    of the module-level ``final_output_root_folder`` named after the JSON file.

    Args:
        newly_unzipped_list: Folders to process, each expected to contain a
            ``.json`` file. If a folder holds several, only the first one
            returned by ``os.listdir`` is used.

    Raises:
        FileNotFoundError: If a folder contains no ``.json`` file.
    """
    # grab_UIDs returns an extended master list that is re-bound below. Without
    # this declaration the assignment would make df_all a local name, and
    # reading it in the same statement would raise UnboundLocalError.
    global df_all

    for unzipped_folder in newly_unzipped_list:

        # Grab the json inside this folder
        json_name = next(
            (f for f in os.listdir(unzipped_folder) if f.endswith('.json')),
            None,
        )
        if json_name is None:
            raise FileNotFoundError(f"No .json file found in {unzipped_folder!r}")

        # JSON text is UTF-8; do not depend on the platform's default encoding
        with open(os.path.join(unzipped_folder, json_name), "r", encoding="utf-8") as f:
            data = json.load(f)

        # Top-level identifiers, repeated on every basket row
        customer_id = data.get('customerId')
        request_id = data.get('requestId')

        # Flatten each purchase into one basket row plus one row per item
        fact_rows = []
        basket_rows = []
        for purchase in data.get('purchases') or []:
            basket_rows.append(_basket_record(purchase, customer_id, request_id))
            fact_rows.extend(_item_records(purchase))

        df_fact_transactions = pd.DataFrame(fact_rows, columns=_FACT_COLUMNS)
        df_dim_basket = pd.DataFrame(basket_rows, columns=_BASKET_COLUMNS)

        # Output folder is named after the JSON file; strip the extension from
        # the file name only so the root folder path is never altered.
        data_output_path = os.path.join(
            final_output_root_folder, os.path.splitext(json_name)[0]
        )

        # Collect UIDs or extend master list if new products
        df_fact_transactions, df_all = grab_UIDs(df_fact_transactions, df_all)

        # Save to CSV
        os.makedirs(data_output_path, exist_ok=True)
        df_fact_transactions.to_csv(os.path.join(data_output_path, "FACT_transactions.csv"), index=False)
        df_dim_basket.to_csv(os.path.join(data_output_path, "DIM_basket.csv"), index=False)