### Necessary Imports

In [39]:
import json
import re
import os

### Flatten Nested JSON Lists

This cell defines a function that takes a file containing multiple JSON lists, for example:

```json
[ {...} ], [ {...} ], ...
```
and flattens all contained objects into a single Python list.

In [36]:
def flatted_nested_json(file_path):
    """Read a file of concatenated JSON values and flatten them into one list.

    The file may contain any mix of top-level JSON values separated by
    whitespace and/or commas, for example ``[ {...} ], [ {...} ]`` or plain
    JSONL with one object per line. Top-level lists (including lists nested
    directly inside other top-level lists) are flattened; every other value is
    appended unchanged. Lists stored *inside* an object are left untouched.

    The text is decoded with a real JSON parser rather than regex splitting, so
    whitespace inside string values and ``], [`` sequences inside objects are
    preserved exactly.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A single flat list with every entry found in the file, in file order.
        An empty or whitespace-only file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file contains malformed JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    all_entries = []

    def collect(value):
        # Unwrap list nesting at the top level only; dicts and scalars are entries.
        if isinstance(value, list):
            for item in value:
                collect(item)
        else:
            all_entries.append(value)

    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while True:
        # raw_decode does not skip leading whitespace, and the separators
        # between top-level values (commas) are not valid JSON on their own.
        while pos < end and (text[pos].isspace() or text[pos] == ','):
            pos += 1
        if pos >= end:
            break

        value, pos = decoder.raw_decode(text, pos)
        collect(value)

    return all_entries


# Load both raw exports; each call returns one flat list of entries.
plan_prices, brand_migration_deals = (
    flatted_nested_json("data/plan_prices.jsonl"),
    flatted_nested_json("data/brand_migration_deals.jsonl"),
)

### Clean and Normalize Plan Data
This cell takes the flattened JSON entries and:
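- Converts `calling.international_minutes` to an integer, or `inf` for unlimited
- Normalizes `offer_expiry` (N/A-like strings become `None`) and converts the price fields `price_per_month` and `original_price` to `float`
- Sets missing or unparseable price and minute values to `None`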


In [37]:
def clean_plan_entries(entries):
    """Normalize expiry, price and international-minute fields of plan entries.

    Each dict in ``entries`` is modified in place:

    * ``offer_expiry`` (only if present): strings equal to N/A, NA, NONE or the
      empty string (case-insensitive, surrounding whitespace ignored) become
      ``None``; anything else is kept.
    * ``price_per_month`` and ``original_price``: always set. Numbers become
      ``float``; strings yield the first number found (thousands separators
      such as ``1,299.50`` are understood); everything else becomes ``None``.
    * ``calling["international_minutes"]``: always set. Numbers become ``int``;
      strings containing "unlimited" become ``float('inf')``; other strings
      yield the first integer found; everything else becomes ``None``.
      A missing or non-dict ``calling`` value is replaced by a new dict.

    The function is safe to run more than once on the same data: values that
    are already normalized (including ``inf``) are left as they are.

    Note: ``float('inf')`` is written by ``json.dumps`` as ``Infinity``, which
    Python's ``json`` module reads back but strict JSON parsers reject.

    Args:
        entries: List of plan dicts, e.g. as returned by the flattening step.

    Returns:
        The same ``entries`` list, after in-place normalization.
    """
    missing_markers = ("N/A", "NA", "", "NONE")

    def normalize_value(val):
        if isinstance(val, str) and val.strip().upper() in missing_markers:
            return None
        return val

    def drop_thousands_separators(text):
        # Remove a comma only when it sits between a digit and a group of
        # exactly three digits, so "1,299.50" -> "1299.50" but "12,99" is kept.
        return re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)

    def parse_price(val):
        if isinstance(val, str):
            match = re.search(r'\d+(?:\.\d+)?', drop_thousands_separators(val))
            return float(match.group()) if match else None
        if isinstance(val, (int, float)):
            return float(val)
        return None

    def parse_minutes(val):
        if isinstance(val, (int, float)):
            if val == float('inf'):
                # Already normalized as "unlimited" by a previous run.
                return float('inf')
            try:
                return int(val)
            except (ValueError, OverflowError):
                # NaN or -inf cannot be represented as an integer.
                return None
        if isinstance(val, str):
            val_clean = val.strip().lower()
            if "unlimited" in val_clean:
                return float('inf')
            match = re.search(r'\d+', drop_thousands_separators(val_clean))
            return int(match.group()) if match else None
        return None

    for obj in entries:
        # --- Normalize offer_expiry ---
        if "offer_expiry" in obj:
            obj["offer_expiry"] = normalize_value(obj["offer_expiry"])

        # --- Normalize price fields ---
        for price_field in ["price_per_month", "original_price"]:
            obj[price_field] = parse_price(obj.get(price_field))

        # --- Normalize international_minutes ---
        calling = obj.get("calling")
        if not isinstance(calling, dict):
            # Key missing, null, or a placeholder value: start from an empty dict.
            calling = {}
        calling["international_minutes"] = parse_minutes(
            calling.get("international_minutes")
        )
        obj["calling"] = calling

    return entries


# Normalize both datasets; the function edits the dicts in place and returns the same list.
plan_prices, brand_migration_deals = (
    clean_plan_entries(plan_prices),
    clean_plan_entries(brand_migration_deals),
)


### Save JSON Objects as JSONL

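The `save_jsonl` function takes a list of JSON objects (dictionaries) and writes them to a file in **JSONL format**, one object per line. It returns the path it wrote to: if the given path does not end in `.jsonl`, its extension (if any) is replaced by the suffix `_flattened.jsonl`, and if no path is given the output goes to `flattened_output.jsonl`.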

In [38]:
def save_jsonl(entries, file_path=None):
    """Write ``entries`` to a JSONL file, one JSON document per line.

    Args:
        entries: Iterable of JSON-serializable objects.
        file_path: Target path. ``None`` writes to ``flattened_output.jsonl``
            in the current directory. A path whose extension is not
            ``.jsonl`` (case-insensitive) has that extension dropped and
            ``_flattened.jsonl`` appended; a ``.jsonl`` path is used as given.

    Returns:
        The path that was actually written.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If an entry is not JSON-serializable.
    """
    if file_path is None:
        file_path = "flattened_output.jsonl"
    else:
        base, ext = os.path.splitext(file_path)
        if ext.lower() != ".jsonl":
            file_path = base + "_flattened.jsonl"

    # Create the target folder on demand so a fresh checkout without the
    # output directory does not fail with FileNotFoundError.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.writelines(
            json.dumps(obj, ensure_ascii=False) + "\n" for obj in entries
        )

    return file_path


# Write both cleaned datasets; paths without a .jsonl extension get "_flattened.jsonl" appended.
save_jsonl(plan_prices, "data/plan_prices")
# Kept as the cell's final bare expression so the notebook displays the written path.
save_jsonl(brand_migration_deals, "data/brand_migration_deals")

'data/brand_migration_deals_flattened.jsonl'