### Necessary Imports

In [39]:
import json
import re
import os
from openai import OpenAI
from dotenv import load_dotenv

### Flatten Nested JSON Lists

This cell defines a function that takes a file containing multiple JSON lists, for example:

```json
[ {...} ], [ {...} ], ...
```
and flattens all contained objects into a single Python list.

In [40]:
def flatted_nested_json(file_path):
    """Read a file of concatenated JSON values and flatten them into one list.

    The file is expected to hold several top-level JSON arrays, e.g.
    ``[ {...} ], [ {...} ], ...``. Top-level values may be separated by commas
    and/or whitespace (including newlines). Arrays are flattened recursively,
    so ``[[ {...} ], [ {...} ]]`` yields the same result; arrays stored
    *inside* an object are left untouched. A top-level value that is not an
    array (e.g. one object per line) is kept as a single entry.

    Values are decoded one at a time with ``json.JSONDecoder.raw_decode``
    rather than by splitting the text on ``],[``, so nested arrays and string
    values that contain brackets or repeated whitespace survive unchanged.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        list: Every entry found in the file, in file order. Empty when the
        file contains no JSON values.

    Raises:
        json.JSONDecodeError: If the file contains text that is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    decoder = json.JSONDecoder()
    separators = " \t\r\n,"
    all_entries = []

    def collect(value):
        # Unwrap any depth of list nesting; everything else is an entry.
        if isinstance(value, list):
            for item in value:
                collect(item)
        else:
            all_entries.append(value)

    pos = 0
    end = len(text)
    while True:
        # raw_decode does not skip leading whitespace, and the top-level
        # values are comma-separated, so step over both manually.
        while pos < end and text[pos] in separators:
            pos += 1
        if pos >= end:
            break

        value, pos = decoder.raw_decode(text, pos)
        collect(value)

    return all_entries


# Load both raw files; each one is flattened into a single list of entries.
plan_prices, brand_migration_deals = (
    flatted_nested_json(source_path)
    for source_path in ("data/plan_prices.jsonl", "data/brand_migration_deals.jsonl")
)

### Clean and Normalize Plan Data
This cell takes the flattened JSON entries and:
- Converts `international_minutes` (inside `calling`) to an integer, or `inf` for unlimited
- Normalizes `offer_expiry` and the price fields (`price_per_month`, `original_price`)
- Replaces missing or N/A values with `None`


In [41]:
def clean_plan_entries(entries):
    """Normalize expiry, price, and international-minutes fields in place.

    For every entry (a dict):

    * ``offer_expiry`` (only when the key is present): placeholder strings
      ("N/A", "NA", "None", empty; case-insensitive) become ``None``.
    * ``price_per_month`` / ``original_price``: the first number found in a
      string, or a numeric value, is stored as ``float``; anything else,
      including a missing key, is stored as ``None``.
    * ``calling["international_minutes"]``: finite numbers become ``int``;
      strings containing "unlimited" become ``float('inf')``; other strings
      yield their first run of digits as ``int``; anything else is ``None``.
      A missing, ``None`` or placeholder ``calling`` is replaced by a new
      dict. A ``calling`` value of any other non-dict type is left as is.

    The function can safely be applied to its own output (for instance when
    the notebook cell is re-run): an already-normalized ``float('inf')`` is
    kept instead of being passed to ``int()``, which would raise
    ``OverflowError``.

    Args:
        entries: List of plan dicts. The dicts are modified in place.

    Returns:
        The same ``entries`` list that was passed in.
    """
    missing_markers = ("N/A", "NA", "", "NONE")

    def normalize_value(val):
        # Map textual "no value" placeholders to None; pass the rest through.
        if isinstance(val, str) and val.strip().upper() in missing_markers:
            return None
        return val

    def parse_price(val):
        # Extract a float price from a string or number, else None.
        if isinstance(val, str):
            match = re.search(r'\d+(?:\.\d+)?', val)
            return float(match.group()) if match else None
        if isinstance(val, (int, float)):
            return float(val)
        return None

    def parse_minutes(val):
        # Convert a minutes value to int, inf (unlimited) or None.
        if isinstance(val, (int, float)):
            if val == float('inf'):
                # Already normalized "unlimited" from a previous pass.
                return float('inf')
            try:
                return int(val)
            except (OverflowError, ValueError):
                # -inf and NaN carry no usable minute count.
                return None
        if isinstance(val, str):
            val_clean = val.strip().lower()
            if "unlimited" in val_clean:
                return float('inf')
            match = re.search(r'\d+', val_clean)
            return int(match.group()) if match else None
        return None

    for obj in entries:
        # --- Normalize offer_expiry ---
        if "offer_expiry" in obj:
            obj["offer_expiry"] = normalize_value(obj["offer_expiry"])

        # --- Normalize price fields ---
        for price_field in ("price_per_month", "original_price"):
            obj[price_field] = parse_price(obj.get(price_field))

        # --- Normalize international_minutes ---
        calling = normalize_value(obj.get("calling"))
        if calling is None:
            calling = {}
        if isinstance(calling, dict):
            calling["international_minutes"] = parse_minutes(
                calling.get("international_minutes")
            )
            obj["calling"] = calling

    return entries


# Normalize both datasets. clean_plan_entries edits the dicts in place and
# returns the list it was given, so each name is rebound to its own list.
plan_prices = clean_plan_entries(plan_prices)
brand_migration_deals = clean_plan_entries(brand_migration_deals)

### Save JSON Objects as JSONL

The `save_jsonl` function takes a list of JSON objects (dictionaries) and writes them to a file in **JSONL format**, where each object appears on a separate line.  

In [42]:
def save_jsonl(entries, file_path=None):
    """Write ``entries`` to ``file_path`` in JSON Lines format.

    Each entry is serialized with ``json.dumps(..., ensure_ascii=False)`` and
    written on its own line. Missing parent directories are created. All
    entries are serialized *before* the file is opened, so a serialization
    error does not truncate or partially overwrite an existing file.

    Note: with ``json.dumps`` defaults, non-finite floats such as
    ``float('inf')`` are written as ``Infinity``/``NaN``, which Python reads
    back but strict JSON parsers reject.

    Args:
        entries: Iterable of JSON-serializable objects.
        file_path: Destination path. Required despite the ``None`` default.

    Returns:
        The ``file_path`` argument, unchanged.

    Raises:
        TypeError: If ``file_path`` is ``None`` or an entry is not
            JSON-serializable.
        OSError: If the directory or file cannot be created or written.
    """
    if file_path is None:
        raise TypeError("save_jsonl() requires a file_path, got None")

    # Serialize first: a bad entry must fail before the target is touched.
    lines = [json.dumps(obj, ensure_ascii=False) + "\n" for obj in entries]

    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    return file_path


# Persist the cleaned entries, one JSON object per line. The second call is
# the cell's last expression, so the path it returns is the displayed output.
save_jsonl(plan_prices, file_path="data/plan_prices_flattened.jsonl")
save_jsonl(brand_migration_deals, file_path="data/brand_migration_deals_flattened.jsonl")

'data/brand_migration_deals_flattened.jsonl'

### Convert Plan JSON to Text
This step uses the OpenAI API to turn each mobile plan's JSON into a clear, customer-friendly text description. The `metadata` field is left out of the prompt.


In [43]:
# Must run before the key is read: load_dotenv() is expected to populate the
# process environment from a local .env file, keeping the key out of the notebook.
load_dotenv()
# None when OPENAI_API_KEY is not set in the environment.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level client; json_to_text reads it as a global.
client = OpenAI(api_key=api_key)

def json_to_text(entry):
    """Ask the chat model for a plain-text description of one plan entry.

    A shallow copy of ``entry`` without its ``"metadata"`` key is serialized
    to JSON, embedded in the user prompt, and sent through the module-level
    ``client``.

    Args:
        entry: Plan dict. It is not modified.

    Returns:
        str: The model's reply with surrounding whitespace removed, or an
        empty string when the reply carries no text content. Returning ""
        instead of raising keeps the output list aligned with the input
        entries when this function is mapped over a dataset.
    """
    entry_for_prompt = entry.copy()
    entry_for_prompt.pop("metadata", None)

    prompt = f"""
    Convert the following mobile plan JSON into a clear, long human-readable description.
    Include all relevant fields like provider, plan name, price, data, calling, texting, streaming, and additional perks. 
    Write it in full sentences suitable for a customer-friendly description.

    JSON:
    {json.dumps(entry_for_prompt, ensure_ascii=False)}
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are an expert at converting structured text into simple factual texts. "
                    "Each entry should be a simple single line. "
                    "No need to write anything extra other than converting the entry to a simple text. "
                    "Do not create separate paragraphs."
                )
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        max_completion_tokens=500
    )

    # The response schema types `content` as optional; guard against None
    # so a single empty reply does not abort the whole batch.
    content = response.choices[0].message.content
    return (content or "").strip()


# One model call per entry; each output list keeps the order of its input list.
plan_prices_texts = list(map(json_to_text, plan_prices))
brand_migrations_texts = list(map(json_to_text, brand_migration_deals))

### Combine Text with Metadata and Save
This step first sanitizes the metadata, converting lists into comma-separated strings and nested dicts into JSON strings. It then merges each generated plan text with its original metadata (adding the provider and plan name) and saves the results as JSONL files.

In [44]:
def sanitize_metadata(metadata):
    """Return a copy of ``metadata`` whose values are all flat.

    List values are joined into one ``", "``-separated string, dict values are
    dumped as JSON text (non-ASCII characters kept), and every other value is
    carried over unchanged. The input mapping is not modified.
    """
    def to_flat(value):
        if isinstance(value, list):
            return ", ".join(map(str, value))
        if isinstance(value, dict):
            return json.dumps(value, ensure_ascii=False)
        return value

    return {key: to_flat(value) for key, value in metadata.items()}


# Flatten the metadata of every entry in both datasets, in place.
for entry in [*plan_prices, *brand_migration_deals]:
    entry["metadata"] = sanitize_metadata(entry.get("metadata", {}))

In [45]:
def text_with_metadata(flattened_entries, text_list):
    """Pair each generated text with a copy of its entry's metadata.

    The i-th text is combined with the i-th entry. Each record's metadata is
    a shallow copy of the entry's ``"metadata"`` (an empty dict when missing
    or ``None``) with ``"provider"`` and ``"plan_name"`` taken from the entry
    itself. The entries are not modified.

    Args:
        flattened_entries: Iterable of plan dicts.
        text_list: Iterable of texts, one per entry, in the same order.

    Returns:
        list[dict]: Records of the form ``{"text": ..., "metadata": {...}}``.

    Raises:
        ValueError: If the two inputs differ in length. A plain ``zip`` would
            silently drop the surplus items and hide a stale or partial
            text list.
    """
    entries = list(flattened_entries)
    texts = list(text_list)
    if len(entries) != len(texts):
        raise ValueError(
            f"Cannot pair {len(entries)} entries with {len(texts)} texts; "
            "both inputs must have the same length."
        )

    combined = []
    for entry, text in zip(entries, texts):
        metadata_copy = (entry.get("metadata") or {}).copy()
        metadata_copy["provider"] = entry.get("provider")
        metadata_copy["plan_name"] = entry.get("plan_name")

        combined.append({
            "text": text,
            "metadata": metadata_copy
        })

    return combined


# Wrap each generated text with its entry's metadata.
# NOTE(review): both names are rebound from lists of strings to lists of
# {"text", "metadata"} records, so re-running this cell without first
# re-running the text-generation cell would wrap the records a second time.
plan_prices_texts = text_with_metadata(plan_prices, plan_prices_texts)
brand_migrations_texts = text_with_metadata(brand_migration_deals, brand_migrations_texts)

# Write the combined records as JSONL. The final call is the cell's last
# expression, so the path it returns is what the notebook displays.
save_jsonl(plan_prices_texts, file_path="data/plan_prices_texts.jsonl")
save_jsonl(brand_migrations_texts, file_path="data/brand_migration_deals_texts.jsonl")

'data/brand_migration_deals_texts.jsonl'