In [None]:
# dynamically convert the data into that format.

In [456]:
from langchain.chains import create_extraction_chain_pydantic
from typing import List, Optional, Literal, Union
from pydantic import BaseModel, Field, HttpUrl
from enum import Enum
from neo4j import GraphDatabase
import uuid
import json
import re
import google.generativeai as genai
from google import genai
from google.genai import types
from google.genai.types import GenerateContentConfig, Part, SafetySetting
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import Optional, List
from pydantic import BaseModel, Field, HttpUrl
from typing_extensions import Annotated, TypedDict

In [407]:
class InternationalFeatures(BaseModel):
    roaming: str
    throttle_speed: str
    intl_calling: str
    high_speed_data_gb: Optional[float]


class Fee(BaseModel):
    name: str
    amount: str
    additional_information: str


class Fees(BaseModel):
    additional_fees: List[Fee]
    one_time_fees: List[Fee]
    early_termination: List[Fee]
    regulatory_cost_recovery: List[Fee]
    government_taxes: List[Fee]
    federal_universal_fund_fee: List[Fee]


class SpeedProfile(BaseModel):
    network_type: str
    typical_download_speed: str
    typical_upload_speed: str
    typical_latency: str


class MobileHotspot(BaseModel):
    throttled_speed_5g: str
    throttled_speed: str
    premium_data_gb: Optional[float]


class Perk(BaseModel):
    description: str
    type: str


class Plan(BaseModel):
    plan_name: str
    unlimited_talk_text_data: bool
    international_features: InternationalFeatures
    fees: Fees
    speeds: List[SpeedProfile]
    upgrade_offer: Optional[str]
    mobile_hotspot: MobileHotspot
    perks_included: List[Perk]
    streaming_quality: Optional[str]
    price_lock_years: Optional[int]
    pricing_tier_price_per_line: Optional[float]
    additional_data_charges: Optional[str]
    pricing_tier_num_lines: Optional[int]
    data_included: Optional[str]
    monthly_price: Optional[float]
    network_access: List[str]
    pricing_tier_price_notes: List[str]
    bring_your_own_device_offer: Optional[str]
    premium_data: Optional[bool]
    pricing_tier_span_months: Optional[int]
    price_with_auto_pay: Optional[bool]
    connected_device_discount: Optional[str]
    discounts_and_bundles_link: Optional[str]


class CustomerSupport(BaseModel):
    phone_number: Optional[str]
    website: Optional[str]


class NetworkPolicy(BaseModel):
    title: Optional[str]
    policy_link: Optional[str]


class TelecomData(BaseModel):
    Provider: str
    plan: Plan
    customer_support: CustomerSupport
    network_policy: NetworkPolicy

In [409]:
import os
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "<API KEY>"

In [554]:
f = open("straighttalk_platinum_unlimited.txt")
# f = open("straighttalk_platinum_unlimited.txt")
# f = open("tracphone_unlimited_talk.txt")

input_text_plan_details = f.read()
print(input_text_plan_details)


Plan Name: Platinum Unlimited
Provider: Straight Talk Wireless
Price: $65.00 per month (not an introductory rate)
Contract: No contract required
Activation Fee: $0.00
Provider Monthly Fees: $0.00
Early Termination Fee: $0.00
One-Time Fees at Purchase:

Federal Universal Service Fund: $0.37

Regulatory Cost Recovery: $0.13

Government Taxes: Varies by location

Discounts & Bundles:

Discounts and bundle options are available, including broadband bundled with other services such as video, phones, wireless, or using your own equipment (modem/router). Details can be found here.

Speeds Provided with Plan:

5G Ultra Wideband:

Typical Download Speed: 195-634 Mbps

Typical Upload Speed: 10-53 Mbps

Typical Latency: 37-57 ms

5G:

Typical Download Speed: 35-143 Mbps

Typical Upload Speed: 5-31 Mbps

Typical Latency: 42-64 ms

4G LTE:

Typical Download Speed: 11-75 Mbps

Typical Upload Speed: 1-13 Mbps

Typical Latency: 49-75 ms

Data Included: Unlimited data included with the monthly price; n

In [556]:
improved_system_message = """You are an assistant that converts broadband plan descriptions into structured JSON data.

- Use the Pydantic schema provided separately to map all fields correctly.
- Carefully extract relevant information from the input text.
- Populate all required fields according to the schema.
- Include optional fields only if the information is present.
- Use exact enum values where applicable.
- Maintain proper data types (numbers, strings, lists, booleans).
- Output strictly valid JSON conforming to the schema.
- Do not add explanations, comments, or any extra text—only output the JSON.
"""

In [558]:
# Initialize the model
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.1)
structured_llm = llm.with_structured_output(TelecomData)

messages = [
    ("system", improved_system_message),
    ("human", input_text_plan_details),
]
# Invoke the model with a query asking for structured information
catalog_result = structured_llm.invoke(messages)

In [559]:
catalog_result

TelecomData(Provider='Straight Talk Wireless', plan=Plan(plan_name='Platinum Unlimited', unlimited_talk_text_data=True, international_features=InternationalFeatures(roaming='N/A', throttle_speed='N/A', intl_calling='N/A', high_speed_data_gb=None), fees=Fees(additional_fees=[], one_time_fees=[], early_termination=[Fee(name='Early Termination Fee', amount='0.00', additional_information='Early Termination Fee')], regulatory_cost_recovery=[Fee(name='Regulatory Cost Recovery', amount='0.13', additional_information='Regulatory Cost Recovery')], government_taxes=[Fee(name='Government Taxes', amount='Varies by location', additional_information='Government Taxes')], federal_universal_fund_fee=[Fee(name='Federal Universal Service Fund', amount='0.37', additional_information='Federal Universal Service Fund')]), speeds=[SpeedProfile(network_type='5G Ultra Wideband', typical_download_speed='195-634 Mbps', typical_upload_speed='10-53 Mbps', typical_latency='37-57 ms'), SpeedProfile(network_type='5G'

In [560]:
# Convert the result (which is a Pydantic model instance) to a JSON string
json_output = json.dumps(catalog_result.model_dump(), indent=2)

# Print or return the JSON
print(json_output)


{
  "Provider": "Straight Talk Wireless",
  "plan": {
    "plan_name": "Platinum Unlimited",
    "unlimited_talk_text_data": true,
    "international_features": {
      "roaming": "N/A",
      "throttle_speed": "N/A",
      "intl_calling": "N/A",
      "high_speed_data_gb": null
    },
    "fees": {
      "additional_fees": [],
      "one_time_fees": [],
      "early_termination": [
        {
          "name": "Early Termination Fee",
          "amount": "0.00",
          "additional_information": "Early Termination Fee"
        }
      ],
      "regulatory_cost_recovery": [
        {
          "name": "Regulatory Cost Recovery",
          "amount": "0.13",
          "additional_information": "Regulatory Cost Recovery"
        }
      ],
      "government_taxes": [
        {
          "name": "Government Taxes",
          "amount": "Varies by location",
          "additional_information": "Government Taxes"
        }
      ],
      "federal_universal_fund_fee": [
        {
          "n

In [564]:
file_path = "straighttalk_platinum_unlimited.json"
with open(file_path, "w") as file:
    json.dump(json_output, file, indent=2)

In [566]:
from typing import List
from uuid import uuid4


def escape(s: str) -> str:
    return s.replace('"', '\\"').replace("'", "\\'")


def create_node(label: str, props: dict, uid_field: str = None) -> str:
    props_cleaned = {k: v for k, v in props.items() if v is not None}
    if uid_field and uid_field not in props_cleaned:
        props_cleaned[uid_field] = str(uuid4())
    prop_str = ', '.join(f'{k}: {repr(v)}' for k, v in props_cleaned.items())
    return f'MERGE (:{label} {{{prop_str}}})'


def create_rel(from_label: str, from_key: str, to_label: str, to_key: str, rel: str, rel_props=None) -> str:
    rel_props_str = ""
    if rel_props:
        rel_props_str = " {" + ", ".join(f"{k}: {repr(v)}" for k, v in rel_props.items()) + "}"
    return (
        f'MATCH (a:{from_label} {{{from_key}}}), (b:{to_label} {{{to_key}}})\n'
        f'MERGE (a)-[:{rel}{rel_props_str}]->(b)'
    )

def format_props_for_match(props: dict) -> str:
    # Remove None values
    filtered = {k: v for k, v in props.items() if v is not None}
    if not filtered:
        return ""
    
    # Build Cypher style props: {key1: 'value1', key2: 123, ...}
    props_list = []
    for k, v in filtered.items():
        if isinstance(v, str):
            # wrap strings in single quotes, escape inner quotes
            val = v.replace("'", "\\'")
            props_list.append(f"{k}: '{val}'")
        else:
            # for numbers, bools etc.
            props_list.append(f"{k}: {v}")
    return "{" + ", ".join(props_list) + "}"
    
def create_customersupport_network_rel(from_label, from_props, to_label, to_props, rel, rel_props=None):
    from_str = format_props_for_match(from_props)
    to_str = format_props_for_match(to_props)
    rel_props_str = ""
    if rel_props:
        filtered_rel_props = {k: v for k, v in rel_props.items() if v is not None}
        if filtered_rel_props:
            rel_props_str = " {" + ", ".join(f"{k}: {repr(v)}" for k, v in filtered_rel_props.items()) + "}"
    return f"MATCH (a:{from_label} {from_str}), (b:{to_label} {to_str})\nCREATE (a)-[:{rel}{rel_props_str}]->(b)" 
    
def generate_neo4j_queries(data: TelecomData) -> List[str]:
    queries = []

    # Provider
    queries.append(create_node("Provider", {"name": data.Provider}))

    # Plan
    plan_props = data.plan.dict(exclude={"international_features", "fees", "speeds", "mobile_hotspot", "perks_included"})
    queries.append(create_node("Plan", {"plan_name": plan_props["plan_name"], **plan_props}))

    queries.append(create_rel("Provider", f'name: {repr(data.Provider)}', "Plan", f'plan_name: {repr(data.plan.plan_name)}', "OFFERS"))

    # InternationalFeatures
    queries.append(create_node("InternationalFeatures", data.plan.international_features.dict()))
    queries.append(create_rel("Plan", f'plan_name: {repr(data.plan.plan_name)}', "InternationalFeatures", f'roaming: {repr(data.plan.international_features.roaming)}', "HAS_INTERNATIONAL_FEATURES"))

    # MobileHotspot
    queries.append(create_node("MobileHotspot", data.plan.mobile_hotspot.dict()))
    queries.append(create_rel("Plan", f'plan_name: {repr(data.plan.plan_name)}', "MobileHotspot", f'throttled_speed: {repr(data.plan.mobile_hotspot.throttled_speed)}', "HAS_MOBILE_HOTSPOT"))

    # SpeedProfiles
    for speed in data.plan.speeds:
        queries.append(create_node("SpeedProfile", speed.dict()))
        queries.append(create_rel("Plan", f'plan_name: {repr(data.plan.plan_name)}', "SpeedProfile", f'network_type: {repr(speed.network_type)}', "INCLUDES_SPEED_PROFILE"))

    # Perks
    for perk in data.plan.perks_included:
        queries.append(create_node("Perk", perk.dict()))
        queries.append(create_rel("Plan", f'plan_name: {repr(data.plan.plan_name)}', "Perk", f'description: {repr(perk.description)}', "INCLUDES_PERK"))

    # Fees
    for fee_type, fee_list in data.plan.fees.dict().items():
        for fee in fee_list:
            queries.append(create_node("Fee", {**fee, "type": fee_type}))
            queries.append(create_rel("Plan", f'plan_name: {repr(data.plan.plan_name)}', "Fee", f'name: {repr(fee["name"])}', "HAS_FEE", {"type": fee_type}))

    # Customer Support
    queries.append(create_node("CustomerSupport", data.customer_support.dict()))
    create_customersupport_network_rel(
        "Plan",
        {"plan_name": data.plan.plan_name},
        "CustomerSupport",
        {"phone_number": data.customer_support.phone_number},
        "HAS_CUSTOMER_SUPPORT"
    )
    # Network Policy
    queries.append(create_node("NetworkPolicy", data.network_policy.dict()))
    queries.append(
        create_customersupport_network_rel(
            "Plan",
            {"plan_name": data.plan.plan_name},
            "NetworkPolicy",
            {"title": data.network_policy.title},
            "HAS_NETWORK_POLICY"
        )
    )

    return queries


In [568]:
with open("straighttalk_platinum_unlimited.json") as f:
    plan_data = json.load(f)
print(plan_data)

{
  "Provider": "Straight Talk Wireless",
  "plan": {
    "plan_name": "Platinum Unlimited",
    "unlimited_talk_text_data": true,
    "international_features": {
      "roaming": "N/A",
      "throttle_speed": "N/A",
      "intl_calling": "N/A",
      "high_speed_data_gb": null
    },
    "fees": {
      "additional_fees": [],
      "one_time_fees": [],
      "early_termination": [
        {
          "name": "Early Termination Fee",
          "amount": "0.00",
          "additional_information": "Early Termination Fee"
        }
      ],
      "regulatory_cost_recovery": [
        {
          "name": "Regulatory Cost Recovery",
          "amount": "0.13",
          "additional_information": "Regulatory Cost Recovery"
        }
      ],
      "government_taxes": [
        {
          "name": "Government Taxes",
          "amount": "Varies by location",
          "additional_information": "Government Taxes"
        }
      ],
      "federal_universal_fund_fee": [
        {
          "n

In [570]:
plan_data

'{\n  "Provider": "Straight Talk Wireless",\n  "plan": {\n    "plan_name": "Platinum Unlimited",\n    "unlimited_talk_text_data": true,\n    "international_features": {\n      "roaming": "N/A",\n      "throttle_speed": "N/A",\n      "intl_calling": "N/A",\n      "high_speed_data_gb": null\n    },\n    "fees": {\n      "additional_fees": [],\n      "one_time_fees": [],\n      "early_termination": [\n        {\n          "name": "Early Termination Fee",\n          "amount": "0.00",\n          "additional_information": "Early Termination Fee"\n        }\n      ],\n      "regulatory_cost_recovery": [\n        {\n          "name": "Regulatory Cost Recovery",\n          "amount": "0.13",\n          "additional_information": "Regulatory Cost Recovery"\n        }\n      ],\n      "government_taxes": [\n        {\n          "name": "Government Taxes",\n          "amount": "Varies by location",\n          "additional_information": "Government Taxes"\n        }\n      ],\n      "federal_universal

In [572]:
from neo4j import GraphDatabase
from typing import List

class Neo4jUploader:
    def __init__(self, uri: str, user: str, password: str):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def run_queries(self, queries: List[str]):
        with self.driver.session() as session:
            for query in queries:
                try:
                    session.run(query)
                    print("✅ Executed:", query.split("\n")[0])  # show only the first line for brevity
                except Exception as e:
                    print("❌ Error executing query:", query)
                    print(e)

In [574]:
# Step 1: Parse JSON string or dict into the TelecomData model
if isinstance(plan_data, str):
    data = TelecomData.parse_raw(plan_data)
elif isinstance(plan_data, dict):
    data = TelecomData(**plan_data)
else:
    data = plan_data  # already a TelecomData model

cypher_queries = generate_neo4j_queries(data)
print(cypher_queries)

["MERGE (:Provider {name: 'Straight Talk Wireless'})", "MERGE (:Plan {plan_name: 'Platinum Unlimited', unlimited_talk_text_data: True, streaming_quality: 'N/A', additional_data_charges: 'no additional charges for extra data usage.', data_included: 'Unlimited data included with the monthly price', monthly_price: 65.0, network_access: ['5G Ultra Wideband', '5G', '4G LTE'], pricing_tier_price_notes: [], premium_data: True, price_with_auto_pay: False, discounts_and_bundles_link: 'Details can be found here.'})", "MATCH (a:Provider {name: 'Straight Talk Wireless'}), (b:Plan {plan_name: 'Platinum Unlimited'})\nMERGE (a)-[:OFFERS]->(b)", "MERGE (:InternationalFeatures {roaming: 'N/A', throttle_speed: 'N/A', intl_calling: 'N/A'})", "MATCH (a:Plan {plan_name: 'Platinum Unlimited'}), (b:InternationalFeatures {roaming: 'N/A'})\nMERGE (a)-[:HAS_INTERNATIONAL_FEATURES]->(b)", "MERGE (:MobileHotspot {throttled_speed_5g: 'N/A', throttled_speed: 'N/A'})", "MATCH (a:Plan {plan_name: 'Platinum Unlimited'

C:\Users\krish\AppData\Local\Temp\ipykernel_4536\1185265585.py:3: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  data = TelecomData.parse_raw(plan_data)
C:\Users\krish\AppData\Local\Temp\ipykernel_4536\1693282815.py:61: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  plan_props = data.plan.dict(exclude={"international_features", "fees", "speeds", "mobile_hotspot", "perks_included"})
C:\Users\krish\AppData\Local\Temp\ipykernel_4536\1693282815.py:67: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be 

In [577]:
# Replace with your Neo4j credentials
NEO4J_URI = "<NEO4J_URI>"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "<NEO4J_API_KEY>"

# driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [579]:
uploader = Neo4jUploader(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
try:
    uploader.run_queries(cypher_queries)
    uploader.close()
    print("Upload complete.")
finally:
    uploader.close()


✅ Executed: MERGE (:Provider {name: 'Straight Talk Wireless'})
✅ Executed: MERGE (:Plan {plan_name: 'Platinum Unlimited', unlimited_talk_text_data: True, streaming_quality: 'N/A', additional_data_charges: 'no additional charges for extra data usage.', data_included: 'Unlimited data included with the monthly price', monthly_price: 65.0, network_access: ['5G Ultra Wideband', '5G', '4G LTE'], pricing_tier_price_notes: [], premium_data: True, price_with_auto_pay: False, discounts_and_bundles_link: 'Details can be found here.'})
✅ Executed: MATCH (a:Provider {name: 'Straight Talk Wireless'}), (b:Plan {plan_name: 'Platinum Unlimited'})
✅ Executed: MERGE (:InternationalFeatures {roaming: 'N/A', throttle_speed: 'N/A', intl_calling: 'N/A'})
✅ Executed: MATCH (a:Plan {plan_name: 'Platinum Unlimited'}), (b:InternationalFeatures {roaming: 'N/A'})
✅ Executed: MERGE (:MobileHotspot {throttled_speed_5g: 'N/A', throttled_speed: 'N/A'})
✅ Executed: MATCH (a:Plan {plan_name: 'Platinum Unlimited'}), (b:M