In [2]:
file_path = 'sri_lanka_safaris_5_items.json'

In [None]:
import json
import time
from ollama import Client
import re

# === Load input JSON file ===
with open(file_path, 'r', encoding='utf-8') as f:
    safari_data = json.load(f)

# === Initialize Ollama client ===
client = Client()

# === Prompt Template ===
def build_prompt(item):
    return f"""
You are an expert travel NLP assistant. Given the following tour information, extract the following features in a clean **JSON object format only**, without any explanation or extra text.

Required fields:
- park_names: A list of national parks mentioned.
- primary_wildlife_focus: The main animal or wildlife type highlighted (e.g., "Elephants", "Leopards", "Birds"). If none, use "General Wildlife".
- safari_type_detail: Specific details about the tour type (e.g., "Private Jeep Safari", "Full-Day Safari", "Group Tour").
- suitability: General suitability (e.g., "Family-Friendly", "Photography Tour", "Adventure", "General").
- duration_category: Categorize duration as "Half-Day" (up to 5 hours), "Full-Day" (5–10 hours), or "Multi-Day" (more than 10 hours or multiple days).
- price_category: Categorize price as "Budget" (under $50), "Mid-Range" ($50–$150), "Premium" (over $150).
- key_attractions: A list of specific attractions or highlights mentioned.
- flexibility_notes: Any mention of flexibility or weather-dependent aspects.

Tour Information:
Title: {item.get("title", "")}
Duration: {item.get("travel_duration", "")}
Description: {item.get("description", "")}
Starting Price: {item.get("starting_price", "")}

Important: Output ONLY a valid JSON object. Do NOT explain your reasoning. Do NOT include markdown, backticks, or notes.
"""


# === Function to extract features ===
def extract_features(item, retries=2):
    prompt = build_prompt(item)
    for attempt in range(retries):
        try:
            response = client.chat(model='deepseek-r1', messages=[
                {"role": "user", "content": prompt}
            ])
            raw_output = response['message']['content'].strip()

            # DEBUG: print output for inspection
            print("\n[DEBUG] Raw model output:")
            print(raw_output[:300])  # just show first 300 chars

            # Simple cleanup if response is not valid JSON
            cleaned_output = re.sub(r'```json|```', '', raw_output).strip()

            # Try loading as JSON
            return json.loads(cleaned_output)

        except json.JSONDecodeError as e:
            print(f"[ERROR] JSONDecodeError on attempt {attempt+1}: {e}")
            print(f"[DEBUG] Raw response that failed:\n{raw_output}\n")
            time.sleep(1)
        except Exception as e:
            print(f"[ERROR] Unexpected error: {e}")
            time.sleep(1)

    return None  # Give up after retries

# === Main Loop ===
enriched_data = []
for idx, item in enumerate(safari_data):
    print(f"Processing item {idx+1}/{len(safari_data)}: {item.get('title', '')[:60]}")
    features = extract_features(item)
    if features:
        item['extracted_features'] = features
        enriched_data.append(item)
    time.sleep(1.2)  # slight delay to avoid overwhelming Ollama backend

# === Save final enriched JSON ===
with open('sri_lanka_safaries_enriched.json', 'w', encoding='utf-8') as f:
    json.dump(enriched_data, f, indent=2, ensure_ascii=False)

print("✅ Extraction complete. Output saved to 'sri_lanka_safaries_10_items_enriched.json'.")

Processing item 1/10: Private Jeep Safari at Minneriya National Park to Visit Elep

[DEBUG] Raw model output:
<think>
Okay, let me process this request carefully. The user wants a clean JSON output with specific tour features extracted from the given information.

First, I'm identifying all required fields to extract:
- park_names: clearly mentions Minneriya National Park and Kaudulla NP in the description

[ERROR] JSONDecodeError on attempt 1: Expecting value: line 1 column 1 (char 0)
[DEBUG] Raw response that failed:
<think>
Okay, let me process this request carefully. The user wants a clean JSON output with specific tour features extracted from the given information.

First, I'm identifying all required fields to extract:
- park_names: clearly mentions Minneriya National Park and Kaudulla NP in the description
- primary_wildlife_focus: straightforward - it specifically says "Visit Elephants"
- safari_type_detail: the title indicates a Private Jeep Safari
- suitability: not explicitl

In [5]:
import json
import time
import re
from ollama import Client

# === Load input JSON file ===
with open('sri_lanka_safaris_5_items.json', 'r', encoding='utf-8') as f:
    safari_data = json.load(f)

# === Connect to Ollama ===
client = Client()

# === Prompt Template ===
def build_prompt(item):
    return f"""
You are an expert travel NLP assistant. Your task is to extract the following fields in valid JSON format from the provided tour information.

Required fields:
- park_names: A list of national parks mentioned.
- primary_wildlife_focus: The main animal or wildlife type highlighted (e.g., "Elephants", "Leopards", "Birds"). If none, use "General Wildlife".
- safari_type_detail: Specific details about the tour type (e.g., "Private Jeep Safari", "Full-Day Safari", "Group Tour").
- suitability: General suitability (e.g., "Family-Friendly", "Photography Tour", "Adventure", "General").
- duration_category: Categorize duration as "Half-Day" (up to 5 hours), "Full-Day" (5–10 hours), or "Multi-Day" (more than 10 hours or multiple days).
- price_category: Categorize price as "Budget" (under $50), "Mid-Range" ($50–$150), or "Premium" (over $150).
- key_attractions: A list of specific attractions or highlights mentioned.
- flexibility_notes: Any mention of flexibility or weather-dependent aspects.

Tour Information:
Title: {item.get("title", "")}
Duration: {item.get("travel_duration", "")}
Description: {item.get("description", "")}
Starting Price: {item.get("starting_price", "")}

Important: Respond ONLY with a valid JSON object. Do NOT include explanations, markdown, or any other text.
"""

# === Extract first valid JSON object from text ===
def extract_json_block(text):
    try:
        json_match = re.search(r'\{.*\}', text, re.DOTALL)
        if json_match:
            return json.loads(json_match.group())
    except json.JSONDecodeError as e:
        print("[ERROR] JSON decoding failed:", e)
    return None

# === Core Extraction Function ===
def extract_features(item, retries=2):
    prompt = build_prompt(item)
    for attempt in range(retries):
        try:
            response = client.chat(model='deepseek-r1', messages=[
                {"role": "user", "content": prompt}
            ])
            raw_output = response['message']['content'].strip()

            # Extract and parse valid JSON object
            features = extract_json_block(raw_output)
            if features:
                # Normalize types if any field returned as a list
                for key in ['safari_type_detail', 'suitability', 'flexibility_notes']:
                    if isinstance(features.get(key), list):
                        features[key] = features[key][0] if features[key] else ""

                return features

        except Exception as e:
            print(f"[ERROR] Extraction failed: {e}")
        time.sleep(1)  # slight delay between retries

    return None

# === Main Loop ===
enriched_data = []

for idx, item in enumerate(safari_data):
    print(f"\n🔍 Processing item {idx+1}/{len(safari_data)}: {item.get('title', '')[:80]}")
    features = extract_features(item)

    if features:
        item['extracted_features'] = features
        enriched_data.append(item)
        print("✅ Extracted Features:", features)
    else:
        print("❌ Failed to extract features. Skipping this item.")

    time.sleep(1.5)

# === Save enriched output ===
with open('sri_lanka_safaries_enriched.json', 'w', encoding='utf-8') as f:
    json.dump(enriched_data, f, indent=2, ensure_ascii=False)

print("\n✅ All items processed. Enriched data saved to 'sri_lanka_safaries_enriched_5_items.json'.")



🔍 Processing item 1/3: Private Jeep Safari at Minneriya National Park to Visit Elephants
✅ Extracted Features: {'park_names': ['Minneriya National Park'], 'primary_wildlife_focus': 'Elephants', 'safari_type_detail': 'Private Jeep Safari', 'suitability': 'General', 'duration_category': 'Full-Day', 'price_category': 'Budget', 'key_attractions': [], 'flexibility_notes': 'Weather-dependent aspects'}

🔍 Processing item 2/3: Udawalawe National Park | Private Safari Tour
✅ Extracted Features: {'park_names': ['Udawalawe National Park'], 'primary_wildlife_focus': 'Elephants', 'safari_type_detail': 'Private Safari Tour', 'suitability': 'General', 'duration_category': 'Full-Day', 'price_category': 'Budget', 'key_attractions': ['Elephants', 'Leopards', 'Jungle Cats', 'Monkeys'], 'flexibility_notes': ''}

🔍 Processing item 3/3: Ella: Transfer to Tangalle/Mirissa/Galle & Yala/Udawalawe Safari
✅ Extracted Features: {'park_names': ['Yala National Park', 'Udawalawe National Park'], 'primary_wildlife_f