In [None]:
import json
import subprocess
import re
import ollama

# Input and output JSON file names, resolved relative to the working directory.
input_file, output_file = (
    "sri_lanka_safaris_3_items.json",
    "sri_lanka_safaris_newTry.json",
)

# Read the entire input document as UTF-8 text and parse it in one go.
with open(input_file, mode="r", encoding="utf-8") as source:
    data = json.loads(source.read())

# Instruction text sent to the model for every record. It asks for exactly one
# JSON object that keeps the scraped fields and adds an "extracted_features"
# section; the record itself is appended to this text in the processing loop.
# NOTE(review): the rules below refer to "the given rules", but no rule list
# appears in this prompt — confirm whether one was meant to be included.
deepseek_prompt = """
You are a travel-domain knowledge extractor for Sri Lanka safari data.

INPUT: A JSON object containing raw scraped safari information from TripAdvisor.

TASK: Return ONLY a valid JSON object in the exact following enriched format, without any extra text, comments, explanations, or thinking process.

{
  "title": "...",
  "image_links": [...],
  "rating": "...",
  "total_reviews": "...",
  "recommendation": "...",
  "type": "...",
  "travel_duration": "...",
  "description": "...",
  "starting_price": "...",
  "extracted_features": {
    "park_names": [...],
    "primary_wildlife_focus": "...",
    "safari_type_detail": "...",
    "suitability": "...",
    "duration_category": "...",
    "price_category": "...",
    "key_attractions": [...],
    "flexibility_notes": "...",
    "district": "...",
    "latitude": "...",
    "longitude": "..."
  }
}

Follow these rules:
- Preserve all original fields exactly unless fixing typos.
- Populate "extracted_features" using the given rules and your knowledge.
- Output MUST be valid JSON and nothing else.
"""

def _extract_json_object(text):
    """Return the JSON object embedded in a model reply.

    Reasoning models such as deepseek-r1 commonly emit a ``<think>...</think>``
    section before the answer and may wrap the answer in a Markdown code
    fence, so the raw reply is usually not parseable as-is. Everything up to
    the last ``</think>`` tag is discarded, and the span from the first ``{``
    to the last ``}`` of what remains is parsed.

    Args:
        text: Raw reply text from the model.

    Returns:
        The decoded JSON object (a ``dict``).

    Raises:
        json.JSONDecodeError: If no braces are found or the span between
            them is not valid JSON.
    """
    # rpartition leaves the text unchanged when no closing tag is present and
    # also copes with replies that contain only the closing tag.
    answer = text.rpartition("</think>")[2]
    start = answer.find("{")
    end = answer.rfind("}")
    if start == -1 or end < start:
        raise json.JSONDecodeError("No JSON object found in model output", answer, 0)
    return json.loads(answer[start:end + 1])


enriched_data = []

for idx, item in enumerate(data):
    title = item.get('title', '')
    print(f"Processing item {idx+1}/{len(data)}: {title[:60]}")
    full_prompt = deepseek_prompt + "\n\nINPUT JSON:\n" + json.dumps(item, ensure_ascii=False)

    # Step 1: query the model. Any failure here (connection, missing model,
    # unexpected response shape) skips this item but keeps the run going.
    try:
        response = ollama.chat(
            model="deepseek-r1",
            # Sent as a user turn: a conversation holding only a system
            # message gives the model no request to answer.
            messages=[{"role": "user", "content": full_prompt}],
            options={"temperature": 0.0},
            # keep_alive is a request-level argument, not a sampling option;
            # -1 asks the server to keep the model loaded between items.
            keep_alive=-1,
        )
        output_text = response["message"]["content"]
    except Exception as e:
        print(f"❌ An error occurred with Ollama for: {title}: {e}")
        continue

    # Step 2: pull the JSON object out of the reply, ignoring reasoning text
    # and code fences around it.
    try:
        enriched_item = _extract_json_object(output_text)
    except json.JSONDecodeError:
        print(f"❌ Error parsing JSON for: {title}")
        # Show the start of the reply so the cause of the failure is visible.
        print(f"   Raw output begins: {output_text.strip()[:200]!r}")
        continue

    enriched_data.append(enriched_item)
    print(f"completely extracted {idx+1}/{len(data)}")

# Write the collected results to disk, or report that there is nothing to write.
if not enriched_data:
    print("\n⚠ No enriched data to save.")
else:
    with open(output_file, mode="w", encoding="utf-8") as sink:
        json.dump(enriched_data, sink, indent=2, ensure_ascii=False)
    print(f"\n✅ Enriched data saved to {output_file}")

Processing item 1/3: Private Jeep Safari at Minneriya National Park to Visit Elep
❌ Error parsing JSON for: Private Jeep Safari at Minneriya National Park to Visit Elephants
Processing item 2/3: Udawalawe National Park | Private Safari Tour
❌ Error parsing JSON for: Udawalawe National Park | Private Safari Tour
Processing item 3/3: Ella: Transfer to Tangalle/Mirissa/Galle & Yala/Udawalawe Sa
