In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import time # For handling rate limits
import json

# If your data is in a JSON file, load it like this:
with open('../2. Web Scraping/sri_lanka_safaris.json', 'r', encoding='utf-8') as f:
    safari_data = json.load(f)

In [2]:
load_dotenv()

client = OpenAI(
    base_url="https://openrouter.ai/api/v1", # Or DeepSeek's official API base URL if not using OpenRouter
    api_key=os.getenv("OPENROUTER_API_KEY")
)

MODEL_NAME = "deepseek/deepseek-r1-0528:free" # Or "deepseek-ai/deepseek-chat" or "deepseek-ai/deepseek-v2"

def extract_features_with_deepseek(safari_item_dict):
    """
    Sends a single safari item to DeepSeek to extract meaningful features.
    The prompt is designed to ask for a specific JSON output structure.
    """
    # Convert the safari item dictionary to a JSON string for the prompt
    item_json_string = json.dumps(safari_item_dict, indent=2)

    prompt = f"""
    You are an expert travel data analyst. Your task is to extract meaningful features from the provided safari tour data.
    Analyze the following safari tour JSON object and extract the specified features.
    
    Raw Safari Data:
    {item_json_string}
    
    Extract the following features in a JSON object format:
    - **park_names**: A list of national parks mentioned.
    - **primary_wildlife_focus**: The main animal or wildlife type highlighted (e.g., "Elephants", "Leopards", "Birds"). If none, use "General Wildlife".
    - **safari_type_detail**: More specific details about the tour type (e.g., "Private Jeep Safari", "Full-Day Safari", "Group Tour").
    - **suitability**: General suitability (e.g., "Family-Friendly", "Photography Tour", "Adventure", "General").
    - **duration_category**: Categorize duration as "Half-Day" (up to 5 hours), "Full-Day" (5-10 hours), or "Multi-Day" (more than 10 hours or explicit multiple days).
    - **price_category**: Categorize price as "Budget" (under $50), "Mid-Range" ($50-$150), "Premium" (over $150). (Adjust these ranges based on your data).
    - **key_attractions**: A list of specific attractions or highlights mentioned in the description or title (e.g., "Leopard density", "Scenic views", "Bird watching").
    - **flexibility_notes**: Any mention of flexibility or weather-dependent aspects.
    
    Ensure the output is a valid JSON object.
    """

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful travel data analyst assistant that extracts structured information."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0, # Use low temperature for deterministic and consistent output
            response_format={"type": "json_object"} # Crucial for enforcing JSON output
        )
        # DeepSeek's API with response_format="json_object" should return valid JSON
        features_json_str = response.choices[0].message.content
        return json.loads(features_json_str)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response from DeepSeek: {e}")
        print(f"DeepSeek's raw response content: {features_json_str}")
        return {"error": "JSON decoding failed", "raw_response": features_json_str}
    except Exception as e:
        # Catch the specific rate limit error for more precise handling if needed
        if "Rate limit exceeded" in str(e):
            print(f"RATE LIMIT HIT: {e}")
        print(f"An error occurred during DeepSeek API call: {e}")
        return {"error": str(e)}

In [None]:
processed_safari_data = []

try:
    with open('processed_safari_data.json', 'r', encoding='utf-8') as f:
        processed_safari_data = json.load(f)
    print(f"Loaded {len(processed_safari_data)} already processed items.")
except FileNotFoundError:
    print("No existing processed data found. Starting from scratch.")

# Keep track of which items have been processed
processed_titles = {item.get('title') for item in processed_safari_data if 'extracted_features' in item}

for i, item in enumerate(safari_data):
    if item.get('title') in processed_titles:
        print(f"Skipping already processed item {i+1}/{len(safari_data)}: {item.get('title', 'No Title')}")
        continue # Skip to the next item

    print(f"Processing item {i+1}/{len(safari_data)}: {item.get('title', 'No Title')}")
    
    # --- CRITICAL CHANGE HERE ---
    # Set the delay to at least 60 seconds to respect the 1 RPM limit
    # Add a buffer, e.g., 61 seconds, to be safe.
    if i > 0: # Don't sleep before the very first request
        print("Waiting 61 seconds to respect rate limit...")
        time.sleep(61) # Wait for 61 seconds
        print("Resuming processing.")
    
    extracted_features = extract_features_with_deepseek(item)
    
    if "error" not in extracted_features:
        item["extracted_features"] = extracted_features
        processed_safari_data.append(item) # Append only newly processed items if restarting
        print("Features extracted successfully.")
    else:
        print(f"Failed to extract features for this item: {extracted_features['error']}")
        item["extraction_error"] = extracted_features["error"]
        item["raw_deepseek_response_on_error"] = extracted_features.get("raw_response")
        processed_safari_data.append(item) # Still add to list, but mark as error

    # Save progress periodically, especially for long-running tasks
    if (i + 1) % 10 == 0 or (i + 1) == len(safari_data): # Save every 10 items or at the end
        output_filename = 'processed_safari_data.json'
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(processed_safari_data, f, indent=4, ensure_ascii=False)
        print(f"Progress saved to {output_filename}")

print(f"\nProcessing complete. Final processed data saved to {output_filename}")

No existing processed data found. Starting from scratch.
Processing item 1/1101: Private Jeep Safari at Minneriya National Park to Visit Elephants
Features extracted successfully.
Processing item 2/1101: Udawalawe National Park | Private Safari Tour
Waiting 61 seconds to respect rate limit...
Resuming processing.
Features extracted successfully.
Processing item 3/1101: Ella: Transfer to Tangalle/Mirissa/Galle & Yala/Udawalawe Safari
Waiting 61 seconds to respect rate limit...
Resuming processing.
RATE LIMIT HIT: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/deepseek/deepseek-r1-0528/56462fa3-3f6f-4f2f-995d-16a85e4df69e. High demand for deepseek/deepseek-r1-0528:free on OpenRouter - limited to 1 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '1', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1753545780000'}, 'provider_name': None}}, 'user_id': 'user_2ygm1lreZzpw6NY69wy7YdYPdVo'}
An error occurred du

In [None]:
print(processed_safari_data)

[{'title': 'Private Jeep Safari at Minneriya National Park to Visit Elephants', 'image_links': ['https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2c/c8/cb/5e/caption.jpg?w=500&h=400&s=1'], 'rating': '4.9', 'total_reviews': '222', 'recommendation': 'Recommended by 98% of travelers', 'type': '4WD Tours', 'travel_duration': '3–4 hours', 'description': 'Attention: According to the weather conditions, Minneriya National Park, and Kaudulla NP are the best these days. You can…', 'starting_price': '$35', 'extracted_features': {'park_names': ['Minneriya National Park', 'Kaudulla NP'], 'primary_wildlife_focus': 'Elephants', 'safari_type_detail': 'Private Jeep Safari', 'suitability': 'General', 'duration_category': 'Half-Day', 'price_category': 'Budget', 'key_attractions': ['Elephant sightings'], 'flexibility_notes': 'Tour selection adjusted according to weather conditions (Minneriya/Kaudulla NP)'}}, {'title': 'Udawalawe National Park | Private Safari Tour', 'image_links': ['https://dynami