# Test Crawl4AI Web Search Handler

This notebook tests the `search_movie_plot_info`, `search_movie_curiosities_info`, `search_movie_reviews`, and `save_movie_to_txt` functions from `crawl4ai_web_scraping.py`.

In [1]:
import asyncio
import platform
import sys
from pathlib import Path

import nest_asyncio

# Set the event loop policy BEFORE applying nest_asyncio.
if platform.system() == 'Windows':
    try:
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    except Exception as e:
        # Best effort: report the problem and continue with the default policy.
        print(f"Note: Could not set WindowsSelectorEventLoopPolicy - {e}")

# Patch asyncio so that event loops can be nested. Top-level `await` already
# works in Jupyter without this; the patch is for code that calls
# asyncio.run() while the kernel's loop is running (save_movie_to_txt is
# described as doing so in the save cell below).
nest_asyncio.apply()

# Directory that is expected to contain crawl4ai_web_scraping.py.
# NOTE(review): this is a machine-specific absolute path - adjust it for your
# checkout before running the notebook.
WEB_SEARCH_DIR = "D:\\Internship\\recsys\\back_end\\web_search"

# Guarded so that re-running this cell does not keep appending duplicates.
if WEB_SEARCH_DIR not in sys.path:
    sys.path.append(WEB_SEARCH_DIR)

# Import the async functions
from crawl4ai_web_scraping import (
    search_movie_plot_info,
    search_movie_curiosities_info,
    search_movie_reviews,
    save_movie_to_txt,
)

## Define Test Parameters

In [2]:
# Movie used by every test cell below; pick a well-known title so that the
# scraped sources are likely to have data for it.
test_movie_title = "The Matrix"

# Upper bounds passed as `num_results` to the plot, curiosities and review
# searches, in that order.
num_plot_results, num_curiosity_results, num_review_results = 2, 3, 3

## Test `search_movie_plot_info`

In [3]:
print(f"--- Searching for plot of '{test_movie_title}' (max {num_plot_results} results) ---")
# Use await directly (enabled by nest_asyncio)
plot_results = await search_movie_plot_info(test_movie_title, num_results=num_plot_results)

if plot_results:
    print(f"Found {len(plot_results)} plot results:")
    for i, item in enumerate(plot_results):
        print(f"\nResult {i+1}:")
        assert isinstance(item, dict), f"Result {i+1} is not a dictionary."
        assert 'url' in item, f"'url' key missing in result {i+1}."
        assert 'content' in item, f"'content' key missing in result {i+1}."
        print(f"  URL: {item['url']}")
        print(f"  Content Snippet (Plot): {item['content'][:200]}...") # Print a shorter snippet
else:
    print("No plot results found or an error occurred.")

--- Searching for plot of 'The Matrix' (max 2 results) ---
Attempting to fetch plot from IMDb: https://www.imdb.com/title/tt0133093/plotsummary/
[INIT].... → Crawl4AI 0.6.1
[FETCH]... ↓ https://www.imdb.com/title/tt0133093/plotsummary/                                                    | ✓ | ⏱: 1.17s
[SCRAPE].. ◆ https://www.imdb.com/title/tt0133093/plotsummary/                                                    | ✓ | ⏱: 0.05s
[COMPLETE] ● https://www.imdb.com/title/tt0133093/plotsummary/                                                    | ✓ | ⏱: 1.23s
Found section with title: 'Synopsis'
Successfully extracted plot using Description/Synopsis section strategy (UL/LI).
Successfully extracted plot from IMDb.
Found 1 plot results:

Result 1:
  URL: https://www.imdb.com/title/tt0133093/plotsummary/
  Content Snippet (Plot): In 1999, in an unnamed city, Computer programmer Thomas Anderson (Keanu Reeves) is secretly a hacker known as "Neo". He is restless, eager and driven to learn the mean

## Test `search_movie_curiosities_info`

In [4]:
print(f"\n--- Searching for curiosities/info on '{test_movie_title}' (max {num_curiosity_results} results) ---")
# Use await directly (enabled by nest_asyncio)
curiosity_results = await search_movie_curiosities_info(test_movie_title, num_results=num_curiosity_results)

if curiosity_results:
    print(f"Found {len(curiosity_results)} curiosity/info results:")
    for i, item in enumerate(curiosity_results):
        print(f"\nResult {i+1}:")
        assert isinstance(item, dict), f"Result {i+1} is not a dictionary."
        assert 'url' in item, f"'url' key missing in result {i+1}."
        assert 'content' in item, f"'content' key missing in result {i+1}."
        print(f"  URL: {item['url']}")
        print(f"  Content Snippet (Curiosity/Info): {item['content'][:200]}...") # Print a shorter snippet
else:
    print("No curiosity/info results found or an error occurred.")


--- Searching for curiosities/info on 'The Matrix' (max 3 results) ---
Attempting to find trivia on triviaforyou.com with query: "The Matrix" trivia site:triviaforyou.com
Found potential triviaforyou.com URL: https://triviaforyou.com/matrix-trivia/
[INIT].... → Crawl4AI 0.6.1
[FETCH]... ↓ https://triviaforyou.com/matrix-trivia/                                                              | ✓ | ⏱: 0.71s
[SCRAPE].. ◆ https://triviaforyou.com/matrix-trivia/                                                              | ✓ | ⏱: 0.02s
[COMPLETE] ● https://triviaforyou.com/matrix-trivia/                                                              | ✓ | ⏱: 0.74s
Found 'entry-content' div on triviaforyou.com.
Successfully extracted trivia from triviaforyou.com.
Found 1 curiosity/info results:

Result 1:
  URL: https://triviaforyou.com/matrix-trivia/
  Content Snippet (Curiosity/Info): The Matrix is one of the most famous movies of all time. Released in 1999, it changed how people think about 

## Test `search_movie_reviews`

In [5]:
print(f"\n--- Searching for reviews of '{test_movie_title}' (max {num_review_results} results) ---")
# Use await directly (enabled by nest_asyncio)
review_results = await search_movie_reviews(test_movie_title, num_results=num_review_results)

if review_results:
    print(f"Found {len(review_results)} review results:")
    for i, item in enumerate(review_results):
        print(f"\nResult {i+1}:")
        assert isinstance(item, dict), f"Result {i+1} is not a dictionary."
        assert 'url' in item, f"'url' key missing in result {i+1}."
        assert 'content' in item, f"'content' key missing in result {i+1}."
        print(f"  URL: {item['url']}")
        print(f"  Content Snippet (Review): {item['content'][:200]}...") # Print a shorter snippet
else:
    print("No review results found or an error occurred.")


--- Searching for reviews of 'The Matrix' (max 3 results) ---
Attempting to fetch reviews from IMDb: https://www.imdb.com/title/tt0133093/reviews/
[INIT].... → Crawl4AI 0.6.1
[FETCH]... ↓ https://www.imdb.com/title/tt0133093/reviews/                                                        | ✓ | ⏱: 0.89s
[SCRAPE].. ◆ https://www.imdb.com/title/tt0133093/reviews/                                                        | ✓ | ⏱: 0.10s
[COMPLETE] ● https://www.imdb.com/title/tt0133093/reviews/                                                        | ✓ | ⏱: 0.99s
Found 94 potential review containers using combined strategies on IMDb page.
Using fallback text extraction for container: 10
/
10
Just wow
When this came out, I was living with a roommate. He went out and saw it, came home...
Using fallback text extraction for container: When this came out, I was living with a roommate. He went out and saw it, came home and said, "Dude,...
Using fallback text extraction for container: 10
/
10
Ah yes

## Test `save_movie_to_txt`

This saves the plot, curiosities, and reviews for the test movie into a single text file.

In [6]:
# Output path derived from the movie title, e.g. "The Matrix" -> ./the_matrix_data.txt
output_filename = f"./{test_movie_title.replace(' ', '_').lower()}_data.txt"
print(f"\n--- Saving all data for '{test_movie_title}' to '{output_filename}' ---")
try:
    # save_movie_to_txt runs the async functions internally using asyncio.run;
    # inside the notebook this relies on nest_asyncio.apply() from the setup cell.
    save_movie_to_txt(test_movie_title, output_filename, overwrite=True)
except Exception as e:
    # Best effort: report the failure without aborting the rest of the notebook.
    print(f"An error occurred during saving: {e}")
else:
    # Only point the reader at the file once it is confirmed to exist and to
    # contain data; a call that returns without raising is not proof of that.
    saved_file = Path(output_filename)
    if saved_file.is_file() and saved_file.stat().st_size > 0:
        print(f"Check the file '{output_filename}' for the combined results.")
    else:
        print(f"Warning: '{output_filename}' is missing or empty after saving.")


--- Saving all data for 'The Matrix' to './the_matrix_data.txt' ---
Attempting to fetch plot from IMDb: https://www.imdb.com/title/tt0133093/plotsummary/
[INIT].... → Crawl4AI 0.6.1
Attempting to find trivia on triviaforyou.com with query: "The Matrix" trivia site:triviaforyou.com
Rate limit detected on attempt 1. Retrying in 5.0 seconds...
Rate limit detected on attempt 1. Retrying in 5.0 seconds...
[FETCH]... ↓ https://www.imdb.com/title/tt0133093/plotsummary/                                                    | ✓ | ⏱: 2.76s
[SCRAPE].. ◆ https://www.imdb.com/title/tt0133093/plotsummary/                                                    | ✓ | ⏱: 0.07s
[COMPLETE] ● https://www.imdb.com/title/tt0133093/plotsummary/                                                    | ✓ | ⏱: 2.84s
Found section with title: 'Synopsis'
Successfully extracted plot using Description/Synopsis section strategy (UL/LI).
Successfully extracted plot from IMDb.
Rate limit detected on attempt 2. Retrying in 10.0 