* code conversion: https://claude.ai/public/artifacts/f6064d7b-a650-403a-8a3a-3ebe51d55310

# 3.2.2 Indexing TMDB Movies
* Load the movie data from `tmdb.json` into OpenSearch.

In [2]:
!uv pip list

[2mUsing Python 3.12.10 environment at: /home/anon-labs/Documents/projects/search-nepali/.venv[0m
Package                 Version
----------------------- -----------
aiohappyeyeballs        2.6.1
aiohttp                 3.11.18
aiosignal               1.3.2
annotated-types         0.7.0
anyio                   3.7.1
asttokens               3.0.0
attrs                   25.3.0
blinker                 1.9.0
certifi                 2025.4.26
charset-normalizer      3.4.2
click                   8.1.8
comm                    0.2.2
datasets                3.5.1
debugpy                 1.8.14
decorator               5.2.1
dill                    0.3.8
events                  0.5
executing               2.2.0
fastapi                 0.104.1
filelock                3.18.0
flask                   3.1.0
flask-cors              5.0.1
frozenlist              1.6.0
fsspec                  2025.3.0
h11                     0.16.0
huggingface-hub         0.31.2
idna                    3.10
ipykernel

In [6]:
import json
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def extract():
    """Load the TMDB movie dump from ``tmdb.json`` in the working directory.

    Returns:
        The decoded JSON document. ``reindex`` iterates it with ``.items()``,
        so it is expected to be a dict of movie records.

    Raises:
        FileNotFoundError: if ``tmdb.json`` does not exist. ``open`` raises
            rather than returning a falsy handle, so there is no empty-dict
            fallback here.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle; the explicit encoding keeps
    # non-ASCII titles independent of the platform's default encoding.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Recreate the ``tmdb`` index from scratch and bulk-index ``movieDict``.

    Any existing ``tmdb`` index is deleted first, so every call starts from an
    empty index.

    Args:
        analysisSettings: Sent as ``settings.index.analysis`` when the index
            is created.
        mappingSettings: Sent as ``mappings`` when non-empty.
        movieDict: Mapping of movie documents; each value must contain an
            ``"id"`` key, which is used as the document ``_id``.

    Note:
        The ``{}`` defaults are kept for interface compatibility. This
        function only reads them.
    """
    # Load environment variables from .env file
    load_dotenv()

    # OpenSearch connection setup
    host = 'localhost'
    port = 9200
    auth = ('admin', os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD'))  # For testing only

    # Create the client with SSL/TLS enabled
    client = OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_compress=True,  # enables gzip compression for request bodies
        http_auth=auth,
        use_ssl=True,
        verify_certs=False,  # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }

    if mappingSettings:
        settings['mappings'] = mappingSettings

    # Delete only when the index is really there. A bare ``except: pass``
    # around the delete would also hide authentication and connection
    # failures; this way they surface immediately.
    if client.indices.exists(index='tmdb'):
        client.indices.delete(index='tmdb')
        print("Deleted existing index")

    # Create new index with settings
    response = client.indices.create(index='tmdb', body=settings)
    print("Created index:", response)

    print("Building bulk request...")
    bulk_data = []
    # The dict keys are not needed: the document id comes from each movie.
    for movie in movieDict.values():
        # Action line followed by the document itself.
        bulk_data.append({"index": {"_index": "tmdb", "_id": movie["id"]}})
        bulk_data.append(movie)

    if not bulk_data:
        print("No documents to index")
        return

    print("Indexing documents...")
    response = client.bulk(body=bulk_data)

    if response.get('errors', False):
        # Report only the failed items instead of dumping the whole response.
        failed = [
            item for item in response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        print(f"Errors during bulk indexing: {len(failed)} of {len(bulk_data)//2} documents failed")
        for item in failed[:5]:
            print("  ", item['index'].get('_id'), item['index']['error'])
    else:
        print(f"Successfully indexed {len(bulk_data)//2} documents")

# Load the movie dump and rebuild the 'tmdb' index with default settings
# (no custom analysis, no explicit mappings).
if __name__ == "__main__":
    movieDict = extract()
    reindex(movieDict=movieDict)

Deleted existing index
Created index: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}
Building bulk request...
Indexing documents...
Successfully indexed 3051 documents


# 3.2.3 Basic Searching

In [3]:
import json
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def search(query):
    """Run ``query`` against the ``tmdb`` index and print a ranked list.

    Args:
        query: Search request body passed to the client as-is.

    Returns:
        The raw search response.
    """
    # Pull variables from .env into the process environment.
    load_dotenv()

    admin_password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    connection_options = dict(
        hosts=[{'host': 'localhost', 'port': 9200}],
        http_compress=True,           # gzip request bodies
        http_auth=('admin', admin_password),  # For testing only
        use_ssl=True,
        verify_certs=False,           # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )
    client = OpenSearch(**connection_options)

    response = client.search(body=query, index='tmdb')
    hits_section = response['hits']

    # One line per hit: rank, score, title.
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for rank, hit in enumerate(hits_section['hits'], start=1):
        print(f"{rank}\t{hit['_score']}\t\t{hit['_source']['title']}")

    return response

if __name__ == "__main__":
    users_search = 'basketball with cartoon aliens'
    # multi_match over title and overview; '^10' boosts title matches.
    query = dict(
        query=dict(
            multi_match=dict(
                query=users_search,
                fields=['title^10', 'overview'],
            ),
        ),
        size=100,  # Number of results to return
    )
    search(query)

Num	Relevance Score		Movie Title		Overview
1	38.895134		Aliens
2	33.504898		The Basketball Diaries
3	32.418274		Cowboys & Aliens
4	27.790558		Monsters vs Aliens
5	24.319012		Aliens vs Predator: Requiem
6	24.319012		Aliens in the Attic
7	20.555042		Dances with Wolves
8	20.555042		Friends with Benefits
9	20.555042		Fire with Fire
10	20.555042		Friends with Kids
11	17.987345		Interview with the Vampire
12	17.987345		From Russia With Love
13	17.987345		Gone with the Wind
14	17.987345		Just Go With It
15	17.987345		My Week with Marilyn
16	17.987345		From Paris with Love
17	17.987345		Trouble with the Curve
18	17.987345		Sleeping with the Enemy
19	17.987345		Hobo with a Shotgun
20	17.987345		To Rome with Love
21	15.989916		Die Hard: With a Vengeance
22	15.989916		Girl with a Pearl Earring
23	15.989916		Fun with Dick and Jane
24	14.391762		The Girl with the Dragon Tattoo
25	14.391762		The Life Aquatic With Steve Zissou
26	14.391762		Twin Peaks: Fire Walk with Me
27	14.391762		You Don't Mess W

# 3.3.1 Query Validation API

In [4]:
import json
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def validate_query(query, explain=True):
    """Validate ``query`` against the ``tmdb`` index and print the result.

    Args:
        query: Request body handed to the validate-query API.
        explain: Forwarded to the API as its ``explain`` parameter.

    Returns:
        The raw validation response (also printed).
    """
    # Pull variables from .env into the process environment.
    load_dotenv()

    admin_password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    client = OpenSearch(
        hosts=[dict(host='localhost', port=9200)],
        http_auth=('admin', admin_password),  # For testing only
        http_compress=True,                   # gzip request bodies
        use_ssl=True,
        verify_certs=False,                   # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    validation = client.indices.validate_query(
        index='tmdb',
        body=query,
        explain=explain,
    )
    print(validation)
    return validation

if __name__ == "__main__":
    users_search = 'basketball with cartoon aliens'
    # Same multi_match as the basic search, without a result-size limit.
    query = dict(
        query=dict(
            multi_match=dict(
                query=users_search,  # User's query
                fields=['title^10', 'overview'],
            ),
        ),
    )

    validate_query(query)


{'_shards': {'total': 1, 'successful': 1, 'failed': 0}, 'valid': True, 'explanations': [{'index': 'tmdb', 'valid': True, 'explanation': '((overview:basketball overview:with overview:cartoon overview:aliens) | (title:basketball title:with title:cartoon title:aliens)^10.0)'}]}


# 3.3.3 Debugging Analysis

In [5]:
import json
import yaml
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def analyze_text(index, field, text):
    """Run the analyze API for ``text`` against ``field`` of ``index``.

    Prints the full response as YAML, followed by a one-line summary per
    token (token text, position and character offsets).

    Args:
        index: Name of the index whose field is used.
        field: Field name sent in the analyze request.
        text: Text to analyze.

    Returns:
        The raw analyze response.
    """
    # Pull variables from .env into the process environment.
    load_dotenv()

    admin_password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    client = OpenSearch(
        hosts=[dict(host='localhost', port=9200)],
        http_compress=True,                   # gzip request bodies
        http_auth=('admin', admin_password),  # For testing only
        use_ssl=True,
        verify_certs=False,                   # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    response = client.indices.analyze(
        index=index,
        body={"field": field, "text": text},
    )

    # Block-style YAML is easier to read than the raw dict repr.
    print(yaml.dump(response, default_flow_style=False))

    print("\nTokens generated:")
    for token in response['tokens']:
        summary = f"  {token['token']} (position: {token['position']}, start: {token['start_offset']}, end: {token['end_offset']})"
        print(summary)

    return response

def get_field_mapping(index, field):
    """Fetch and print (as YAML) the mapping of ``field`` in ``index``.

    Args:
        index: Index to inspect.
        field: Field name passed as the ``fields`` argument of the API call.

    Returns:
        The raw field-mapping response.
    """
    # Pull variables from .env into the process environment.
    load_dotenv()

    admin_password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    client = OpenSearch(
        hosts=[dict(host='localhost', port=9200)],
        http_compress=True,                   # gzip request bodies
        http_auth=('admin', admin_password),  # For testing only
        use_ssl=True,
        verify_certs=False,                   # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    mapping = client.indices.get_field_mapping(index=index, fields=field)

    # Block-style YAML is easier to read than the raw dict repr.
    print(yaml.dump(mapping, default_flow_style=False))

    return mapping

if __name__ == "__main__":
    # Example usage:
    
    # 1. Show the tokens the analyze API produces for "Fire with Fire" on the
    #    tmdb "title" field.
    print("ANALYZING TEXT 'Fire with Fire' IN TITLE FIELD:")
    analyze_text("tmdb", "title", "Fire with Fire")
    
    # 2. Optionally, print the mapping currently stored for the title field
    print("\nGETTING MAPPING FOR TITLE FIELD:")
    get_field_mapping("tmdb", "title")

ANALYZING TEXT 'Fire with Fire' IN TITLE FIELD:
tokens:
- end_offset: 4
  position: 0
  start_offset: 0
  token: fire
  type: <ALPHANUM>
- end_offset: 9
  position: 1
  start_offset: 5
  token: with
  type: <ALPHANUM>
- end_offset: 14
  position: 2
  start_offset: 10
  token: fire
  type: <ALPHANUM>


Tokens generated:
  fire (position: 0, start: 0, end: 4)
  with (position: 1, start: 5, end: 9)
  fire (position: 2, start: 10, end: 14)

GETTING MAPPING FOR TITLE FIELD:
tmdb:
  mappings:
    title:
      full_name: title
      mapping:
        title:
          fields:
            keyword:
              ignore_above: 256
              type: keyword
          type: text



# 3.3.5 -- Solving The Matching Problem

In [18]:
import json
import yaml
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def extract():
    """Extract movie data from the ``tmdb.json`` file in the working directory.

    Returns:
        The decoded JSON document. ``reindex`` iterates it with ``.items()``,
        so it is expected to be a dict of movie records.

    Raises:
        FileNotFoundError: if ``tmdb.json`` does not exist. ``open`` raises
            rather than returning a falsy handle, so there is no empty-dict
            fallback here.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle; the explicit encoding keeps
    # non-ASCII titles independent of the platform's default encoding.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Recreate the ``tmdb`` index with the given settings and bulk-index movies.

    Any existing ``tmdb`` index is deleted first, so every call starts from an
    empty index.

    Args:
        analysisSettings: Sent as ``settings.index.analysis`` when the index
            is created.
        mappingSettings: Mappings body, sent when non-empty. If it has a
            top-level ``'movie'`` key, only the value under that key is sent.
        movieDict: Mapping of movie documents; each value must contain an
            ``"id"`` key, which is used as the document ``_id``.

    Note:
        The ``{}`` defaults are kept for interface compatibility. This
        function only reads them.
    """
    # Load environment variables from .env file
    load_dotenv()

    # OpenSearch connection setup
    host = 'localhost'
    port = 9200
    auth = ('admin', os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD'))  # For testing only

    # Create the client with SSL/TLS enabled
    client = OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_compress=True,  # enables gzip compression for request bodies
        http_auth=auth,
        use_ssl=True,
        verify_certs=False,  # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }

    # Add mappings if provided
    if mappingSettings:
        # Drop the legacy document-type layer ('movie') when present and send
        # the properties directly.
        settings['mappings'] = mappingSettings.get('movie', mappingSettings) \
            if 'movie' in mappingSettings else mappingSettings

    # Delete only when the index is really there. A bare ``except: pass``
    # around the delete would also hide authentication and connection
    # failures; this way they surface immediately.
    if client.indices.exists(index='tmdb'):
        client.indices.delete(index='tmdb')
        print("Deleted existing index")

    # Create new index with settings
    response = client.indices.create(index='tmdb', body=settings)
    print("Created index with mappings:", response)

    print("Building bulk request...")
    bulk_data = []
    # The dict keys are not needed: the document id comes from each movie.
    for movie in movieDict.values():
        # Action line followed by the document itself.
        bulk_data.append({"index": {"_index": "tmdb", "_id": movie["id"]}})
        bulk_data.append(movie)

    if not bulk_data:
        print("No documents to index")
        return

    print("Indexing documents...")
    response = client.bulk(body=bulk_data)

    if response.get('errors', False):
        # Report only the failed items instead of dumping the whole response.
        failed = [
            item for item in response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        print(f"Errors during bulk indexing: {len(failed)} of {len(bulk_data)//2} documents failed")
        for item in failed[:5]:
            print("  ", item['index'].get('_id'), item['index']['error'])
    else:
        print(f"Successfully indexed {len(bulk_data)//2} documents")

def analyze_text(index, field, text):
    """Run the analyze API for ``text`` against ``field`` of ``index``.

    Prints the full response as YAML and then a one-line summary per token.
    Returns the raw analyze response.
    """
    # Pull variables from .env into the process environment.
    load_dotenv()

    admin_password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    client = OpenSearch(
        hosts=[dict(host='localhost', port=9200)],
        http_compress=True,
        http_auth=('admin', admin_password),  # For testing only
        use_ssl=True,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    response = client.indices.analyze(
        index=index,
        body={"field": field, "text": text},
    )

    # Block-style YAML dump of the whole response.
    print(yaml.dump(response, default_flow_style=False))

    print("\nTokens generated:")
    for token in response['tokens']:
        summary = f"  {token['token']} (position: {token['position']}, start: {token['start_offset']}, end: {token['end_offset']})"
        print(summary)

    return response

if __name__ == "__main__":
    # Both fields get the same definition: a 'text' field using the 'english'
    # analyzer ('text' replaces the deprecated 'string' type). The mapping is
    # wrapped in a 'movie' key, which reindex() unwraps before sending.
    mapping_settings = {
        'movie': {
            'properties': {
                field_name: {'type': 'text', 'analyzer': 'english'}
                for field_name in ('title', 'overview')
            }
        }
    }

    # Rebuild the index with the explicit mappings.
    movie_dict = extract()
    reindex(mappingSettings=mapping_settings, movieDict=movie_dict)

    # Re-run the analysis to see how the title is tokenized now.
    print("\nAnalyzing 'Fire with Fire' with the new mappings:")
    analyze_text("tmdb", "title", "Fire with Fire")


Deleted existing index
Created index with mappings: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}
Building bulk request...
Indexing documents...
Successfully indexed 3051 documents

Analyzing 'Fire with Fire' with the new mappings:
tokens:
- end_offset: 4
  position: 0
  start_offset: 0
  token: fire
  type: <ALPHANUM>
- end_offset: 14
  position: 2
  start_offset: 10
  token: fire
  type: <ALPHANUM>


Tokens generated:
  fire (position: 0, start: 0, end: 4)
  fire (position: 2, start: 10, end: 14)


In [73]:
# NOTE(review): this cell calls the REST endpoint directly instead of using
# the opensearch-py client. It assumes `requests` has already been imported
# and that the cluster answers plain HTTP without authentication on port
# 9200 -- confirm before running; the other cells use an authenticated TLS
# client.
resp = requests.get('http://localhost:9200/tmdb/_analyze?field=title&format=yaml', 
                    data="Fire with Fire")
# print() call: the Python 2 print statement is a SyntaxError on Python 3.
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



## Repeat the search

In [19]:
import json
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def extract():
    """Extract movie data from the ``tmdb.json`` file in the working directory.

    Returns:
        The decoded JSON document. ``reindex`` iterates it with ``.items()``,
        so it is expected to be a dict of movie records. An empty document is
        treated by ``run_search_demo`` as "no data".

    Raises:
        FileNotFoundError: if ``tmdb.json`` does not exist. ``open`` raises
            rather than returning a falsy handle, so there is no empty-dict
            fallback here.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle; the explicit encoding keeps
    # non-ASCII titles independent of the platform's default encoding.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

def setup_client():
    """Set up and return an OpenSearch client for the local cluster.

    Loads ``.env``, reads ``OPENSEARCH_INITIAL_ADMIN_PASSWORD`` and connects
    to ``localhost:9200`` as ``admin`` over TLS with certificate verification
    disabled (development setup only).

    Returns:
        A configured ``OpenSearch`` client.

    Raises:
        RuntimeError: if ``OPENSEARCH_INITIAL_ADMIN_PASSWORD`` is not set in
            the environment or in ``.env``.
    """
    # Load environment variables from .env file
    load_dotenv()

    password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    if password is None:
        # os.getenv returns None for an unset variable; fail here with a
        # clear message instead of building credentials of ('admin', None).
        raise RuntimeError(
            "OPENSEARCH_INITIAL_ADMIN_PASSWORD is not set; "
            "define it in the environment or in a .env file"
        )

    # Create the client with SSL/TLS enabled
    return OpenSearch(
        hosts=[{'host': 'localhost', 'port': 9200}],
        http_compress=True,  # enables gzip compression for request bodies
        http_auth=('admin', password),  # For testing only
        use_ssl=True,
        verify_certs=False,  # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Recreate the ``tmdb`` index with the given settings and bulk-index movies.

    Any existing ``tmdb`` index is deleted first, so every call starts from an
    empty index.

    Args:
        analysisSettings: Sent as ``settings.index.analysis`` when the index
            is created.
        mappingSettings: Mappings body, sent when non-empty. If it has a
            top-level ``'movie'`` key, only the value under that key is sent.
        movieDict: Mapping of movie documents; each value must contain an
            ``"id"`` key, which is used as the document ``_id``.

    Note:
        The ``{}`` defaults are kept for interface compatibility. This
        function only reads them.
    """
    client = setup_client()

    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }

    # Add mappings if provided
    if mappingSettings:
        # Drop the legacy document-type layer ('movie') when present.
        if 'movie' in mappingSettings:
            settings['mappings'] = mappingSettings['movie']
        else:
            settings['mappings'] = mappingSettings

    # Delete only when the index is really there. A bare ``except: pass``
    # around the delete would also hide authentication and connection
    # failures; this way they surface immediately.
    if client.indices.exists(index='tmdb'):
        client.indices.delete(index='tmdb')
        print("Deleted existing index")

    # Create new index with settings
    response = client.indices.create(index='tmdb', body=settings)
    print("Created index with mappings:", response)

    print("Building bulk request...")
    bulk_data = []
    # The dict keys are not needed: the document id comes from each movie.
    for movie in movieDict.values():
        # Action line followed by the document itself.
        bulk_data.append({"index": {"_index": "tmdb", "_id": movie["id"]}})
        bulk_data.append(movie)

    if not bulk_data:
        print("No documents to index")
        return

    print("Indexing documents...")
    response = client.bulk(body=bulk_data)

    if response.get('errors', False):
        # Report only the failed items instead of dumping the whole response.
        failed = [
            item for item in response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        print(f"Errors during bulk indexing: {len(failed)} of {len(bulk_data)//2} documents failed")
        for item in failed[:5]:
            print("  ", item['index'].get('_id'), item['index']['error'])
    else:
        print(f"Successfully indexed {len(bulk_data)//2} documents")

def search(query):
    """
    Search the tmdb index with the given query and print a result table.

    Each row shows rank, score (4 decimals), title and the overview truncated
    to 50 characters. The total hit count is printed afterwards.

    Args:
        query: Search request body passed to the client as-is.

    Returns:
        The raw search response.
    """
    client = setup_client()

    # Perform search using the OpenSearch client
    response = client.search(
        body=query,
        index='tmdb'
    )

    # Extract search hits
    search_hits = response['hits']

    # Print results in a formatted table
    print("\nSearch Results:")
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for idx, hit in enumerate(search_hits['hits']):
        # `.get('overview', '')` only covers a missing key; a document whose
        # overview is JSON null yields None and would break len(). `or ''`
        # covers both cases.
        overview = hit['_source'].get('overview') or ''
        if len(overview) > 50:
            overview = overview[:50] + "..."

        print(f"{idx + 1}\t{hit['_score']:.4f}\t\t{hit['_source']['title']}\t\t{overview}")

    print(f"\nTotal results: {search_hits['total']['value']}")

    # Return the raw response for further processing if needed
    return response

def run_search_demo():
    """Reindex the movies with English-analyzed fields, then search and explain.

    Prints a hint and does nothing else when ``extract()`` returns no data.
    """
    # Title and overview share one definition: 'text' fields ('string' is
    # deprecated) analyzed with the 'english' analyzer.
    mapping_settings = {
        'properties': {
            field_name: {'type': 'text', 'analyzer': 'english'}
            for field_name in ('title', 'overview')
        }
    }

    movie_dict = extract()
    if not movie_dict:
        print("No movie data found. Please ensure the tmdb.json file exists.")
        return

    reindex(mappingSettings=mapping_settings, movieDict=movie_dict)

    users_search = 'basketball with cartoon aliens'
    multi_match = {
        'query': users_search,
        'fields': ['title^10', 'overview'],  # title has 10x the weight
    }
    query = {
        'query': {'multi_match': multi_match},
        'size': 100,  # Number of results to return
    }

    print(f"\nSearching for: '{users_search}'")
    search(query)

    # Optional: show how the search text is analyzed.
    print("\nQuery explanation:")
    explain_query(query)

def explain_query(query):
    """
    Print the tokens produced when the query text is analyzed for ``title``.

    Expects ``query`` to be a ``multi_match`` request: the search text is read
    from ``query['query']['multi_match']['query']``.
    """
    client = setup_client()
    users_search = query['query']['multi_match']['query']

    # Analyze the search text with the title field of the tmdb index.
    analysis = client.indices.analyze(
        index='tmdb',
        body={"field": "title", "text": users_search},
    )

    print(f"\nHow the search term '{users_search}' is analyzed:")
    print("Tokens that will be matched against the index:")
    for analyzed in analysis['tokens']:
        print(f"  - {analyzed['token']}")

    print("\nNote: The English analyzer removes stopwords like 'with' and stems words to their root form.")
    print("The search will match documents where these tokens appear in either title (weighted higher) or overview.")

# Entry point for this cell: reindex with English-analyzed fields, then
# repeat the search.
if __name__ == "__main__":
    run_search_demo()

Deleted existing index
Created index with mappings: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}
Building bulk request...
Indexing documents...
Successfully indexed 3051 documents

Searching for: 'basketball with cartoon aliens'

Search Results:
Num	Relevance Score		Movie Title		Overview
1	34.2826		Alien		During its return to the earth, commercial spacesh...
2	34.2826		Aliens		When Ripley's lifepod is found by a salvage crew o...
3	33.7776		The Basketball Diaries		Film adaptation of street tough Jim Carroll's epis...
4	27.6502		Cowboys & Aliens		A stranger stumbles into the desert town of Absolu...
5	19.9364		Aliens vs Predator: Requiem		A sequel to 2004's Alien vs. Predator, the iconic ...
6	19.9364		AVP: Alien vs. Predator		When scientists discover something in the Arctic t...
7	5.8666		Space Jam		Michael Jordan agrees to help the Looney Tunes pla...
8	3.2702		The Flintstones		Modern Stone Age family the Flintstones hit the bi...
9	3.0565		White Men Can't Jump

# 3.4.1 Decomposing Relevance Score With Lucene's Explain

In [20]:
import json
from opensearchpy import OpenSearch
import os
from dotenv import load_dotenv

def extract():
    """Extract movie data from the ``tmdb.json`` file in the working directory.

    Returns:
        The decoded JSON document. ``reindex`` iterates it with ``.items()``,
        so it is expected to be a dict of movie records. An empty document is
        treated by ``run_search_workflow`` as "no data".

    Raises:
        FileNotFoundError: if ``tmdb.json`` does not exist. ``open`` raises
            rather than returning a falsy handle, so there is no empty-dict
            fallback here.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle; the explicit encoding keeps
    # non-ASCII titles independent of the platform's default encoding.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

def setup_client():
    """Set up and return an OpenSearch client for the local cluster.

    Loads ``.env``, reads ``OPENSEARCH_INITIAL_ADMIN_PASSWORD`` and connects
    to ``localhost:9200`` as ``admin`` over TLS with certificate verification
    disabled (development setup only).

    Returns:
        A configured ``OpenSearch`` client.

    Raises:
        RuntimeError: if ``OPENSEARCH_INITIAL_ADMIN_PASSWORD`` is not set in
            the environment or in ``.env``.
    """
    # Load environment variables from .env file
    load_dotenv()

    password = os.getenv('OPENSEARCH_INITIAL_ADMIN_PASSWORD')
    if password is None:
        # os.getenv returns None for an unset variable; fail here with a
        # clear message instead of building credentials of ('admin', None).
        raise RuntimeError(
            "OPENSEARCH_INITIAL_ADMIN_PASSWORD is not set; "
            "define it in the environment or in a .env file"
        )

    # Create the client with SSL/TLS enabled
    return OpenSearch(
        hosts=[{'host': 'localhost', 'port': 9200}],
        http_compress=True,  # enables gzip compression for request bodies
        http_auth=('admin', password),  # For testing only
        use_ssl=True,
        verify_certs=False,  # Set to True in production with proper certs
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Recreate the ``tmdb`` index with the given settings and bulk-index movies.

    Any existing ``tmdb`` index is deleted first, so every call starts from an
    empty index.

    Args:
        analysisSettings: Sent as ``settings.index.analysis`` when the index
            is created.
        mappingSettings: Mappings body, sent when non-empty. If it has a
            top-level ``'movie'`` key, only the value under that key is sent.
        movieDict: Mapping of movie documents; each value must contain an
            ``"id"`` key, which is used as the document ``_id``.

    Note:
        The ``{}`` defaults are kept for interface compatibility. This
        function only reads them.
    """
    client = setup_client()

    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }

    # Add mappings if provided
    if mappingSettings:
        # Drop the legacy document-type layer ('movie') when present.
        if 'movie' in mappingSettings:
            settings['mappings'] = mappingSettings['movie']
        else:
            settings['mappings'] = mappingSettings

    # Delete only when the index is really there. A bare ``except: pass``
    # around the delete would also hide authentication and connection
    # failures; this way they surface immediately.
    if client.indices.exists(index='tmdb'):
        client.indices.delete(index='tmdb')
        print("Deleted existing index")

    # Create new index with settings
    response = client.indices.create(index='tmdb', body=settings)
    print("Created index with mappings:", response)

    print("Building bulk request...")
    bulk_data = []
    # The dict keys are not needed: the document id comes from each movie.
    for movie in movieDict.values():
        # Action line followed by the document itself.
        bulk_data.append({"index": {"_index": "tmdb", "_id": movie["id"]}})
        bulk_data.append(movie)

    if not bulk_data:
        print("No documents to index")
        return

    print("Indexing documents...")
    response = client.bulk(body=bulk_data)

    if response.get('errors', False):
        # Report only the failed items instead of dumping the whole response.
        failed = [
            item for item in response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        print(f"Errors during bulk indexing: {len(failed)} of {len(bulk_data)//2} documents failed")
        for item in failed[:5]:
            print("  ", item['index'].get('_id'), item['index']['error'])
    else:
        print(f"Successfully indexed {len(bulk_data)//2} documents")

def search(query, display_count=5):
    """
    Search the tmdb index and print the first ``display_count`` hits.

    Args:
        query: Search request body passed to the client as-is.
        display_count: Maximum number of hits to print; the response itself
            is not truncated.

    Returns:
        The raw search response.
    """
    response = setup_client().search(body=query, index='tmdb')
    hits_section = response['hits']

    print("\nSearch Results:")
    print("Num\tRelevance Score\t\tMovie Title")
    for position, hit in enumerate(hits_section['hits']):
        if position >= display_count:
            break  # remaining hits are not displayed
        print(f"{position + 1}\t{hit['_score']:.4f}\t\t{hit['_source']['title']}")

    # 'total' is either an object carrying a 'value' or a plain number.
    if isinstance(hits_section['total'], dict):
        total_hits = hits_section['total']['value']
    else:
        total_hits = hits_section['total']
    print(f"\nTotal results: {total_hits}")

    return response

def simpler_explain(explanation, depth=0):
    """
    Render an explanation tree as indented ``value = description`` lines.

    Each node contributes one line, indented two spaces per nesting level,
    with the value formatted to four decimals. Nodes listed under
    ``'details'`` are rendered recursively one level deeper.

    Args:
        explanation: Dict with optional ``'value'`` (default 0),
            ``'description'`` (default ``'Unknown'``) and ``'details'`` keys.
        depth: Nesting level of this node.

    Returns:
        The rendered lines joined with newlines.
    """
    pad = "  " * depth
    score = explanation.get('value', 0)
    label = explanation.get('description', 'Unknown')

    lines = [f"{pad}{score:.4f} = {label}"]

    # A missing or empty 'details' list means this node is a leaf.
    children = explanation['details'] if 'details' in explanation else None
    if children:
        lines.extend(simpler_explain(child, depth + 1) for child in children)

    return "\n".join(lines)

def decompose_relevance_score(query_results, num_docs=3):
    """
    Print the score explanation of the first ``num_docs`` hits.

    Args:
        query_results: Search response; hits are read from
            ``query_results['hits']['hits']``. Hits carrying an
            ``'_explanation'`` entry are rendered with ``simpler_explain``;
            the others get a one-line notice.
        num_docs: Number of leading hits to process.

    Returns:
        None. Prints "No results to explain" when the response is empty or
        has no hits section.
    """
    has_hits = (
        bool(query_results)
        and 'hits' in query_results
        and 'hits' in query_results['hits']
    )
    if not has_hits:
        print("No results to explain")
        return

    rule = "-" * 80
    for rank, hit in enumerate(query_results['hits']['hits']):
        if rank >= num_docs:
            break

        if '_explanation' not in hit:
            print(f"No explanation available for: {hit['_source']['title']}")
            continue

        print(f"\nExplain for: {hit['_source']['title']} (Score: {hit['_score']:.4f})")
        print(rule)
        print(simpler_explain(hit['_explanation']))
        print(rule)

def run_search_workflow():
    """Reindex, search, explain the scores, then re-rank with adjusted weights.

    Steps: rebuild the index with English-analyzed fields, run the initial
    ``title^10`` search, print score explanations for its top hits, repeat
    the search with ``title^0.1``, and print both top-5 lists. Prints a hint
    and does nothing else when ``extract()`` returns no data.
    """
    def banner(heading):
        # Section header: blank line, rule, heading, rule.
        print("\n" + "="*80)
        print(heading)
        print("="*80)

    def print_top_five(result):
        # Numbered title/score list of the first five hits.
        for i, hit in enumerate(result['hits']['hits'][:5]):
            print(f"{i+1}. {hit['_source']['title']} (Score: {hit['_score']:.4f})")

    # Title and overview are 'text' fields analyzed with the 'english' analyzer.
    mapping_settings = {
        'properties': {
            field_name: {'type': 'text', 'analyzer': 'english'}
            for field_name in ('title', 'overview')
        }
    }

    movie_dict = extract()
    if not movie_dict:
        print("No movie data found. Please ensure the tmdb.json file exists.")
        return

    reindex(mappingSettings=mapping_settings, movieDict=movie_dict)

    users_search = 'basketball with cartoon aliens'

    # Step 1: initial search, title weighted 10x.
    banner("STEP 1: INITIAL SEARCH")
    query = {
        'query': {
            'multi_match': {
                'query': users_search,
                'fields': ['title^10', 'overview'],
            },
        },
        'size': 100,  # Number of results to return
    }
    print(f"Searching for: '{users_search}'")
    response = search(query)

    # Step 2: same query again with 'explain' switched on.
    banner("STEP 2: DECOMPOSING RELEVANCE SCORES")
    query_with_explain = {**query, 'explain': True}
    explain_response = setup_client().search(
        body=query_with_explain,
        index='tmdb'
    )
    decompose_relevance_score(explain_response, 5)

    # Step 3: title now has 0.1x the weight of overview.
    banner("STEP 3: FIXING THE RANKING WITH ADJUSTED FIELD WEIGHTS")
    adjusted_query = {
        'query': {
            'multi_match': {
                'query': users_search,
                'fields': ['title^0.1', 'overview'],
            },
        },
        'explain': True,
        'size': 100,
    }
    print(f"Searching with adjusted weights for: '{users_search}'")
    adjusted_response = search(adjusted_query)

    print("\nExplaining new ranking with adjusted weights:")
    decompose_relevance_score(adjusted_response, 3)

    # Side-by-side view of both rankings.
    banner("COMPARISON OF RANKINGS")

    print("\nOriginal Top 5 (title^10, overview):")
    print_top_five(response)

    print("\nAdjusted Top 5 (title^0.1, overview):")
    print_top_five(adjusted_response)

# Entry point for this cell: run the search / explain / re-weight workflow.
if __name__ == "__main__":
    run_search_workflow()

Deleted existing index
Created index with mappings: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}
Building bulk request...
Indexing documents...
Successfully indexed 3051 documents

STEP 1: INITIAL SEARCH
Searching for: 'basketball with cartoon aliens'

Search Results:
Num	Relevance Score		Movie Title
1	34.0718		Alien
2	34.0718		Aliens
3	33.6063		The Basketball Diaries
4	27.4794		Cowboys & Aliens
5	19.8125		Aliens vs Predator: Requiem

Total results: 48

STEP 2: DECOMPOSING RELEVANCE SCORES

Explain for: Alien (Score: 34.0718)
--------------------------------------------------------------------------------
34.0718 = max of:
  34.0718 = sum of:
    34.0718 = weight(title:alien in 229) [PerFieldSimilarity], result of:
      34.0718 = score(freq=1.0), computed as boost * idf * tf from:
        10.0000 = boost
        5.8273 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
          5.0000 = n, number of documents containing term
          1866.0000 = N, t

# 3.4.4	Fixing Space Jam vs Alien Ranking