In [19]:
!pip install rank_bm25



In [20]:
import os
from rank_bm25 import BM25Okapi
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# Preprocess text and prepare corpus
def preprocess(text):
    tokens = text.split()
    stopwords_set = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stopwords_set]
    return tokens


In [22]:
current_directory = os.getcwd()
with open(os.path.join(current_directory, "postgres/movies.json"), "r") as f:
    movies = json.load(f)

movies_data = []
for movie in movies["films"]["film"]:

    roles = movie.get("role", [])
    if isinstance(roles, dict):  # If 'roles' is a dictionary, make it a single-item list
        roles = [roles]

    # Extract actor information
    actors = []
    for role in roles:
        actor_info = role.get("acteur", {})
        if "__text" in actor_info:
            actors.append(actor_info["__text"])

    movies_data.append({
        "title": movie.get("titre", ""),
        "year": movie.get("annee", ""),
        "country": movie.get("pays", ""),
        "language": movie.get("langue", ""),
        "duration": movie.get("duree", ""),
        "summary": movie.get("synopsis", ""),
        "genre": movie.get("genre", ""),
        "director": movie.get("realisateur", {"__text": ""}).get("__text", ""),
        "writers": movie.get("scenariste", []),
        "actors": actors,
        "poster": movie.get("affiche", ""),
        "id": movie.get("id", "")
    })

In [23]:
# Prepare the corpus for BM25
corpus = [
    preprocess(f"{movie['title']} {movie['year']} {' '.join(movie['genre'])} {' '.join(movie['actors'])} {movie['director']} {movie['summary']} {movie['country']}")
    for movie in movies_data
]


In [24]:
# Initialize BM25
bm25 = BM25Okapi(corpus)

In [25]:
# Function to search movies using BM25
def search_movies(query, bm25, movies_data, top_n=10):
    query_tokens = preprocess(query)
    scores = bm25.get_scores(query_tokens)
    top_n_indices = np.argsort(scores)[::-1][:top_n]
    top_n_movies = [movies_data[i] for i in top_n_indices]
    return top_n_movies

In [26]:
# Example usage
query = "science fiction 2021"
top_movies = search_movies(query, bm25, movies_data)

for idx, movie in enumerate(top_movies):
    print(f"Rank {idx + 1}: {movie['title']} ({movie['year']})")

Rank 1: Elf (2003)
Rank 2: Rush Hour (1998)
Rank 3: My Big Fat Greek Wedding (2002)
Rank 4: Stir Crazy (1980)
Rank 5: Slumdog Millionaire (2008)
Rank 6: Lethal Weapon 4 (1998)
Rank 7: The Hunchback of Notre Dame (1996)
Rank 8: Witness for the Prosecution (1957)
Rank 9: The Great Escape (1963)
Rank 10: Cool Hand Luke (1967)
