In [1]:
!pip install rank_bm25

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
from rank_bm25 import BM25Okapi
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Preprocess text and prepare corpus

In [3]:
def preprocess(text):
    tokens = text.split()
    stopwords_set = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stopwords_set]
    tokens = [word.lower() for word in tokens if word.isalnum()]
    return tokens

In [4]:
current_directory = os.getcwd()
with open(os.path.join(current_directory, "movies.json"), "r", encoding="utf-8") as f:
    movies = json.load(f)

movies_data = []
for movie in movies["films"]["film"]:

    roles = movie.get("role", [])
    if isinstance(roles, dict):  # If 'roles' is a dictionary, make it a single-item list
        roles = [roles]

    # Extract actor information
    actors = []
    for role in roles:
        actor_info = role.get("acteur", {})
        if "__text" in actor_info:
            actors.append(actor_info["__text"])

    movies_data.append({
        "title": movie.get("titre", ""),
        "year": movie.get("annee", ""),
        "country": movie.get("pays", ""),
        "language": movie.get("langue", ""),
        "duration": movie.get("duree", ""),
        "summary": movie.get("synopsis", ""),
        "genre": movie.get("genre", ""),
        "director": movie.get("realisateur", {"__text": ""}).get("__text", ""),
        "writers": movie.get("scenariste", []),
        "actors": actors,
        "poster": movie.get("affiche", ""),
        "id": movie.get("id", "")
    })

# Prepare the corpus for BM25

In [5]:
corpus = [
    preprocess(f"{movie['title']} {movie['year']} {' '.join(movie['genre'])} {' '.join(movie['actors'])} {movie['director']} {movie['summary']} {movie['country']}")
    for movie in movies_data
]
corpus


[['george',
  'jungle',
  '1997',
  'action',
  'adventure',
  'comedy',
  'family',
  'romance',
  'brendan',
  'fraser',
  'leslie',
  'mann',
  'thomas',
  'haden',
  'church',
  'john',
  'cleese',
  'holland',
  'taylor',
  'michael',
  'chinyamurindi',
  'abdoulaye',
  'ngom',
  'sam',
  'weisman',
  'usa'],
 ['robin',
  'prince',
  'thieves',
  '1991',
  'action',
  'adventure',
  'drama',
  'romance',
  'kevin',
  'costner',
  'morgan',
  'freeman',
  'mary',
  'elizabeth',
  'mastrantonio',
  'christian',
  'slater',
  'alan',
  'rickman',
  'michael',
  'mcshane',
  'michael',
  'wincott',
  'nick',
  'brimble',
  'daniel',
  'newman',
  'jack',
  'wild',
  'kevin',
  'reynolds',
  'usa'],
 ['2001',
  'animation',
  'adventure',
  'comedy',
  'family',
  'fantasy',
  'john',
  'goodman',
  'billy',
  'crystal',
  'mary',
  'gibbs',
  'steve',
  'buscemi',
  'james',
  'coburn',
  'jennifer',
  'tilly',
  'bob',
  'peterson',
  'john',
  'ratzenberger',
  'frank',
  'oz',
  'd

# Initialize BM25

In [6]:
bm25 = BM25Okapi(corpus)

# Function to search movies using BM25

In [7]:
def search_movies(query, bm25, movies_data, top_n=10):
    query_tokens = preprocess(query)
    scores = bm25.get_scores(query_tokens)
    top_n_indices = np.argsort(scores)[::-1][:top_n]
    top_n_movies = [movies_data[i] for i in top_n_indices]
    return top_n_movies

# Example usage

In [8]:
query = "action 2002"
top_movies = search_movies(query, bm25, movies_data)

for idx, movie in enumerate(top_movies):
    print(f"Rank {idx + 1}: {movie['title']} ({movie['year']})")

Rank 1: Minority Report (2002)
Rank 2: xXx (2002)
Rank 3: The Sum of All Fears (2002)
Rank 4: Cidade de Deus (2002)
Rank 5: Men in Black II (2002)
Rank 6: The Ring (2002)
Rank 7: Scooby-Doo (2002)
Rank 8: The Lord of the Rings: The Two Towers (2002)
Rank 9: Spider-Man (2002)
Rank 10: Die Another Day (2002)


In [9]:
query = "brad pitt action angelina jolie"
query.lower()
top_movies = search_movies(query, bm25, movies_data)

for idx, movie in enumerate(top_movies):
    print(f"Rank {idx + 1}: {movie['title']} ({movie['year']})")

Rank 1: Mr. & Mrs. Smith (2005)
Rank 2: Changeling (2008)
Rank 3: Wanted (2008)
Rank 4: Fight Club (1999)
Rank 5: Interview with the Vampire: The Vampire Chronicles (1994)
Rank 6: Se7en (1995)
Rank 7: Lara Croft: Tomb Raider (2001)
Rank 8: Shark Tale (2004)
Rank 9: Ocean's Twelve (2004)
Rank 10: Gone in Sixty Seconds (2000)
