In [1]:
import pandas as pd
import faiss
import openai
import streamlit as st
import numpy as np
import logging
import os
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
openai.api_key = os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
names = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/name.basics.tsv", low_memory = False, encoding_errors = "ignore")
basics = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.basics.tsv", low_memory = False, encoding_errors = "ignore")
crews = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.crew.tsv", low_memory = False, encoding_errors = "ignore")
principals = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.principals.tsv", low_memory = False, encoding_errors = "ignore")
ratings = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.ratings.tsv", low_memory = False, encoding_errors = "ignore")

In [None]:
names = names.drop_duplicates(subset = "nconst")
basics = basics.drop_duplicates(subset = "tconst")
crews = crews.drop_duplicates(subset = "tconst")
principals = principals.drop_duplicates(subset = "tconst")
ratings = ratings.drop_duplicates(subset = "tconst")

In [None]:
names = names.replace("\\N", np.nan)
names_columns = ["nconst", "primaryName", "birthYear", "deathYear", "primaryProfession"]
names = names[names_columns]
names["birthYear"] = pd.to_numeric(names["birthYear"], errors = "coerce").astype("Int64")
names["deathYear"] = pd.to_numeric(names["deathYear"], errors = "coerce").astype("Int64")
names = names.dropna(subset = ["nconst", "primaryName", "birthYear", "deathYear"])
print(names.head())

In [None]:
basics = basics.replace("\\N", np.nan)
basics_columns = ["tconst", "primaryTitle", "startYear", "genres"]
basics = basics[basics_columns]
basics["startYear"] = pd.to_numeric(basics["startYear"], errors = "coerce").astype("Int64")
basics = basics.dropna(subset = ["tconst", "primaryTitle", "startYear"])
print(basics.head())

In [None]:
crews = crews.replace("\\N", np.nan)
crews_columns = ["tconst", "directors", "writers"]
crews = crews[crews_columns]
crews = crews.dropna(subset = ["directors", "writers"], how = "all")
crews = crews.dropna(subset = ["tconst"])
print(crews.head())

In [None]:
principals = principals.replace("\\N", np.nan)
principals_columns = ["tconst", "ordering", "nconst", "category", "job", "characters"]
principals = principals[principals_columns]
principals = principals.dropna(subset = ["tconst", "category"])
print(principals.head())

In [None]:
ratings = ratings.replace("\\N", np.nan)
ratings_columns = ["tconst", "averageRating", "numVotes"]
ratings = ratings[ratings_columns]
ratings["numVotes"] = pd.to_numeric(ratings["numVotes"], errors = "coerce").astype("Int64")
ratings = ratings.dropna(subset = ["tconst", "averageRating", "numVotes"])
print(ratings.head())

In [None]:
movies = pd.merge(basics, ratings, on = "tconst", how = "left")
movies = pd.merge(movies, crews, on = "tconst", how = "left")
get_name = dict(zip(names["nconst"], names["primaryName"]))
movies["directorNames"] = movies["directors"].map(get_name)
movies["writerNames"] = movies["writers"].map(get_name)
movies_columns = ["tconst", "primaryTitle", "startYear", "genres", "averageRating", "numVotes", "directorNames", "writerNames"]
movies = movies[movies_columns]
cast = principals[principals["category"].isin(["actor", "actress"])]
cast = cast.merge(names[["nconst", "primaryName"]], on = "nconst", how = "left")
cast["ordering"] = pd.to_numeric(cast["ordering"], errors = "coerce")
cast["ordering"] = cast["ordering"].astype("Int64")
top_cast = (cast.sort_values(["tconst", "ordering"]).groupby("tconst").head(5).groupby("tconst")["primaryName"].apply(lambda names_list: ", ".join(str(name) for name in names_list if isinstance(name, str) and name.strip())).reset_index().rename(columns = {"primaryName": "cast"}))
movies = movies.merge(top_cast, on = "tconst", how = "left")
movies_sorted = movies
movies_sorted.to_csv("clean_movies.csv", index = False)
print(movies_sorted.head())

In [7]:
movies_sorted = pd.read_csv("clean_movies.csv")
movies_sorted = movies_sorted[
    (movies_sorted['startYear'] >= 1970) &
    (movies_sorted['numVotes'] > 1000.0) &
    (movies_sorted['averageRating'] >= 6.0)
]
print(len(movies_sorted))
movies_sorted['description'] = (
    "Title: " + movies_sorted['primaryTitle'].fillna('') + 
    ". Year: " + movies_sorted['startYear'].astype(str) + 
    ". Genres: " + movies_sorted['genres'].fillna('') + 
    ". Directed by: " + movies_sorted['directorNames'].fillna('') + 
    ". Written by: " + movies_sorted['writerNames'].fillna('') + 
    ". Cast: " + movies_sorted['cast'].fillna('') + 
    ". IMDb Rating: " + movies_sorted['averageRating'].astype(str)
)
movies_sorted.to_csv("clean_movies.csv", index = False)

71844


In [8]:
df = pd.read_csv("clean_movies.csv") # file
model = SentenceTransformer('all-MiniLM-L6-v2') # embedding model
descriptions = df['description'].fillna("").tolist()
embeddings = model.encode(descriptions, batch_size=64, show_progress_bar=True) # embeds description
embedding_matrix = np.array(embeddings).astype("float32") # embeddings to numpy array
embedding_matrix = normalize(embedding_matrix, axis=1) # cosine similarity (based on content not length)
index = faiss.IndexFlatIP(embedding_matrix.shape[1])
index.add(embedding_matrix) # FAISS index for similarity search
faiss.write_index(index, "movie_index.faiss") # save index for reuse

Batches: 100%|██████████████████████████████| 1123/1123 [00:58<00:00, 19.07it/s]


In [2]:
df = pd.read_csv("clean_movies.csv") # load dataset
model = SentenceTransformer('all-MiniLM-L6-v2') # load sentence transformer model
index = faiss.read_index("movie_index.faiss") # load FAISS index

def search_movies(query, top_k=5):
    query_embedding = model.encode([query]) # embeds query
    query_embedding = normalize(np.array(query_embedding).astype("float32"), axis=1) # normalizes
    distances, indices = index.search(query_embedding, top_k) # searches
    results = df.iloc[indices[0]] # closest results
    return results

def generate_answer(query, top_k=5):
    results = search_movies(query, top_k) # movie search
    context = "\n\n".join(results['description'].tolist()) # context for openai

    prompt = f"Answer the question based on the movie data below:\n\n{context}\n\nQuestion: {query}" # prompt for model

    response = openai.chat.completions.create(
        model="gpt-4", # model
        messages=[{"role": "user", "content": prompt}] # message sent
    )
    return response.choices[0].message.content # response

In [7]:
query = "What's the best rated John Wick movie?."
movies_found = search_movies(query)
print(movies_found[['primaryTitle', 'description']])

answer = generate_answer(query)
print(answer)

                                            primaryTitle  \
57151                               John Wick: Chapter 2   
23254                               John Wick: Chapter 4   
65500  Retro Wick: Exploring the Unexpected Success o...   
49418                                          John Wick   
65502                   John Wick Chapter 2: Wick-vizzed   

                                             description  
57151  Title: John Wick: Chapter 2. Year: 2017. Genre...  
23254  Title: John Wick: Chapter 4. Year: 2023. Genre...  
65500  Title: Retro Wick: Exploring the Unexpected Su...  
49418  Title: John Wick. Year: 2014. Genres: Action,C...  
65502  Title: John Wick Chapter 2: Wick-vizzed. Year:...  
The best rated John Wick movie is "John Wick Chapter 2: Wick-vizzed" with an IMDb rating of 7.8.
