In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Make the parent of the current working directory importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
# Load the movie metadata table that every later cell works on.
metadata_csv = '../Data/final_metadata.csv'
df = pd.read_csv(metadata_csv)

In [6]:
# Column overview: non-null counts and dtypes of the metadata frame.
# NOTE(review): the saved output below already lists `soup` and has no
# `release_date` / `vote_average` / `vote_count`, and this cell is In [6] while
# the cleanup cell is In [5] -- so it appears to have been executed after that
# cell. On a fresh top-to-bottom run the output here would differ.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19934 entries, 0 to 19933
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 19934 non-null  int64  
 1   title              19934 non-null  object 
 2   genres             19934 non-null  object 
 3   original_language  19934 non-null  object 
 4   overview           19934 non-null  object 
 5   popularity         19934 non-null  float64
 6   keywords           19934 non-null  object 
 7   year               19934 non-null  int64  
 8   cast               19934 non-null  object 
 9   director           19934 non-null  object 
 10  score              19934 non-null  float64
 11  poster_path        19934 non-null  object 
 12  soup               19934 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 2.0+ MB


In [5]:
# Drop columns that are not used further down. Reassigning instead of using
# inplace=True, together with errors='ignore', keeps this cell re-runnable:
# the original raised KeyError on a second execution because the columns were
# already gone.
df = df.drop(columns=['release_date', 'vote_average', 'vote_count'], errors='ignore')

# Combine title, genres, keywords, cast and director into a single text "soup"
# per movie (the overview/synopsis is intentionally not part of it).
df['soup'] = df.apply(lambda row: f"Title: {row['title']}. Genres: {row['genres']}. Keywords: {row['keywords']}. Cast: {row['cast']}. Director: {row['director']}.", axis=1)

# Show the first soup as a sanity check (explicit .loc instead of chained indexing).
df.loc[0, 'soup']

'Title: Godzilla x Kong: The New Empire. Genres: Science Fiction Action Adventure. Keywords: giantmonster sequel dinosaur kaiju fantasyworld giantape godzilla kingkong mongkey. Cast: RebeccaHall BrianTyreeHenry DanStevens. Director: Adam Wingard.'

In [7]:
from langchain.docstore.document import Document

# Build one LangChain Document per dataframe row: the soup becomes the
# document text, the remaining descriptive fields travel along as metadata.
# Maps metadata key -> dataframe column it is read from.
metadata_columns = {
    "movie": "title",
    "language": "original_language",
    "popularity": "popularity",
    "year": "year",
    "synopsis": "overview",
    "score": "score",
    "poster_path": "poster_path",
}

movies = [
    Document(
        page_content=record['soup'],
        metadata={key: record[column] for key, column in metadata_columns.items()},
    )
    for _, record in df.iterrows()
]

In [9]:
# Spot-check a single document: shows its metadata dict and its page_content.
movies[10]

Document(metadata={'movie': 'Kingdom of the Planet of the Apes', 'language': 'English', 'popularity': 2372.67, 'year': 2024, 'synopsis': "Several generations in the future following Caesar's reign apes are now the dominant species and live harmoniously while humans have been reduced to living in the shadows. As a new tyrannical ape leader builds his empire one young ape undertakes a harrowing journey that will cause him to question all that he has known about the past and to make choices that will define a future for apes and humans alike.", 'score': 6.764617572556002, 'poster_path': '/gKkl37BQuKTanygYQG1pyYgLVgf.jpg'}, page_content='Title: Kingdom of the Planet of the Apes. Genres: Science Fiction Adventure Action. Keywords: empire kingdom gorilla dystopia eagle sequel anthropomorphism distantfuture ape orangutan evolution chimpanzee primate curious holyromanempire cgi liveactionhybrid clan post apocalyptic eviltyrant dramatic intense cgi. Cast: OwenTeague FreyaAllan KevinDurand. Dire

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

import torch

# Specify embedding model (using HuggingFace sentence transformer)
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Pick the best available device instead of hard-coding "mps", which raises on
# machines without Apple-silicon GPU support: CUDA first, then MPS, then CPU.
_mps_backend = getattr(torch.backends, "mps", None)  # absent in older torch builds
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"
model_kwargs = {"device": device}

# Initialize the embeddings once (the original built the same object twice in
# a row, loading the model a second time for no benefit).
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs
)

In [13]:
from langchain.vectorstores import FAISS

# Build the FAISS vector store from the movie documents and the embedding model
vector_db = FAISS.from_documents(documents=movies, embedding=embeddings)

## Testing setup
# Lookup table from movie title to its soup text.
# NOTE(review): titles are used as the index as-is, so a title that occurs
# more than once in df yields several entries under the same label.
soups = pd.Series(data=df['soup'].to_numpy(), index=df['title'])

In [14]:
def get_recommendations(title, k=10):
    """Return the movies whose soup is closest to the soup of ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``soups`` Series.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``movie``, ``language``, ``score``, ``year`` and
        ``similarity_score``, sorted by ``score`` then ``popularity``
        (both descending). The row labels keep the rank in which the vector
        store returned the hits. An empty frame with those columns is returned
        when nothing is found; ``None`` is returned (after printing the error)
        if the lookup itself fails.

    Raises
    ------
    ValueError
        If ``title`` is not present in ``soups``.

    Notes
    -----
    ``similarity_score`` is the raw score returned by
    ``vector_db.similarity_search_with_score`` rounded to two decimals.
    NOTE(review): with LangChain's default FAISS setup this is presumably a
    distance (lower = closer) -- confirm against the vector store config.
    """
    if title not in soups:
        raise ValueError(f"Title '{title}' not found in indices")

    query = soups.get(title)
    # Titles are not unique; a duplicated label makes .get() return a Series
    # instead of a string. Use the first matching movie's soup in that case.
    if isinstance(query, pd.Series):
        query = query.iloc[0]

    output_columns = ['movie', 'language', 'score', 'year', 'similarity_score']

    # Best-effort behaviour kept from the original: any failure during the
    # search or table construction is reported and None is returned.
    try:
        # Ask for one extra hit because the queried movie normally matches itself.
        results = vector_db.similarity_search_with_score(query, k=k + 1)

        # Exclude the queried movie by content instead of assuming it is always
        # the first hit, then cap at k.
        neighbours = [
            (doc, distance) for doc, distance in results
            if doc.page_content != query
        ][:k]

        if not neighbours:
            return pd.DataFrame(columns=output_columns)

        recommendations = pd.DataFrame([
            {
                'movie': doc.metadata['movie'],
                'language': doc.metadata['language'],
                'popularity': doc.metadata['popularity'],
                'score': round(doc.metadata['score'], 1),
                'synopsis': doc.metadata['synopsis'],
                'year': doc.metadata['year'],
                'similarity_score': round(distance, 2),
            }
            for doc, distance in neighbours
        ])

        return recommendations.sort_values(
            by=['score', 'popularity'], ascending=[False, False]
        )[output_columns]

    except Exception as e:
        print(f"Error during query: {e}")
        return None

In [15]:
# Example query: display the recommendation table for one title.
get_recommendations('The Dark Knight Rises')

Unnamed: 0,movie,language,score,year,similarity_score
0,The Dark Knight,English,8.5,2008,0.23
4,Batman Begins,English,7.7,2005,0.52
6,The Batman,English,7.6,2022,0.62
3,"Batman: The Dark Knight Returns, Part 2",English,7.5,2013,0.45
2,"Batman: The Dark Knight Returns, Part 1",English,7.3,2012,0.45
9,Batman,English,7.2,1989,0.69
1,Batman: The Dark Knight Returns,English,6.7,2013,0.42
5,Batman: Gotham Knight,English,6.6,2008,0.62
8,Batman: Gotham by Gaslight,English,6.6,2018,0.69
7,Knights of Badassdom,English,6.2,2013,0.65


In [16]:
# Second example query; the saved output shows that the same title can occur
# more than once in the data ('The Incredible Hulk', 1977 and 2008).
get_recommendations('Hulk')

Unnamed: 0,movie,language,score,year,similarity_score
2,Planet Hulk,English,6.7,2010,0.56
5,Hulk vs. Wolverine,English,6.6,2009,0.63
8,Hulk vs. Thor,English,6.6,2009,0.65
4,The Trial of the Incredible Hulk,English,6.5,1989,0.59
6,Hulk Vs,English,6.5,2009,0.63
3,Hulk: Where Monsters Dwell,English,6.5,2016,0.59
1,The Incredible Hulk,English,6.4,1977,0.53
9,The Incredible Hulk Returns,English,6.4,1988,0.69
7,The Death of the Incredible Hulk,English,6.3,1990,0.63
0,The Incredible Hulk,English,6.2,2008,0.47
