In [1]:
import os

import numpy as np
import pandas as pd
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [2]:
# Location of the processed movies CSV. Set the MOVIES_CSV environment variable
# to run the notebook on another machine; the default is the original local path.
MOVIES_CSV = os.environ.get(
    "MOVIES_CSV",
    r"C:\Work\DEPI graduation project\data\processed_movies.csv",
)
movies = pd.read_csv(MOVIES_CSV)

In [3]:
# Write the combined "id_overview" text of every movie to a plain text file,
# one entry per row, without the index column or a header line.
id_overview_column = movies["id_overview"]
id_overview_column.to_csv("id_overview.txt", header=False, index=False)

In [3]:
from collections.abc import Iterable
from ast import literal_eval

def to_name_list(value):
    """Normalise a raw metadata cell into a flat list of name strings.

    Accepted inputs:

    * a Python container (list/tuple/set of dicts or plain values, or a
      single dict);
    * the string repr of such a container, e.g.
      ``"[{'id': 28, 'name': 'Action'}]"``;
    * a plain comma-separated string, e.g. ``"Action, Adventure"``;
    * a missing value (``None``/``NaN``) or a blank string.

    Returns:
        list[str]: for dict items the ``"name"`` entry is used, any other
        item is converted with ``str()``. ``None`` entries are skipped.
        Missing or blank input yields ``[]``.
    """

    def split_names(text):
        # Fallback parser for plain comma-separated text.
        return [part.strip() for part in text.split(",") if part.strip()]

    containers = (list, tuple, set, dict)

    if isinstance(value, containers):
        parsed = value
    elif pd.isna(value) or str(value).strip() == "":
        return []
    else:
        text = str(value).strip()
        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (literal_eval documents all of these
            # exception types): treat the cell as comma-separated names.
            return split_names(text)

        if parsed is None:
            return []
        if isinstance(parsed, str):
            # A quoted string literal: split its contents, not the quotes.
            return split_names(parsed)
        if not isinstance(parsed, containers):
            # The text merely *looks* like a scalar literal (e.g. a company
            # called "42"); keep it as a name instead of dropping it.
            return split_names(text)

    if isinstance(parsed, dict):
        parsed = [parsed]

    names = []
    for item in parsed:
        raw_name = item["name"] if isinstance(item, dict) and "name" in item else item
        if raw_name is None:
            continue
        # Always emit strings so callers can safely ", ".join() the result.
        names.append(str(raw_name))
    return names
def list_to_str(values):
    """Render a list of names as a single ``", "``-separated string.

    Args:
        values: a list (or tuple) of names, a scalar, or an empty/None value.

    Returns:
        str: ``""`` for any falsy input, the elements joined with ``", "``
        for a list/tuple, otherwise ``str(values)``.
    """
    if not values:
        return ""
    if isinstance(values, (list, tuple)):
        # Coerce each element so a stray non-string entry (int, None, ...)
        # cannot make str.join raise TypeError.
        return ", ".join(str(entry) for entry in values)
    return str(values)

# Build one Document per movie: the "id_overview" text is what gets embedded,
# and the id/title/genres/production companies travel along as metadata.
# Iterating the needed columns with zip avoids DataFrame.iterrows(), which
# constructs a full Series for every row and is slow on a large frame.
documents = [
    Document(
        page_content=overview,
        metadata={
            "movie_id": int(movie_id),
            "title": title,
            "genres": list_to_str(to_name_list(genres)),
            "production_companies": list_to_str(to_name_list(companies)),
        },
    )
    for movie_id, title, overview, genres, companies in zip(
        movies["id"],
        movies["title"],
        movies["id_overview"],
        movies["genres"],
        movies["production_companies"],
    )
]



In [4]:
# Display the first Document to sanity-check its page_content and metadata.
documents[0]

Document(metadata={'movie_id': 27205, 'title': 'Inception', 'genres': 'Action, Science Fiction, Adventure', 'production_companies': 'Legendary Pictures, Syncopy, Warner Bros. Pictures'}, page_content='ID: 27205 | Overview: Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'s idea into a target\'s subconscious. | Genres: Action, Science Fiction, Adventure | Production companies: Legendary Pictures, Syncopy, Warner Bros. Pictures')

In [5]:
persist_dir = "chroma_store"
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

# Open the persisted collection first. Chroma.from_documents() is called
# without explicit ids, so running it again against an already-populated
# store would re-embed every document and append duplicates. Only build
# the index when the collection is still empty.
db_movies = Chroma(
    persist_directory=persist_dir,
    embedding_function=embedding_model,
)
if db_movies._collection.count() == 0:
    db_movies = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_dir,
    )

In [7]:
# Re-open the Chroma store persisted on disk and report how many entries it holds.
persist_dir = "chroma_store"
db_movies = Chroma(
    embedding_function=OllamaEmbeddings(model="nomic-embed-text"),
    persist_directory=persist_dir,
)
stored_vectors = db_movies._collection.count()
print(stored_vectors)  # should match number of documents

1043938


In [6]:
# Example similarity search
query = "space adventure with robots"
top_k = 5

results = db_movies.similarity_search(query, k=top_k)
for rank, doc in enumerate(results, start=1):
    meta = doc.metadata
    # The title is stored in each document's metadata, so read it from there
    # instead of scanning the whole `movies` frame once per hit (which also
    # raised IndexError when no row matched).
    print(f"{rank}. {meta['title']} (ID {meta['movie_id']})")
    print(f"   Genres: {meta['genres']}")
    print(doc.page_content[:200], "...\n")  # show start of overview for context

1. The Robotanist (ID 554157)
   Genres: Adventure
ID: 554157 | Overview: A robot named Darwin is sent on a journey, to explore the universe in the hopes of discovering a new home for the human race. On his mission he uncovers the wonder and beauty of ...

2. Teens in the Universe (ID 20949)
   Genres: Science Fiction, Family, Adventure, Comedy
ID: 20949 | Overview: Interstellar expedition equipped by "pioneers"(soviet scouts) reached Alpha Kassiopea and found that smart robots took control on hole planet. Their only goal - to make happy , a ...

3. Megaman (ID 1040227)
   Genres: Science Fiction, Action
ID: 1040227 | Overview: A brave robot volunteers to combat the mechanoid minions of a mad scientist bent on world domination. | Genres: Science Fiction, Action | Production companies: Sollar Systems S ...

4. Gladiformers (ID 199895)
   Genres: Adventure, Animation, Science Fiction, Action
ID: 199895 | Overview: Robots in disguise are forced to combat each other in a colosseum where it