In [1]:
COLLECTION_NAME = 'movies_vecdb'
DIMENSION = 384
URI = 'http://localhost:19530'

In [2]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.milvus import Milvus
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
import os

docs = []

for file in os.listdir('./movie_text_data/'):
    loader = TextLoader(f'./movie_text_data/{file}')
    docs.extend(loader.load())

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)  # DA OTTIMIZZARE
all_splits = text_splitter.split_documents(docs)

In [4]:
import torch

device = None
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model_name = "sentence-transformers/all-MiniLM-L12-v2"
model_kwargs = {'device': device}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)



In [5]:
vector_db = Milvus(
    embedding_function=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    drop_old=False
)

In [6]:
vector_db = Milvus.from_documents(
    all_splits,
    embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
)

In [7]:
query = "Which are the Crime movies directed by Martin Scorsese?"
result = vector_db.similarity_search(query, k=3)

In [8]:
for n, doc in enumerate(result):
    print(f"DOCUMENT {n+1}:")
    print(doc.page_content)
    print()

DOCUMENT 1:
IMDB ID -> 0031014
TITLE -> 6,000 Enemies
DIRECTED BY -> George B. Seitz
GENRES -> Crime, Drama, Romance
PLOT -> A tough prosecutor who has sent dozens of criminals to prison finds himself framed on a bribery charge and winds up in prison himself.
YEAR -> 1939
STARS -> Walter Pidgeon, Rita Johnson, Paul Kelly

DOCUMENT 2:
IMDB ID -> 3237082
TITLE -> Serial Killer Culture
DIRECTED BY -> John Borowski
GENRES -> Documentary, Crime, History
PLOT -> Serial Killer Culture examines the reasons why artists and collectors are fascinated by serial killers.
YEAR -> 2014
STARS -> Steve Giannangelo, Rick Staton, Rich Hillen Jr.

DOCUMENT 3:
IMDB ID -> 0037622
TITLE -> Crime, Inc.
DIRECTED BY -> Lew Landers
GENRES -> Crime, Drama, Film-Noir
PLOT -> A crusading reporter uses his friendship with a mobster to expose the machinations of organized crime in his city.
YEAR -> 1945
STARS -> Leo Carrillo, Tom Neal, Martha Tilton



In [9]:
print(f"Default collection name - {vector_db.collection_name}")
print(f"Default search params - {vector_db.search_params}")
print(f"Default index params - {vector_db.index_params}")

Default collection name - LangChainCollection
Default search params - {'metric_type': 'L2', 'params': {'ef': 10}}
Default index params - None


In [10]:
vector_db.col

<Collection>:
-------------
<name>: LangChainCollection
<description>: 
<schema>: {'auto_id': True, 'description': '', 'fields': [{'name': 'source', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': False}

In [11]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

In [12]:
llm = LlamaCpp(
        model_path="./models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
        n_gpu_layers=-1,
        n_batch=1024,
        temperature=0.2,
        n_threads=8,
        max_tokens=4096,
        # f16_kv=True,
        # callback_manager=CallbackManager(StreamingStdOutCallbackHandler()),
        verbose=True
)

llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from ./models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.a

## Template 1 (forse da rimuovere)

In [11]:
from langchain_core.prompts import PromptTemplate

template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
You have to answer to the question in a precise way: you MUST use the search results to build the answer.
If the asked informations don't exist within the search results, DON'T INVENT THEM AND JUST ANSWER 'I don't know how to answer to this question'. 

Generate some movies and, for each movie, report these informations following this structure:
* Title: report here the title of the movie
* Year: report here the year of the movie
* Genre: report here the list of genres for the movie
* Director: report here the director 
* Brief description of the plot: report here a brief description of the plot
* Main actors: report here 2 or 3 members of the cast
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

prompt = PromptTemplate.from_template(template)

## Template 2

In [13]:
from langchain_core.prompts import PromptTemplate

template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
You have to answer to the question in a precise way: you MUST use the search results to build the answer.
If the asked informations don't exist within the search results, DON'T INVENT THEM AND JUST ANSWER 'I don't know how to answer to this question'. 
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

prompt = PromptTemplate.from_template(template)

In [14]:
from langchain_core.runnables import RunnablePassthrough

retriever = vector_db.as_retriever()

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | prompt | llm
)

In [15]:
request = """Report me some movies which correspond to at least one of these features: the movie directed by Spike Lee.
Give more importance to a movie if it corresponds to more than one of the listed features.
After reporting the movies, explain why you chose them."""

resp = rag_chain.invoke(request)
print(str(resp))


llama_print_timings:        load time =   10518.49 ms
llama_print_timings:      sample time =     127.86 ms /   179 runs   (    0.71 ms per token,  1400.02 tokens per second)
llama_print_timings: prompt eval time =   10517.90 ms /   154 tokens (   68.30 ms per token,    14.64 tokens per second)
llama_print_timings:        eval time =   17722.33 ms /   178 runs   (   99.56 ms per token,    10.04 tokens per second)
llama_print_timings:       total time =   31199.53 ms /   332 tokens


 Movies Directed by Spike Lee with Specific Features:

1. **Do the Right Thing (1989)** - This film is directed by Spike Lee and corresponds to the feature "the movie directed by Spike Lee".
2. **Malcolm X (1992)** - This film is directed by Spike Lee, stars Denzel Washington, and corresponds to the features "the movie directed by Spike Lee" and "a movie starring Denzel Washington".
3. **25th Hour (2002)** - This film is directed by Spike Lee, stars Edward Norton, and corresponds to the features "the movie directed by Spike Lee" and "a movie starring Edward Norton".

I chose these movies because they all meet at least one of the specified criteria: being directed by Spike Lee. The movies also have additional features that make them stand out, such as starring notable actors like Denzel Washington or Edward Norton.


## Test 1: input query --> a famous movie "Titanic"

In [17]:
request = """Report me some movies which are similar to "Titanic".
After reporting the movies, explain why you chose them."""

resp = rag_chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10518.49 ms
llama_print_timings:      sample time =     924.39 ms /   190 runs   (    4.87 ms per token,   205.54 tokens per second)
llama_print_timings: prompt eval time =    8960.12 ms /    28 tokens (  320.00 ms per token,     3.12 tokens per second)
llama_print_timings:        eval time =  257791.62 ms /   189 runs   ( 1363.98 ms per token,     0.73 tokens per second)
llama_print_timings:       total time =  326119.75 ms /   217 tokens


 Here are some movies that are similar to "Titanic":

1. **The Poseidon Adventure** (1972) - This disaster movie also features a catastrophic event at sea, where passengers must fight for survival.

2. **A Night to Remember** (1958) - This British drama film is based on the true story of the RMS Titanic's sinking. The film focuses on the personal stories of the passengers and crew.

3. **The Unsinkable Molly Brown** (1964) - This biographical musical comedy-drama film tells the story of Margaret "Molly" Brown, a wealthy socialite who survived the sinking of the RMS Titanic.

I chose these movies because they all share similar themes with "Titanic", such as disaster, survival, romance, and tragedy. They also feature strong characters, dramatic plot twists, and memorable music, which are all key elements that make "Titanic" so iconic and beloved.


## Test 2: input query --> generic movies "Just Before I Go" and "Deewaar"

In [16]:
request = """Report me some movies which are similar to "Just Before I go" or "Deewaar".
After reporting the movies, explain why you chose them."""

resp = rag_chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10518.49 ms
llama_print_timings:      sample time =     692.50 ms /   195 runs   (    3.55 ms per token,   281.59 tokens per second)
llama_print_timings: prompt eval time =    9327.40 ms /    40 tokens (  233.19 ms per token,     4.29 tokens per second)
llama_print_timings:        eval time =  140445.04 ms /   194 runs   (  723.94 ms per token,     1.38 tokens per second)
llama_print_timings:       total time =  182613.77 ms /   234 tokens


 Here are some movies that are similar to "Just Before I Go" and "Deewaar":

1. **The Breakfast Club** (1985) - This classic John Hughes film explores themes of identity, social class, and the struggles of adolescence.
2. **Stand by Me** (1986) - Based on a novella by Stephen King, this coming-of-age drama follows four young friends as they embark on a journey to find the body of a missing boy.
3. **The Perks of Being a Wallflower** (2012) - This coming-of-age drama follows Charlie, a shy and introverted teenager, as he navigates his freshman year of high school and struggles with mental health issues.

I chose these movies because they all share similar themes and elements to "Just Before I Go" and "Deewaar". They are all coming-of-age stories that explore the struggles and challenges faced by teenagers as they navigate their way through adolescence.


## Test 3: input query --> generic movies with different features in AND

In [18]:
request = """Report me some movies which are similar to "Sniper: Ghost Shooter" and "Lady Ice".
After reporting the movies, explain why you chose them."""

resp = rag_chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10518.49 ms
llama_print_timings:      sample time =    1005.27 ms /   183 runs   (    5.49 ms per token,   182.04 tokens per second)
llama_print_timings: prompt eval time =    7371.58 ms /    36 tokens (  204.77 ms per token,     4.88 tokens per second)
llama_print_timings:        eval time =  171061.17 ms /   182 runs   (  939.90 ms per token,     1.06 tokens per second)
llama_print_timings:       total time =  230175.28 ms /   218 tokens


 Movies Similar to "Sniper: Ghost Shooter" and "Lady Ice":

1. **The Accountant** (2016) - This action-thriller stars Ben Affleck as a socially awkward accountant who moonlights as an assassin.
2. **Atomic Blonde** (2017) - Set in 1980s Berlin, this spy thriller follows Charlize Theron's character, Lorraine Broughton, a top-level spy sent to Berlin to take down a espionage ring.
3. **The Bourne Legacy** (2012) - This action-thriller is part of the Jason Bourne series and stars Jeremy Renner as Aaron Cross, a highly trained assassin suffering from memory loss.

I chose these movies because they share similar themes and elements with "Sniper: Ghost Shooter" and "Lady Ice". They all feature high-stakes action sequences, complex characters, and intricate plots.


## Test 4: input query --> generic movies with very different features in AND

In [21]:
request = """Report me some movies which are similar to "Chasing Ghosts: Beyond the Arcade" or "Dancin': It's on!".
After reporting the movies, explain why you chose them."""

resp = rag_chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10518.49 ms
llama_print_timings:      sample time =     957.06 ms /   173 runs   (    5.53 ms per token,   180.76 tokens per second)
llama_print_timings: prompt eval time =    4304.55 ms /    36 tokens (  119.57 ms per token,     8.36 tokens per second)
llama_print_timings:        eval time =  228255.00 ms /   172 runs   ( 1327.06 ms per token,     0.75 tokens per second)
llama_print_timings:       total time =  279139.75 ms /   208 tokens


 Movies Similar to "Chasing Ghosts: Beyond the Arcade" and "Dancin': It's on!":

1. **The King of Kong: A Fistful of Quarters** (2007) - This documentary-style comedy film is about a group of gamers competing in a Donkey Kong tournament.
2. **The Gamers: Dorkness Rising** (2008) - This mockumentary film follows a group of friends who are obsessed with tabletop role-playing games.
3. **Win, Lose or Draw** (1989) - This family-friendly game show features contestants competing in various drawing challenges.

I chose these movies because they all share elements of competition, gaming, and camaraderie that are similar to the themes found in "Chasing Ghosts: Beyond the Arcade" and "Dancin': It's on!".
