In [1]:
import os
import torch
from datetime import datetime
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.milvus import Milvus
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough


In [2]:
import torch

device = None
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model_name = "sentence-transformers/all-MiniLM-L12-v2"
model_kwargs = {'device': device}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)



# Prima Sezione -> Interazione con Milvus

In [3]:
# Questo Blocco di Codice serve per effettuare lo Splitting dei Documenti memorizzati in "movie_text_data"
docs = list()

start = datetime.now()

for file in os.listdir('../movie_text_data/'):
    loader = TextLoader(f'../movie_text_data/{file}')
    docs.extend(loader.load())

print("Number of Documents:", len(docs))
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)  # DA OTTIMIZZARE
all_splits = text_splitter.split_documents(docs)

print("Required Time:", datetime.now() - start)

Number of Documents: 58022
Required Time: 0:00:10.171321


In [4]:
# Connessione a Milvus (deve essere prima fatto partire il Container su Docker!)
vector_db = Milvus(
    embedding_function=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    drop_old=False
)

In [8]:
# Attenzione!!! Il seguente Blocco di Codice reinizializza Milvus ed esegue il Caricamento dei Documenti memorizzati in 'all_splits'ArithmeticError
start = datetime.now()

vector_db = Milvus.from_documents(
    documents=all_splits,
    embedding=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    collection_name="MovieTextData",
    drop_old=True
)

print("Required Time:", datetime.now() - start)

Required Time: 0:07:51.790059


In [5]:
# Esegui il seguente Blocco di Codice per ottenere informazioni sulla Collezione in Milvus
print(f"Default collection name - {vector_db.collection_name}")
print(f"Default search params - {vector_db.search_params}")
print(f"Default index params - {vector_db.index_params}")

Default collection name - LangChainCollection
Default search params - None
Default index params - None


# Seconda Sezione -> Inizializzazione di Llama3-8b-Instruct

In [6]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

In [7]:
llm = LlamaCpp(
        model_path="../models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
        n_gpu_layers=-1,
        n_batch=1024,
        temperature=0.2,
        n_threads=8,
        max_tokens=8192,
        # f16_kv=True,
        # callback_manager=CallbackManager(StreamingStdOutCallbackHandler()),
        verbose=True
)

llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from ../models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.

# Terza Sezione -> Recupero delle Preferenze di un certo User (per ora, uno User a scelta)

In [8]:
import pandas as pd
import os
os.chdir("/Users/alessandropesare/software/GitHub/BD-RecSysWithRAG/")
from inf_retriever import InfRetriever

### TEST
Users--> 111,400,7,95,8115,280,322,294,5661,15003

In [27]:
ir = InfRetriever(userId=5661)     # User scelto arbitrariamente
ir.computeUserPreferences(sample_size=10)   

# Il metodo 'computeUserPreferences' esegue le seguenti operazioni:
#   1) Recupera gli ID dei Film a cui lo User ha dato almeno '4.5' come punteggio
#   2) Campiona 'sample_size' di questi Film -> Attenzione! Se il valore di 'sample_size' è maggiore del numero di Film 'candidati'
#       questo automaticamente assume il valore 'int(0.67*len(candidati))'ArithmeticError

In [28]:
titles = ir.getMovieTitles()
titles

['Bridges of Madison County, The (1995)',
 'Apollo 13 (1995)',
 'Clear and Present Danger (1994)',
 'Net, The (1995)',
 'Disclosure (1994)',
 'Lion King, The (1994)',
 'Fugitive, The (1993)',
 'Crimson Tide (1995)',
 'Piano, The (1993)',
 'Dolores Claiborne (1995)']

In [29]:
details = ir.getMovieDetailsDF()
details

Unnamed: 0,title,director,genres,plot,year,cast
105,The Bridges of Madison County,Clint Eastwood,"Drama, Romance",Photographer Robert Kincaid wanders into the l...,1995,"Clint Eastwood, Meryl Streep, Annie Corley"
150,Apollo 13,Ron Howard,"Adventure, Drama, History",NASA must devise a strategy to return Apollo 1...,1995,"Tom Hanks, Bill Paxton, Kevin Bacon"
349,Clear and Present Danger,Phillip Noyce,"Action, Crime, Drama",CIA Analyst Jack Ryan is drawn into an illegal...,1994,"Harrison Ford, Willem Dafoe, Anne Archer"
185,The Net,Irwin Winkler,"Action, Crime, Drama","When Angela Bennett, a computer programmer, st...",1995,"Sandra Bullock, Jeremy Northam, Dennis Miller"
225,Disclosure,Barry Levinson,"Drama, Thriller",A computer specialist is sued for sexual haras...,1994,"Michael Douglas, Demi Moore, Donald Sutherland"
364,The Lion King,Roger Allers,"Animation, Adventure, Drama",Lion prince Simba and his father are targeted ...,1994,"Matthew Broderick, Jeremy Irons, James Earl Jones"
457,The Fugitive,Andrew Davis,"Action, Crime, Drama","Dr. Richard Kimble, unjustly accused of murder...",1993,"Harrison Ford, Tommy Lee Jones, Sela Ward"
161,Crimson Tide,Tony Scott,"Action, Drama, Thriller","On a U.S. nuclear missile sub, a young First O...",1995,"Gene Hackman, Denzel Washington, Matt Craven"
509,The Piano,Jane Campion,"Drama, Music, Romance",In the mid-19th century a mute woman is sent t...,1993,"Holly Hunter, Harvey Keitel, Sam Neill"
230,Dolores Claiborne,Taylor Hackford,"Crime, Drama, Mystery",A big city reporter travels to a small town wh...,1995,"Kathy Bates, Jennifer Jason Leigh, Christopher..."


# Quarta Sezione -> generazione delle Raccomandazioni

In [30]:
rec_template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
You have to answer to the request in a precise way and strictly following what is asked: you MUST use the search results to build the answer.
If the asked informations don't exist within the search results, DON'T INVENT THEM AND JUST ANSWER 'I don't know how to answer to this question'. 
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

rec_prompt = PromptTemplate.from_template(rec_template)

In [31]:
retriever = vector_db.as_retriever()

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | rec_prompt | llm
)

In [32]:
request = """Report me some movies which are similar to "Disclosure".
After reporting the movies, explain why you chose them."""

resp = rag_chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10721.62 ms
llama_print_timings:      sample time =    1073.03 ms /   185 runs   (    5.80 ms per token,   172.41 tokens per second)
llama_print_timings: prompt eval time =    9210.70 ms /    27 tokens (  341.14 ms per token,     2.93 tokens per second)
llama_print_timings:        eval time =  403931.11 ms /   184 runs   ( 2195.28 ms per token,     0.46 tokens per second)
llama_print_timings:       total time =  492125.22 ms /   211 tokens


 Here are some movies that are similar to "Disclosure":

1. **The Net** (1995) - This movie also explores themes of surveillance and privacy in a high-tech world.

2. **Enemy of the State** (1998) - This action-comedy film, starring Will Smith, also deals with government surveillance and corruption.

3. **Minority Report** (2002) - Based on Philip K. Dick's short story, this sci-fi thriller explores themes of free will and predestination in a world where crimes can be predicted and prevented.

4. **The Truman Show** (1998) - This satirical film, directed by Peter Weir, also explores the theme of surveillance and control in a seemingly ordinary life.

I chose these movies because they all share similar themes with "Disclosure", such as government surveillance, privacy concerns, and the blurring of lines between public and private spaces.
