In [1]:
import os
import torch
from datetime import datetime

In [2]:
# Inizializzazione del Device
device = None
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Device:", torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print("Device: CPU")

Device: NVIDIA GeForce RTX 3060


# Prima Sezione -> Interazione con Milvus

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.milvus import Milvus
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
# Questo Blocco di Codice serve per effettuare lo Splitting dei Documenti memorizzati in "movie_text_data"
docs = list()

start = datetime.now()

for file in os.listdir('./movie_text_data/'):
    loader = TextLoader(f'./movie_text_data/{file}')
    docs.extend(loader.load())

print("Number of Documents:", len(docs))
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)  # DA OTTIMIZZARE
all_splits = text_splitter.split_documents(docs)

print("Required Time:", datetime.now() - start)

Number of Documents: 58022
Required Time: 0:07:07.848427


In [5]:
# Inizializzazione del Modello di Embedding -> all-MiniLM-L12-v2 (HuggingFace, len(Spazio Latente)=384)
model_name = "sentence-transformers/all-MiniLM-L12-v2"
model_kwargs = {'device': device}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [None]:
# Connessione a Milvus (deve essere prima fatto partire il Container su Docker!)
vector_db = Milvus(
    embedding_function=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    drop_old=False
)

In [6]:
# Attenzione!!! Il seguente Blocco di Codice reinizializza Milvus ed esegue il Caricamento dei Documenti memorizzati in 'all_splits'ArithmeticError
start = datetime.now()

vector_db = Milvus.from_documents(
    documents=all_splits,
    embedding=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    collection_name="MovieTextData",
    drop_old=True
)

print("Required Time:", datetime.now() - start)

Required Time: 0:01:30.029326


In [7]:
# Esegui il seguente Blocco di Codice per ottenere informazioni sulla Collezione in Milvus
print(f"Default collection name - {vector_db.collection_name}")
print(f"Default search params - {vector_db.search_params}")
print(f"Default index params - {vector_db.index_params}")

Default collection name - MovieTextData
Default search params - {'metric_type': 'L2', 'params': {'ef': 10}}
Default index params - {'metric_type': 'L2', 'index_type': 'HNSW', 'params': {'M': 8, 'efConstruction': 64}}


# Seconda Sezione -> Inizializzazione di Llama3-8b-Instruct

In [8]:
from langchain_community.llms import LlamaCpp

In [9]:
model_path = "./models/Meta-Llama-3-8B-Instruct-Q6_K.gguf"
n_threads = int(os.cpu_count()/2)

In [24]:
llm = LlamaCpp(
        model_path=model_path,
        n_gpu_layers=-1,
        n_batch=1024,
        temperature=0.1,
        n_threads=n_threads,
        max_tokens=8192,
        n_ctx=2048,
        # f16_kv=True,
        # callback_manager=CallbackManager(StreamingStdOutCallbackHandler()),
        verbose=True
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ./models/Meta-Llama-3-8B-Instruct-Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.att

# Terza Sezione -> Recupero delle Preferenze di un certo User (per ora, uno User a scelta)

In [11]:
import pandas as pd
from inf_retriever import InfRetriever

In [12]:
ir = InfRetriever(userId=1)     # User scelto arbitrariamente
ir.computeUserPreferences(sample_size=10)   

# Il metodo 'computeUserPreferences' esegue le seguenti operazioni:
#   1) Recupera gli ID dei Film a cui lo User ha dato almeno '4.5' come punteggio
#   2) Campiona 'sample_size' di questi Film -> Attenzione! Se il valore di 'sample_size' è maggiore del numero di Film 'candidati'
#       questo automaticamente assume il valore 'int(0.67*len(candidati))'ArithmeticError

In [13]:
titles = ir.getMovieTitles()
titles

['Strada, La (1954)',
 'Lost in Translation (2003)',
 'Pulp Fiction (1994)',
 'Black Cat, White Cat (Crna macka, beli macor) (1998)',
 'Run Lola Run (Lola rennt) (1998)',
 'Teddy Bear (Mis) (1981)',
 'Saragossa Manuscript, The (Rekopis znaleziony w Saragossie) (1965)',
 "Spanish Apartment, The (L'auberge espagnole) (2002)",
 'The Magician (1958)',
 "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)"]

In [16]:
details = ir.getMovieDetailsDF()
details

Unnamed: 0,title,director,genres,plot,year,cast
7234,The Road,Federico Fellini,Drama,A care-free girl is sold to a traveling entert...,1954,"Anthony Quinn, Giulietta Masina, Richard Basehart"
6711,Lost in Translation,Sofia Coppola,"Comedy, Drama",A faded movie star and a neglected young woman...,2003,"Bill Murray, Scarlett Johansson, Giovanni Ribisi"
296,Pulp Fiction,Quentin Tarantino,"Crime, Drama","The lives of two mob hitmen, a boxer, a gangst...",1994,"John Travolta, Uma Thurman, Samuel L. Jackson"
2843,"Black Cat, White Cat",Emir Kusturica,"Comedy, Crime, Romance",Matko and his son Zare live on the banks of th...,1998,"Bajram Severdzan, Srdjan 'Zika' Todorovic, Bra..."
2692,Run Lola Run,Tom Tykwer,"Action, Crime, Thriller","After a botched money delivery, Lola has 20 mi...",1998,"Franka Potente, Moritz Bleibtreu, Herbert Knaup"
5767,Teddy Bear,Stanislaw Bareja,Comedy,The main character is the manager of a sport c...,1981,"Stanislaw Tym, Barbara Burska, Christine Paul"
2632,The Saragossa Manuscript,Wojciech Has,"Adventure, Comedy, Drama",Upon finding a book that relates his grandfath...,1965,"Zbigniew Cybulski, Iga Cembrzynska, Elzbieta C..."
6370,The Spanish Apartment,Cédric Klapisch,"Comedy, Drama, Romance",A strait-laced French student moves into an ap...,2002,"Romain Duris, Judith Godrèche, Kelly Reilly"
7940,The Magician,Ingmar Bergman,"Comedy, Drama",A traveling magician and his assistants are pe...,1958,"Max von Sydow, Ingrid Thulin, Gunnar Björnstrand"
4973,Amélie,Jean-Pierre Jeunet,"Comedy, Romance","Despite being caught in her imaginative world,...",2001,"Audrey Tautou, Mathieu Kassovitz, Rufus"


In [17]:
details = details[['director', 'genres', 'plot', 'year', 'cast']]

# Quarta Sezione -> Si chiede a Llama di "riassumere" le preferenze del nostro User

In [18]:
from langchain_core.prompts import PromptTemplate

In [41]:
summarization_template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
I am going to give you a table composed by the following columns: 'director', 'genres', 'plot', 'year', 'cast'.
I give you the task of summarizing the informations contained in each column of the table.

Here are some further instructions:
* The 'director' column contains the name of the director of the movies: report the two most common or famous directors.
* The 'genres' column contains the genres of the movies: report the two or three most common genres.
* The 'plot' column contains the plot of the movies: report the key concepts, events and people in the plots.
* The 'year' column contains the year of the movies: indentify some patterns.
* The 'cast' column contains the cast of the movies: report at most three of four of the most common and famous actors.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

In [42]:
summarization_prompt = PromptTemplate.from_template(summarization_template)

In [43]:
from langchain_core.runnables import RunnablePassthrough

chain = ({"question": RunnablePassthrough()} | summarization_prompt | llm)

In [44]:
request = f"""Summarize the following table: {details}"""
resp = chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =    9907.85 ms
llama_print_timings:      sample time =     153.64 ms /   107 runs   (    1.44 ms per token,   696.43 tokens per second)
llama_print_timings: prompt eval time =    7852.61 ms /   715 tokens (   10.98 ms per token,    91.05 tokens per second)
llama_print_timings:        eval time =   27480.49 ms /   106 runs   (  259.25 ms per token,     3.86 tokens per second)
llama_print_timings:       total time =   36198.87 ms /   821 tokens


 Here is a summary of the table:

**Director**: The directors mentioned in the table are:

* Federico Fellini (7 times)
* Sofia Coppola (1 time)
* Quentin Tarantino (1 time)
* Ingmar Bergman (1 time)

**Actors and Actresses**: Some actors and actresses mentioned in the table are:

* Anthony Quinn
* Bill Murray
* John Travolta
* Franka Potente
* Romain Duris
* Max von Sydow
* Sofia Coppola


# Quinta Sezione -> generazione delle Raccomandazioni

In [45]:
rec_template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
You have to answer to the request in a precise way and strictly following what is asked: you MUST use the search results to build the answer.
If the asked informations don't exist within the search results, DON'T INVENT THEM AND JUST ANSWER 'I don't know how to answer to this question'. 
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

rec_prompt = PromptTemplate.from_template(rec_template)

In [46]:
retriever = vector_db.as_retriever()

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | rec_prompt | llm
)

In [47]:
request = """Report me some movies which correspond to at least one of these features: 
* Director: Federico Fellini or Sofia Coppola or Quentin Tarantino or Ingmar Bergman
* Cast: Anthony Quinn or Bill Murray or John Travolta or Audrey Tautou

Give more importance to a movie if it corresponds to more than one of the listed features.
After reporting the movies, explain why you chose them and assign a score (up to 5) to each movie basing on how much it corresponds to the listed features."""

resp = rag_chain.invoke(request)
print(str(resp))

Llama.generate: prefix-match hit

llama_print_timings:        load time =    9907.85 ms
llama_print_timings:      sample time =     331.65 ms /   231 runs   (    1.44 ms per token,   696.51 tokens per second)
llama_print_timings: prompt eval time =    2707.78 ms /   195 tokens (   13.89 ms per token,    72.01 tokens per second)
llama_print_timings:        eval time =   53899.09 ms /   230 runs   (  234.34 ms per token,     4.27 tokens per second)
llama_print_timings:       total time =   58508.94 ms /   425 tokens


 Movies that correspond to at least one of the following features:

* Director: Federico Fellini or Sofia Coppola or Quentin Tarantino or Ingmar Bergman
* Cast: Anthony Quinn or Bill Murray or John Travolta or Audrey Tautou

Here are some movies that correspond to at least one of these features:

1. **8 1/2** (1963) - Directed by Federico Fellini, starring Marcello Mastroianni.
	* Score: 5
2. **Lost in Translation** (2003) - Directed by Sofia Coppola, starring Bill Murray and Scarlett Johansson.
	* Score: 4
3. **Pulp Fiction** (1994) - Directed by Quentin Tarantino, starring John Travolta, Samuel L. Jackson, and Uma Thurman.
	* Score: 5
4. **The Seventh Seal** (1957) - Directed by Ingmar Bergman, starring Max von Sydow and Gunnar Björnstrand.
	* Score: 5

These movies are highly acclaimed for their unique storytelling, cinematography, and direction.


Tutte le informazioni sui Film raccomandati sono corrette (verificate da IMDB):
- 8 1/2 (id=1251) -> https://www.imdb.com/title/tt0056801/
- Lost in Translation (id=6711) -> https://www.imdb.com/title/tt0335266/
- Pulp Fiction (id=296) -> https://www.imdb.com/title/tt0110912/
- The Seventh Seal (id=1237) -> https://www.imdb.com/title/tt0050976/