In [1]:
import os
import torch
from datetime import datetime

In [2]:
# Inizializzazione del Device
device = None
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Device:", torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print("Device: CPU")

Device: NVIDIA GeForce RTX 3060


# Prima Sezione -> Interazione con Milvus

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.milvus import Milvus
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
# Questo Blocco di Codice serve per effettuare lo Splitting dei Documenti memorizzati in "movie_text_data"
docs = list()

start = datetime.now()

for file in os.listdir('./movie_text_data/'):
    loader = TextLoader(f'./movie_text_data/{file}')
    docs.extend(loader.load())

print("Number of Documents:", len(docs))
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)  # DA OTTIMIZZARE
all_splits = text_splitter.split_documents(docs)

print("Required Time:", datetime.now() - start)

Number of Documents: 58022
Required Time: 0:07:07.848427


In [5]:
# Inizializzazione del Modello di Embedding -> all-MiniLM-L12-v2 (HuggingFace, len(Spazio Latente)=384)
model_name = "sentence-transformers/all-MiniLM-L12-v2"
model_kwargs = {'device': device}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [6]:
# Connessione a Milvus (deve essere prima fatto partire il Container su Docker!)
vector_db = Milvus(
    embedding_function=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    collection_name="MovieTextData",
    drop_old=False
)

In [6]:
# Attenzione!!! Il seguente Blocco di Codice reinizializza Milvus ed esegue il Caricamento dei Documenti memorizzati in 'all_splits'ArithmeticError
start = datetime.now()

vector_db = Milvus.from_documents(
    documents=all_splits,
    embedding=embeddings,
    connection_args={'host':'127.0.0.1', 'port':'19530'},
    collection_name="MovieTextData",
    drop_old=True
)

print("Required Time:", datetime.now() - start)

Required Time: 0:01:30.029326


In [7]:
# Esegui il seguente Blocco di Codice per ottenere informazioni sulla Collezione in Milvus
print(f"Default collection name - {vector_db.collection_name}")
print(f"Default search params - {vector_db.search_params}")
print(f"Default index params - {vector_db.index_params}")

Default collection name - MovieTextData
Default search params - {'metric_type': 'L2', 'params': {'ef': 10}}
Default index params - None


# Seconda Sezione -> Inizializzazione di Llama3-8b-Instruct

In [8]:
from langchain_community.llms import LlamaCpp

In [9]:
model_path = "./models/Meta-Llama-3-8B-Instruct-Q6_K.gguf"
n_threads = int(os.cpu_count()/2)

In [10]:
llm = LlamaCpp(
        model_path=model_path,
        n_gpu_layers=-1,
        n_batch=1024,
        temperature=0.1,
        n_threads=n_threads,
        max_tokens=8192,
        n_ctx=2048,
        # f16_kv=True,
        # callback_manager=CallbackManager(StreamingStdOutCallbackHandler()),
        verbose=True
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ./models/Meta-Llama-3-8B-Instruct-Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.att

# Terza Sezione -> Recupero delle Preferenze di un certo User (per ora, uno User a scelta)

In [11]:
import pandas as pd
from inf_retriever import InfRetriever

In [12]:
ir = InfRetriever(userId=24)     # User scelto arbitrariamente
ir.computeUserPreferences(sample_size=100)   

# Il metodo 'computeUserPreferences' esegue le seguenti operazioni:
#   1) Recupera gli ID dei Film a cui lo User ha dato almeno '4.5' come punteggio
#   2) Campiona 'sample_size' di questi Film -> Attenzione! Se il valore di 'sample_size' è maggiore del numero di Film 'candidati'
#       questo automaticamente assume il valore 'int(0.67*len(candidati))'

In [13]:
titles = ir.getMovieTitles()
titles

['Usual Suspects, The (1995)',
 'Shawshank Redemption, The (1994)',
 "Schindler's List (1993)",
 'Jungle Book, The (1967)',
 'Fight Club (1999)',
 'Gone in 60 Seconds (2000)',
 'Dark Knight, The (2008)']

In [14]:
details = ir.getMovieDetailsDF()
details

Unnamed: 0,title,director,genres,plot,year,cast
50,The Usual Suspects,Bryan Singer,"Crime, Drama, Mystery",The sole survivor of a pier shoot-out tells th...,1995,"Kevin Spacey, Gabriel Byrne, Chazz Palminteri"
318,The Shawshank Redemption,Frank Darabont,Drama,"Over the course of several years, two convicts...",1994,"Tim Robbins, Morgan Freeman, Bob Gunton"
527,Schindler's List,Steven Spielberg,"Biography, Drama, History","In German-occupied Poland during World War II,...",1993,"Liam Neeson, Ralph Fiennes, Ben Kingsley"
2078,The Jungle Book,Wolfgang Reitherman,"Animation, Adventure, Comedy",Bagheera the Panther and Baloo the Bear have a...,1967,"Phil Harris, Sebastian Cabot, Louis Prima"
2959,Fight Club,David Fincher,Drama,An insomniac office worker and a devil-may-car...,1999,"Brad Pitt, Edward Norton, Meat Loaf"
3717,Gone in 60 Seconds,Dominic Sena,"Action, Crime, Thriller",A retired master car thief must come back to t...,2000,"Nicolas Cage, Angelina Jolie, Giovanni Ribisi"
58559,The Dark Knight,Christopher Nolan,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...,2008,"Christian Bale, Heath Ledger, Aaron Eckhart"


In [15]:
titles = details['title'].tolist()
details = details[['director', 'genres', 'plot', 'year', 'cast']]

# Quarta Sezione -> Si chiede a Llama di "riassumere" le preferenze del nostro User

In [16]:
from langchain_core.prompts import PromptTemplate

In [31]:
summarization_template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
I am going to give you a table composed by the following columns: 'director', 'genres', 'plot', 'year', 'cast'.
I give you the task of summarizing the informations contained in each column of the table.

Here are some further instructions:
* The 'director' column contains the name of the director of the movies: report the two most common directors.
* The 'genres' column contains the genres of the movies: report the two or three most common genres.
* The 'plot' column contains the plot of the movies: report the key concepts, events and people in the plots.
* The 'year' column contains the year of the movies: indentify some patterns.
* The 'cast' column contains the cast of the movies: report not more than three of four of the most common actors.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

In [32]:
summarization_prompt = PromptTemplate.from_template(summarization_template)

In [44]:
from langchain_core.runnables import RunnablePassthrough

chain = ({"question": RunnablePassthrough()} | summarization_prompt | llm)

In [45]:
request = f"""Summarize the following table: {details}"""
resp = chain.invoke(request)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10920.70 ms
llama_print_timings:      sample time =     243.47 ms /   166 runs   (    1.47 ms per token,   681.80 tokens per second)
llama_print_timings: prompt eval time =     779.45 ms /   552 tokens (    1.41 ms per token,   708.20 tokens per second)
llama_print_timings:        eval time =    4516.06 ms /   165 runs   (   27.37 ms per token,    36.54 tokens per second)
llama_print_timings:       total time =    6650.16 ms /   717 tokens


In [46]:
print(str(resp))

 Here is a summary of the table:

**Director:** The two most common directors are Bryan Singer and Frank Darabont.

**Genre:** The three most common genres are Crime, Drama, and Mystery.

**Plot:** Some key concepts, events, and people in the plots include pier shoot-outs, prison life, World War II, and a battle between good and evil.

**Year:** The years represented in the table range from 1967 to 2008.

**Cast:** Some notable actors who have appeared in these movies include Kevin Spacey, Gabriel Byrne, Tim Robbins, Morgan Freeman, Liam Neeson, Ralph Fiennes, Ben Kingsley, Brad Pitt, Edward Norton, Meat Loaf, Nicolas Cage, Angelina Jolie, Giovanni Ribisi, Christian Bale, Heath Ledger, and Aaron Eckhart.


# Quinta Sezione -> generazione delle Raccomandazioni

In [47]:
rec_template = """<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an helpful AI assistant, which is expert about the field of Movies and Cinema.
You have to answer to the request in a precise way and strictly following what is asked: you MUST use the search results to build the answer.
If the asked informations don't exist within the search results, DON'T INVENT THEM AND JUST ANSWER 'I don't know how to answer to this question'. 
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Question: {question}
<|eot_id|>

<start_header_id|>assistant<|end_header_id|>
Answer: """

rec_prompt = PromptTemplate.from_template(rec_template)

In [48]:
retriever = vector_db.as_retriever()

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | rec_prompt | llm
)

In [51]:
request = f"""Report me some movies which correspond to at least one of these features: 
* Director: Bryan Singer or Frank Darabont 
* Genre: Crime, Drama, Mystery
* Plot: 'shoot-outs', 'prison life', 'World War II', 'battle between good and evil'
* Cast: Kevin Spacey or Morgan Freeman or Liam Neeson or Ralph Fiennes or Ben Kingsley

Give more importance to a movie if it corresponds to more than one of the listed features.
After reporting the movies, explain why you chose them.

You must not report the movies whose titles are in the following list: {titles}"""

resp = rag_chain.invoke(request)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   10920.70 ms
llama_print_timings:      sample time =     316.65 ms /   219 runs   (    1.45 ms per token,   691.62 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    5682.96 ms /   219 runs   (   25.95 ms per token,    38.54 tokens per second)
llama_print_timings:       total time =    7494.70 ms /   220 tokens


In [52]:
print(str(resp))

 Movies that correspond to at least one of the listed features:

1. **X-Men** (2000) - Director: Bryan Singer; Genre: Superhero, Action; Plot: Battle between good and evil; Cast: Hugh Jackman, Patrick Stewart, Ian McKellen.
2. **The Green Mile** (1999) - Director: Frank Darabont; Genre: Drama, Fantasy; Plot: Prison life; Cast: Tom Hanks, Michael Clarke Duncan, Sam Rockwell.
3. **Saving Private Ryan** (1998) - Director: Steven Spielberg; Genre: War, Drama; Plot: World War II; Cast: Tom Hanks, Matt Damon, Vin Diesel.

I chose these movies because they correspond to at least one of the listed features. For example, **X-Men** corresponds to the feature "Director: Bryan Singer" and also to the feature "Genre: Superhero, Action". Similarly, **The Green Mile** corresponds to the feature "Director: Frank Darabont" and also to the feature "Genre: Drama, Fantasy".


In [53]:
titles

['The Usual Suspects',
 'The Shawshank Redemption',
 "Schindler's List",
 'The Jungle Book',
 'Fight Club',
 'Gone in 60 Seconds',
 'The Dark Knight']

Tutte le informazioni sui Film raccomandati sono corrette (verificate da IMDB):
- 8 1/2 (id=1251) -> https://www.imdb.com/title/tt0056801/
- Lost in Translation (id=6711) -> https://www.imdb.com/title/tt0335266/
- Pulp Fiction (id=296) -> https://www.imdb.com/title/tt0110912/
- The Seventh Seal (id=1237) -> https://www.imdb.com/title/tt0050976/