# Step 1: Import necessary Librabries

In [None]:
!pip install transformers -U -q
!pip install sentence-transformers -q

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-huggingface 0.0.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.74 which is incompatible.


In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange





# Step 2: Load Dataset (movies dataset)

In [3]:
path = "imdb_movies.csv"
movies = pd.read_csv(path)   # replace with your file
movies = movies.dropna(subset=["names", "overview"])  # ensure text available

print("Dataset shape:", movies.shape)
movies

Dataset shape: (10178, 12)


Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,2.716167e+08,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2.316795e+09,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,7.244590e+08,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,3.420000e+07,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,3.409420e+08,US
...,...,...,...,...,...,...,...,...,...,...,...,...
10173,20th Century Women,12/28/2016,73.0,Drama,"In 1979 Santa Barbara, California, Dorothea Fi...","Annette Bening, Dorothea Fields, Lucas Jade Zu...",20th Century Women,Released,English,7000000.0,9.353729e+06,US
10174,Delta Force 2: The Colombian Connection,08/24/1990,54.0,Action,When DEA agents are taken captive by a ruthles...,"Chuck Norris, Col. Scott McCoy, Billy Drago, R...",Delta Force 2: The Colombian Connection,Released,English,9145817.8,6.698361e+06,US
10175,The Russia House,12/21/1990,61.0,"Drama, Thriller, Romance","Barley Scott Blair, a Lisbon-based editor of R...","Sean Connery, Bartholomew 'Barley' Scott Blair...",The Russia House,Released,English,21800000.0,2.299799e+07,US
10176,Darkman II: The Return of Durant,07/11/1995,55.0,"Action, Adventure, Science Fiction, Thriller, ...",Darkman and Durant return and they hate each o...,"Larry Drake, Robert G. Durant, Arnold Vosloo, ...",Darkman II: The Return of Durant,Released,English,116000000.0,4.756613e+08,US


# Step 3: Load Pretrained Sentence Transformer Model

In [4]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 4: Encode Movie Overviews


In [5]:
movie_embeddings = model.encode(
                                movies["overview"].tolist(),
                                convert_to_tensor=True,
                                show_progress_bar=True
                                )

print("Embeddings shape:", movie_embeddings.shape)

Batches:   0%|          | 0/319 [00:00<?, ?it/s]

Embeddings shape: torch.Size([10178, 384])


# Step 5: Build Recommendation Function

In [6]:
def recommend(movie_title, top_k=5):
    # find index
    idx = movies[movies["names"].str.lower() == movie_title.lower()].index
    if len(idx) == 0:
        return f"Movie '{movie_title}' not found."
    idx = idx[0]

    query_embedding = movie_embeddings[idx]

    # cosine similarity
    scores = util.cos_sim(query_embedding, movie_embeddings)[0].cpu().numpy()
    # top results
    top_indices = np.argsort(scores)[::-1][1:top_k+1]  # skip the movie itself

    # Usa "names" invece di "title" perché quella è la colonna che esiste nel dataset
    recommendations = movies.iloc[top_indices][["names", "overview"]]
    return recommendations

In [7]:
# Prima controlliamo le colonne disponibili nel dataset
print("Colonne disponibili nel dataset:")
print(movies.columns.tolist())
print("\n" + "="*50 + "\n")

# Example
print(recommend("Mummies"))

Colonne disponibili nel dataset:
['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title', 'status', 'orig_lang', 'budget_x', 'revenue', 'country']


                        names  \
4835  7 Guardians of the Tomb   
412    The Mummy Resurrection   
8528     Flowers in the Attic   
7692                  The Dig   
6850       Death at a Funeral   

                                               overview  
4835  An innocent discovery of a well-preserved mumm...  
412   When an infamous "cursed" Egyptian sarcophagus...  
8528  After the death of her husband, a mother takes...  
7692  As WWII looms, a wealthy widow hires an amateu...  
6850  A myriad of outrageous calamities befalls an e...  


# Step 6: Save Model and Data in Pickle Format

In [8]:
import joblib
import os

pickle_dir = "pickle_model"
os.makedirs(pickle_dir, exist_ok=True)

print(f"Directory '{pickle_dir}' created/verified successfully!")

Directory 'pickle_model' created/verified successfully!


In [9]:
# 1. Salvare il modello SentenceTransformer
model_path = os.path.join(pickle_dir, "sentence_transformer_model.pkl")
joblib.dump(model, model_path)
print(f"✅ Modello SentenceTransformer salvato in: {model_path}")

# 2. Salvare gli embeddings dei film
embeddings_path = os.path.join(pickle_dir, "movie_embeddings.pkl")
joblib.dump(movie_embeddings, embeddings_path)
print(f"✅ Embeddings dei film salvati in: {embeddings_path}")

# 3. Salvare il dataset dei film
movies_path = os.path.join(pickle_dir, "movies_dataset.pkl")
joblib.dump(movies, movies_path)
print(f"✅ Dataset dei film salvato in: {movies_path}")

✅ Modello SentenceTransformer salvato in: pickle_model\sentence_transformer_model.pkl
✅ Embeddings dei film salvati in: pickle_model\movie_embeddings.pkl
✅ Dataset dei film salvato in: pickle_model\movies_dataset.pkl
✅ Embeddings dei film salvati in: pickle_model\movie_embeddings.pkl
✅ Dataset dei film salvato in: pickle_model\movies_dataset.pkl


In [11]:
# 4. Salvare tutto in un singolo file per facilità di caricamento
complete_model_data = {
                    'model': model,
                    'embeddings': movie_embeddings,
                    'movies_data': movies,
                    'recommend_function': recommend
                    }

complete_path = os.path.join(pickle_dir, "complete_recommendation_system.pkl")
joblib.dump(complete_model_data, complete_path)
print(f"Sistema completo di raccomandazione salvato in: {complete_path}")


Sistema completo di raccomandazione salvato in: pickle_model\complete_recommendation_system.pkl


# Step 7: Come Ricaricare il Modello (Esempio)

In [12]:
# Esempio di come ricaricare il sistema completo con joblib
def load_recommendation_system(pickle_dir="pickle_model"):
    """
    Carica il sistema di raccomandazione completo da file joblib
    """
    complete_path = os.path.join(pickle_dir, "complete_recommendation_system.pkl")
    
    data = joblib.load(complete_path)
    
    return data['model'], data['embeddings'], data['movies_data'], data['recommend_function']

# Funzione alternativa per caricare i componenti singolarmente
def load_components_separately(pickle_dir="pickle_model"):
    """
    Carica i componenti singolarmente per maggiore flessibilità
    """
    model_path = os.path.join(pickle_dir, "sentence_transformer_model.pkl")
    embeddings_path = os.path.join(pickle_dir, "movie_embeddings.pkl")
    movies_path = os.path.join(pickle_dir, "movies_dataset.pkl")
    
    model = joblib.load(model_path)
    embeddings = joblib.load(embeddings_path)
    movies_data = joblib.load(movies_path)
    
    return model, embeddings, movies_data

# Per usarlo in futuro:
# loaded_model, loaded_embeddings, loaded_movies, loaded_recommend_func = load_recommendation_system()
# recommendations = loaded_recommend_func("Avatar")

In [None]:
loaded_model, loaded_embeddings, loaded_movies, loaded_recommend_func = load_recommendation_system()
recommendations = loaded_recommend_func("Avatar")
recommendations