# 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib
import os

# 2. Load dataset

In [4]:
# Keep only the title and overview columns, discard rows that have no overview,
# and renumber the rows 0..n-1 so index labels coincide with row positions.
movies = (
    pd.read_csv("imdb_movies.csv")
    .loc[:, ["names", "overview"]]
    .dropna(subset=["overview"])
    .reset_index(drop=True)
)

print("✅ Dataset loaded with shape:", movies.shape)
movies


✅ Dataset loaded with shape: (10178, 2)


Unnamed: 0,names,overview
0,Creed III,"After dominating the boxing world, Adonis Cree..."
1,Avatar: The Way of Water,Set more than a decade after the events of the...
2,The Super Mario Bros. Movie,"While working underground to fix a water main,..."
3,Mummies,"Through a series of unfortunate events, three ..."
4,Supercell,Good-hearted teenager William always lived in ...
...,...,...
10173,20th Century Women,"In 1979 Santa Barbara, California, Dorothea Fi..."
10174,Delta Force 2: The Colombian Connection,When DEA agents are taken captive by a ruthles...
10175,The Russia House,"Barley Scott Blair, a Lisbon-based editor of R..."
10176,Darkman II: The Return of Durant,Darkman and Durant return and they hate each o...


# 3. TF-IDF Vectorization

In [5]:
# TF-IDF vectorizer with English stop words removed and the vocabulary
# capped at 5000 features.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)

# Learn the vocabulary from the movie overviews and vectorize them in one pass
tfidf_matrix = tfidf.fit_transform(movies.loc[:, "overview"])

print("✅ TF-IDF matrix shape:", tfidf_matrix.shape)

✅ TF-IDF matrix shape: (10178, 5000)


# 4. Train KNN model

In [6]:
# Brute-force nearest-neighbour index over the TF-IDF rows, compared by cosine
# distance. n_neighbors=16 is only the default used when kneighbors() is
# called without an explicit n_neighbors.
knn = NearestNeighbors(
    algorithm="brute",
    metric="cosine",
    n_neighbors=16,
)
knn.fit(tfidf_matrix)

# 5. Test recommendation

In [7]:
def recommend(movie_title, top_k=5):
    """Print the titles of the movies whose overviews are closest to a given movie.

    Neighbours are looked up with the fitted ``knn`` index over
    ``tfidf_matrix`` (cosine distance between TF-IDF overview vectors).

    Args:
        movie_title: Exact title to look up in ``movies["names"]``.
        top_k: Number of recommendations to print (default 5).

    Returns:
        An error-message string when the title is not in the dataset;
        otherwise ``None`` (the recommendations are printed, not returned).
    """
    # Scan the title column once; positions equal index labels because
    # `movies` is built with reset_index(drop=True).
    positions = np.flatnonzero((movies["names"] == movie_title).to_numpy())
    if positions.size == 0:
        return f"❌ Movie '{movie_title}' not found in dataset."

    # Several rows may share a title; use the first one.
    idx = positions[0]

    # Ask for one extra neighbour so the query movie itself can be removed.
    _, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=top_k + 1)

    # Remove the query row by index rather than by rank: when another movie is
    # tied at the same distance (e.g. an identical overview) the query is not
    # guaranteed to come back first, and blindly dropping rank 0 would then
    # list the query itself and discard a genuine neighbour.
    neighbours = [i for i in indices.flatten() if i != idx][:top_k]

    print(f"🎬 Recommendations for '{movie_title}':")
    for i in neighbours:
        print("-", movies.iloc[i]["names"])

# Example test: print the recommendations for a single title
recommend("The Dark Knight")

🎬 Recommendations for 'The Dark Knight':
- Batman: The Long Halloween, Part Two
- Batman: The Long Halloween, Part One
- The Dark Knight Rises
- Batman
- Batman Forever


# 6. Save model + vectorizer + dataset

In [8]:
# Create the output directory if it does not exist yet
os.makedirs("pickle_model", exist_ok=True)

joblib.dump(knn, "pickle_model/knn_model.pkl")
joblib.dump(tfidf, "pickle_model/tfidf_vectorizer.pkl")
# The KNN model returns row positions into `movies`, so the exact frame it was
# fitted against is persisted alongside it. Without this, the "metadata saved"
# message below would not be true.
joblib.dump(movies, "pickle_model/movies.pkl")

print("Model, vectorizer, and metadata saved in pickle_model/")

Model, vectorizer, and metadata saved in pickle_model/


# 7. Test Inference - Load model and test recommendations

In [9]:

# Load the saved model and vectorizer back from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that you created yourself or otherwise trust.
loaded_knn = joblib.load("pickle_model/knn_model.pkl")
loaded_tfidf = joblib.load("pickle_model/tfidf_vectorizer.pkl")

print("✅ Model and vectorizer loaded successfully!")

# Uses the dataset already loaded in this notebook (in production the data
# would have to be reloaded as well).
def recommend_inference(movie_title, top_k=5):
    """Recommend movies using the model and vectorizer reloaded from disk.

    The movie's overview is re-vectorized with ``loaded_tfidf`` and its
    nearest neighbours are queried from ``loaded_knn``. Results are printed
    and also returned.

    Args:
        movie_title: Exact title to look up in ``movies["names"]``.
        top_k: Number of recommendations to produce (default 5).

    Returns:
        A list of ``(title, similarity)`` tuples, nearest first, where
        similarity is ``1 - cosine distance``; or an error-message string
        when the title is not in the dataset.
    """
    # Scan the title column once; positions equal index labels because
    # `movies` is built with reset_index(drop=True).
    positions = np.flatnonzero((movies["names"] == movie_title).to_numpy())
    if positions.size == 0:
        return f"❌ Movie '{movie_title}' not found in dataset."

    # Several rows may share a title; use the first one.
    idx = positions[0]

    # TF-IDF vector of the movie's overview, computed by the reloaded vectorizer
    movie_vector = loaded_tfidf.transform([movies.iloc[idx]["overview"]])

    # Ask for one extra neighbour so the query movie itself can be removed
    distances, indices = loaded_knn.kneighbors(movie_vector, n_neighbors=top_k + 1)

    # Remove the query row by index rather than by rank: with tied distances
    # (e.g. another row with an identical overview) the query is not guaranteed
    # to be returned first.
    ranked = [
        (distance, idx_rec)
        for distance, idx_rec in zip(distances.flatten(), indices.flatten())
        if idx_rec != idx
    ][:top_k]

    print(f"🎬 Top {top_k} recommendations for '{movie_title}':")
    recommendations = []
    for rank, (distance, idx_rec) in enumerate(ranked, start=1):
        movie_name = movies.iloc[idx_rec]["names"]
        similarity = 1 - distance  # convert cosine distance to similarity
        print(f"{rank}. {movie_name} (similarity: {similarity:.3f})")
        recommendations.append((movie_name, similarity))

    return recommendations

# Test with "Mummies"; the returned list is kept in `mummies_recs`
print("\n" + "="*60)
mummies_recs = recommend_inference("Mummies")
print("="*60)

✅ Model and vectorizer loaded successfully!

🎬 Top 5 recommendations for 'Mummies':
1. The Lord of the Rings (similarity: 0.322)
2. The Addams Family 2 (similarity: 0.219)
3. Paddington 2 (similarity: 0.169)
4. Balto II: Wolf Quest (similarity: 0.165)
5. Brother Bear 2 (similarity: 0.155)


In [10]:
# Try a few more titles to check that the reloaded model behaves sensibly
print("🧪 Testing with different movies:\n")

test_movies = ["Avatar", "The Dark Knight", "Titanic"]

for movie in test_movies:
    if movie not in movies["names"].values:
        print(f"❌ '{movie}' not found in dataset")
    else:
        print(f"🎬 Recommendations for '{movie}':")
        recs = recommend_inference(movie, top_k=3)
    # Separator line after every title, whether it was found or not
    print("-" * 40)

🧪 Testing with different movies:

🎬 Recommendations for 'Avatar':
🎬 Top 3 recommendations for 'Avatar':
1. The American (similarity: 0.216)
2. Apollo 18 (similarity: 0.208)
3. JUNG_E (similarity: 0.204)
----------------------------------------
🎬 Recommendations for 'The Dark Knight':
🎬 Top 3 recommendations for 'The Dark Knight':
1. Batman: The Long Halloween, Part Two (similarity: 0.372)
2. Batman: The Long Halloween, Part One (similarity: 0.364)
3. The Dark Knight Rises (similarity: 0.326)
----------------------------------------
🎬 Recommendations for 'Titanic':
🎬 Top 3 recommendations for 'Titanic':
1. The Legend of 1900 (similarity: 0.351)
2. Silent Hill (similarity: 0.293)
3. Death Ship (similarity: 0.254)
----------------------------------------


# 8. Complete Inference Function for Production

In [11]:
# Complete, self-contained inference function for production use
def load_and_recommend(movie_title, model_dir="pickle_model", top_k=5,
                       data_path="imdb_movies.csv"):
    """Load the saved model, vectorizer and dataset, then recommend movies.

    Intended to be callable outside the notebook (e.g. from a Streamlit app).
    The artifacts are reloaded on every call.

    Args:
        movie_title: Exact title to look up in the dataset's ``names`` column.
        model_dir: Directory containing ``knn_model.pkl`` and
            ``tfidf_vectorizer.pkl``.
        top_k: Number of recommendations to return (default 5).
        data_path: CSV file with the ``names`` and ``overview`` columns.

    Returns:
        A ``(recommendations, error)`` tuple. On success ``recommendations``
        is a list of ``{"title", "similarity"}`` dicts, nearest first, and
        ``error`` is ``None``. On failure ``recommendations`` is ``None`` and
        ``error`` is a message string. Exceptions are reported through
        ``error`` rather than raised.
    """
    try:
        # Load model and vectorizer.
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; model_dir must point at trusted artifacts only.
        knn_model = joblib.load(os.path.join(model_dir, "knn_model.pkl"))
        tfidf_vectorizer = joblib.load(os.path.join(model_dir, "tfidf_vectorizer.pkl"))

        # Rebuild the dataset with exactly the preprocessing used at training
        # time (drop rows without an overview, then renumber). The KNN model
        # returns row positions, so dropping different rows or keeping the
        # original index would attach the wrong titles to the neighbours.
        movies_df = (
            pd.read_csv(data_path)[["names", "overview"]]
            .dropna(subset=["overview"])
            .reset_index(drop=True)
        )

        # Look the movie up by position, since .iloc is used below
        positions = np.flatnonzero((movies_df["names"] == movie_title).to_numpy())
        if positions.size == 0:
            return None, f"Movie '{movie_title}' not found"
        idx = positions[0]

        # Transform the movie's overview with TF-IDF
        movie_vector = tfidf_vectorizer.transform([movies_df.iloc[idx]["overview"]])

        # Ask for one extra neighbour so the query movie itself can be removed
        distances, indices = knn_model.kneighbors(movie_vector, n_neighbors=top_k + 1)

        # Remove the query row by index rather than by rank: with tied
        # distances the query is not guaranteed to be returned first.
        recommendations = [
            {
                "title": movies_df.iloc[idx_rec]["names"],
                "similarity": 1 - distance,
            }
            for distance, idx_rec in zip(distances.flatten(), indices.flatten())
            if idx_rec != idx
        ][:top_k]

        return recommendations, None

    except Exception as e:
        # Deliberately broad: callers expect failures as an error string.
        return None, f"Error: {str(e)}"

# Exercise the complete inference function end to end
print("🚀 Testing complete inference function:")
print("="*50)

recs, error = load_and_recommend("Mummies", top_k=5)
if not error:
    print(f"✅ Found {len(recs)} recommendations for 'Mummies':")
    for i, rec in enumerate(recs, 1):
        print(f"{i}. {rec['title']} (similarity: {rec['similarity']:.3f})")
else:
    print(f"❌ {error}")

print("="*50)

🚀 Testing complete inference function:
✅ Found 5 recommendations for 'Mummies':
1. The Lord of the Rings (similarity: 0.322)
2. The Addams Family 2 (similarity: 0.219)
3. Paddington 2 (similarity: 0.169)
4. Balto II: Wolf Quest (similarity: 0.165)
5. Brother Bear 2 (similarity: 0.155)
