In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import urllib.request
import zipfile
import os
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Implemented because I am on a laptop without EasyRec
def download_movielens(url="https://files.grouplens.org/datasets/movielens/ml-100k.zip"):
    """Download and unpack the MovieLens 100k archive into the working directory.

    Nothing is downloaded when a path named ``ml-100k`` already exists in the
    current working directory. Otherwise the archive at ``url`` is saved as
    ``ml-100k.zip``, extracted into the current directory, and the archive
    file is removed again.

    Args:
        url: Location of the zip archive. Defaults to the GroupLens download
            URL. Any URL that ``urllib.request.urlretrieve`` can open works,
            so a mirror or a local ``file://`` copy may be used instead.
            NOTE: the archive is expected to unpack into a top-level
            ``ml-100k/`` folder, since that is the path checked on later runs.

    Raises:
        urllib.error.URLError: If the archive cannot be fetched.
        zipfile.BadZipFile: If the fetched file is not a valid zip archive.
    """
    zip_path = "ml-100k.zip"

    # Guard clause: a previous run already produced the dataset folder.
    if os.path.exists("ml-100k"):
        print("Dataset already exists!")
        return

    try:
        print("Downloading MovieLens 100k dataset...")
        urllib.request.urlretrieve(url, zip_path)

        print("Extracting files...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall()
    finally:
        # Remove zip file even when the download or the extraction failed, so
        # a truncated or corrupt archive is never left behind for later runs.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    print("Download complete!")

# Fetch the dataset now; the function skips the download when the
# ml-100k folder is already present in the working directory.
download_movielens()

Downloading MovieLens 100k dataset...
Extracting files...
Download complete!


In [6]:
# Column names for u.item, listed in file order. They are passed explicitly
# through `names`, so pandas treats every line of the file as a data row.
item_columns = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Parse the pipe-separated, latin-1 encoded movie table
movies_df = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', names=item_columns)

# Report the row count, then preview the id and title of the first three movies
print("Number of movies:", movies_df.shape[0])
movies_df.head(3)[['movie_id', 'title']]

Number of movies: 1682


Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)


In [4]:
# Collect the titles as a plain list, in the same row order as movies_df
titles = movies_df['title'].tolist()

# Instantiate the pretrained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every title in a single call; row i of the result belongs to titles[i]
print("Computing embeddings...")
embeddings = model.encode(titles, show_progress_bar=True)

# Keep id, title and the per-movie embedding vector side by side
embeddings_df = pd.DataFrame(dict(
    movie_id=movies_df['movie_id'],
    title=movies_df['title'],
    embedding=[vector for vector in embeddings],
))

# Sanity check: overall matrix shape plus the first movie's title and vector length
print("\nShape of embeddings:", embeddings.shape)
print("\nFirst movie title and its embedding shape:")
print(f"Title: {embeddings_df['title'].iat[0]}")
print(f"Embedding shape: {len(embeddings_df['embedding'].iat[0])}")



Computing embeddings...


Batches: 100%|██████████| 53/53 [00:00<00:00, 71.97it/s]


Shape of embeddings: (1682, 384)

First movie title and its embedding shape:
Title: Toy Story (1995)
Embedding shape: 384





In [None]:
# Find similar movies using cosine similarity

def find_similar_movies(title, n=5):
    """Return the ``n`` movies whose title embeddings are closest to ``title``.

    Similarity is the cosine similarity between the query movie's row in the
    module-level ``embeddings`` array and every row of that array. Row ``i``
    of ``embeddings`` is taken to describe row ``i`` of ``embeddings_df``.

    Args:
        title: Exact title string as stored in ``embeddings_df['title']``.
            When several rows share the title, the first one is the query.
        n: Maximum number of neighbours to return (default 5).

    Returns:
        A list of ``(title, similarity)`` tuples, most similar first. The
        query row itself is never part of the result; other rows that happen
        to carry the same title may be.

    Raises:
        IndexError: If ``title`` does not occur in ``embeddings_df``.
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Look the title up by *position*, not by index label: the result is used
    # to index the ``embeddings`` array, which knows nothing about the
    # DataFrame's index.
    matches = (embeddings_df['title'] == title).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"Movie title not found in embeddings_df: {title!r}")
    movie_idx = matches[0]
    movie_embedding = embeddings[movie_idx]

    # Compute similarities of the query against every movie (including itself)
    similarities = cosine_similarity([movie_embedding], embeddings)[0]

    # Rank all rows best-first, then drop the query row explicitly. Simply
    # discarding the top-ranked hit is not safe: a row with an identical
    # embedding ties with the query and may be ranked ahead of it.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != movie_idx][:n]

    return [(embeddings_df['title'].iloc[idx], similarities[idx])
            for idx in similar_indices]

# Example query: the five closest titles to "Little Princess, A (1995)"
movie_title = "Little Princess, A (1995)"
similar_movies = find_similar_movies(movie_title, n=5)

# One line per neighbour, with the similarity shown to three decimals
print(f"Movies similar to '{movie_title}':")
for title, score in similar_movies:
    print(f"{title}: {score:.3f}")

Movies similar to 'Little Princess, A (1995)':
Princess Caraboo (1994): 0.702
Swan Princess, The (1994): 0.683
Little Princess, The (1939): 0.674
Little Women (1994): 0.613
Princess Bride, The (1987): 0.612
