Movie Recommendation Notebook
-----------------------------
This notebook lets you input a movie preference prompt (e.g., "I want to watch a sci-fi romance with strong female leads")
and returns a top-5 list of recommended movies based on dataset embeddings and similarity search.

# 1. Install and import necessary libraries

In [None]:
!pip install pandas scikit-learn sentence-transformers faiss-cpu wget unzip
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import faiss
import wget 
import os
import wget
import zipfile


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting unzip
  Downloading unzip-1.0.0.tar.gz (704 bytes)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: wget, unzip
  Building wheel for wget (pyproject.toml): started
  Building wheel for wget (pyproject.toml): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-non


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Gabi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 2. Download the dataset from the web
# Download and extraction are checked independently: a zip left behind by an
# earlier (interrupted) run must still be extracted, otherwise the loading
# cell fails with FileNotFoundError on 'ml-1m/ml-1m/movies.dat'.
zip_path = 'ml-1m.zip'
extract_dir = 'ml-1m'
# The loading cell reads from <extract_dir>/ml-1m/, so this file marks a
# complete extraction.
movies_file = os.path.join(extract_dir, 'ml-1m', 'movies.dat')

if os.path.exists(movies_file):
    print("Dataset ya descargado y descomprimido.")
else:
    if not os.path.exists(zip_path):
        wget.download('https://files.grouplens.org/datasets/movielens/ml-1m.zip', zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
# File reading (pandas is re-imported here so the cell also works on its own)
import pandas as pd

In [5]:
# 2. Load Dataset
# Candidate data sources:
# - MovieLens 1M: https://grouplens.org/datasets/movielens/1m/
# - MovieLens 20M: https://grouplens.org/datasets/movielens/20m/
# - IMDb Titles and Ratings: https://datasets.imdbws.com/
#
# Trade-offs between the options:
# - MovieLens 1M: ~1M ratings, 3K movies, easy to start
# - MovieLens 20M: richer, but heavier compute
# - IMDb Basic Title Dataset: for richer metadata and plots
# - TMDb API: for additional metadata and posters

# The three .dat files share one format: '::'-separated fields, latin1 text,
# and column names supplied explicitly via `names`. The multi-character
# separator is why the python parsing engine is requested.
_dat_options = {'sep': '::', 'engine': 'python', 'encoding': 'latin1'}

movies = pd.read_csv(
    'ml-1m/ml-1m/movies.dat',
    names=['MovieID', 'Title', 'Genres'],
    **_dat_options,
)
ratings = pd.read_csv(
    'ml-1m/ml-1m/ratings.dat',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **_dat_options,
)
users = pd.read_csv(
    'ml-1m/ml-1m/users.dat',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **_dat_options,
)

# Quick sanity check: print the first rows of each table
for _frame in (movies, ratings, users):
    print(_frame.head())

FileNotFoundError: [Errno 2] No such file or directory: 'ml-1m/ml-1m/movies.dat'

In [None]:
# 3. Preprocess Data
# Split the "(YYYY)" tag out of each raw title:
#   Year       -> the four digits as a float (NaN when no tag is found)
#   CleanTitle -> the title with every "(YYYY)" tag removed and outer whitespace stripped
raw_titles = movies['Title']
movies['Year'] = raw_titles.str.extract(r"\((\d{4})\)", expand=False).astype(float)
movies['CleanTitle'] = (
    raw_titles
    .str.replace(r"\(\d{4}\)", "", regex=True)
    .str.strip()
)

In [None]:
# 4. Feature Construction: average rating and genre one-hot
# Mean rating per movie, as a two-column frame (MovieID, AvgRating).
avg_ratings = (
    ratings.groupby('MovieID', as_index=False)['Rating']
    .mean()
    .rename(columns={'Rating': 'AvgRating'})
)

# One indicator column per genre found in the '|'-separated Genres string.
genres_expanded = movies['Genres'].str.get_dummies(sep='|')

# Drop columns produced by a previous execution of this cell so that it can be
# re-run safely: otherwise the merge would yield AvgRating_x/AvgRating_y and
# the concat would duplicate every genre column.
stale_columns = ['AvgRating', *genres_expanded.columns]
movies = (
    movies.drop(columns=stale_columns, errors='ignore')
    .merge(avg_ratings, on='MovieID', how='left')
)

# Movies without any rating get the global mean. Assign the result back instead
# of calling fillna(inplace=True) on the column selection: the chained in-place
# form warns on pandas 2.x and does not update `movies` under Copy-on-Write.
movies['AvgRating'] = movies['AvgRating'].fillna(movies['AvgRating'].mean())

movies = pd.concat([movies, genres_expanded], axis=1)

In [None]:
# 5. Text Embeddings on descriptions: use title + genres
# Each movie is represented by the string "<CleanTitle> | <Genres>", which is
# then encoded with the all-MiniLM-L6-v2 sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
movies['Text'] = movies['CleanTitle'].str.cat(movies['Genres'], sep=' | ')
embeddings = model.encode(list(movies['Text']), show_progress_bar=True)

In [None]:
# 6. Build FAISS index
# The embedding rows are L2-normalised in place first, so that the
# inner-product index built below scores neighbours by cosine similarity.
faiss.normalize_L2(embeddings)
d = embeddings.shape[-1]           # embedding dimensionality
index = faiss.IndexFlatIP(d)
index.add(embeddings)

In [None]:
# 7. Recommendation function

def recommend_movies(prompt, k=5):
    """Return the movies whose text embedding best matches ``prompt``.

    The prompt is encoded with the module-level ``model``, L2-normalised and
    searched against the module-level FAISS ``index``; the hits are looked up
    by position in the module-level ``movies`` frame.

    Args:
        prompt: Free-text description of the kind of movie wanted.
        k: Maximum number of recommendations to return (default 5).

    Returns:
        A DataFrame with the columns CleanTitle, Genres, AvgRating and Year,
        in the order returned by the index search. It has fewer than ``k``
        rows when the index cannot supply ``k`` neighbours, and is empty when
        ``k`` is not positive.
    """
    columns = ['CleanTitle', 'Genres', 'AvgRating', 'Year']

    # Never ask for more neighbours than the index holds, and never call the
    # search with a non-positive k.
    k = min(int(k), index.ntotal)
    if k <= 0:
        return movies.iloc[[]][columns]

    # Embed prompt
    q_emb = model.encode([prompt])
    faiss.normalize_L2(q_emb)

    # Search
    distances, indices = index.search(q_emb, k)

    # FAISS pads missing results with -1; iloc would read that as "last row"
    # and return a bogus recommendation, so such entries are dropped.
    hits = indices[0]
    hits = hits[hits >= 0]
    return movies.iloc[hits][columns]

# 8. User Interaction
# Ask for a free-text prompt, fetch five recommendations and print them as a
# plain-text table without the DataFrame index column.
if __name__ == '__main__':
    user_prompt = input("Describe what type of movie you want: ")
    recommendations = recommend_movies(user_prompt, k=5)
    print(
        "Top 5 recommendations based on your prompt:\n",
        recommendations.to_string(index=False),
        sep="\n",
    )