Extract Dataset from zip file

In [1]:
import os
import zipfile
# Load Data
# The archive is unpacked into `data_dir`, relative to the current working
# directory. `extract_dir` is kept as an alias so both names stay available.
data_dir = 'ml-32m'
zip_file_path = 'ml-32m.zip'
extract_dir = data_dir

# Unzip only when the target folder is missing, so re-running this cell is
# cheap. A single existence check is enough: the previous nested check tested
# the same path twice, which made the "skipping" message unreachable.
# NOTE: a folder left behind by an interrupted extraction also counts as
# "exists"; delete it manually to force a fresh extraction.
if os.path.exists(extract_dir):
    print(f"Directory {extract_dir} already exists, skipping extraction.")
else:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Extracted {zip_file_path} to {extract_dir}")

Extracted ml-32m.zip to ml-32m


Model 2: Collaborative Filtering

In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# --- 1. Load Data ---
# Path is relative to the working directory so it follows wherever the
# extraction cell unpacked the archive (that cell extracts into ./ml-32m, and
# the data files sit one ml-32m/ level deeper) instead of being tied to an
# absolute, machine-specific location.
dataset_path = 'ml-32m/ml-32m/'
ratings = pd.read_csv(f'{dataset_path}ratings.csv')
movies = pd.read_csv(f'{dataset_path}movies.csv')

# --- 2. Take exactly 2000 samples ---
# Only the first rows of the file are used; `ratings` itself stays complete.
sample_size = 2000
ratings_sample = ratings.head(sample_size)

# --- 3. Create Pivot Table ---
# Rows = Movies, Columns = Users. Missing (movie, user) pairs are filled with
# 0, i.e. "not rated" is encoded the same way as a zero rating.
pivot_table = ratings_sample.pivot_table(index='movieId', columns='userId', values='rating').fillna(0)

print(f"Data contains {pivot_table.shape[0]} movies and {pivot_table.shape[1]} users.")

# --- 4. Apply Truncated SVD ---
# n_components cannot exceed the number of users (features).
# We try for 100, but if the sample is too small, we take the maximum possible
# (one less than the smaller matrix dimension).
suggested_factors = 100
max_possible_factors = min(pivot_table.shape[0], pivot_table.shape[1]) - 1
if max_possible_factors < 1:
    # Fail early with an actionable message rather than letting the SVD
    # constructor reject a zero/negative component count.
    raise ValueError(
        f"Sample is too small for SVD: pivot table has shape {pivot_table.shape}; "
        "at least 2 movies and 2 users are required."
    )
n_factors = min(suggested_factors, max_possible_factors)

print(f"Reducing dimensions to {n_factors} latent factors using SVD...")

# random_state is fixed so the latent factors are reproducible across runs.
svd = TruncatedSVD(n_components=n_factors, random_state=42)
matrix_svd = svd.fit_transform(pivot_table)

# --- 5. Recommendation Function ---
def get_recommendations_fixed(movie_id, k=5):
    """Return the IDs of the ``k`` movies most similar to ``movie_id``.

    Similarity is the cosine similarity between rows of the module-level
    ``matrix_svd`` (the SVD-reduced form of ``pivot_table``).

    Parameters
    ----------
    movie_id :
        ID of the query movie. If it is not a row label of ``pivot_table``,
        a notice is printed and the first movie of the table is used instead.
    k : int, default 5
        Maximum number of recommendations. Fewer are returned when the table
        holds fewer than ``k`` other movies.

    Returns
    -------
    pandas.Index
        Movie IDs ordered from most to least similar; the query movie itself
        is never included.
    """
    # Check if the movie exists in our sample; fall back to the first row if not.
    if movie_id not in pivot_table.index:
        fallback_id = pivot_table.index[0]
        print(f" Movie ID {movie_id} not in sample. Using Movie ID {fallback_id} instead.")
        movie_id = fallback_id

    # Row position of the movie (index lookup instead of building a Python list).
    movie_idx = pivot_table.index.get_loc(movie_id)

    # Extract the SVD latent vector as a single-row 2-D array.
    movie_vector = matrix_svd[movie_idx].reshape(1, -1)

    # Similarity of the query movie against every movie (itself included).
    similarities = cosine_similarity(movie_vector, matrix_svd).flatten()

    # Rank from most to least similar, then remove the query movie by position.
    # Slicing off "the last element" is not safe: other movies can tie with the
    # query at the maximum similarity, in which case the query is not
    # guaranteed to be ranked first and could be returned as a recommendation.
    ranked = similarities.argsort()[::-1]
    ranked = ranked[ranked != movie_idx]
    similar_indices = ranked[:max(k, 0)]
    return pivot_table.index[similar_indices]

# --- 6. Run Test ---
# Ask for neighbours of one known movie and show their titles.
target_movie_id = 1  # Toy Story
recommended_ids = get_recommendations_fixed(target_movie_id)

print(f"\nSVD-based recommendations for Movie ID {target_movie_id}:")
for mid in recommended_ids:
    # Select the title column for the matching movie row and take its first value.
    title = movies.loc[movies['movieId'] == mid, 'title'].values[0]
    print(f"- {title}")

Data contains 1286 movies and 19 users.
Reducing dimensions to 18 latent factors using SVD...

SVD-based recommendations for Movie ID 1:
- Star Wars: Episode VI - Return of the Jedi (1983)
- Rambo III (1988)
- Star Wars: Episode II - Attack of the Clones (2002)
- First Blood (Rambo: First Blood) (1982)
- Exorcist, The (1973)


In [4]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2554967 sha256=375df93882f387d62030da60fb9773daf7ccfab4c2776692e5fe307b3d8717d2
  Stored in directory: /root/.cache

In [6]:
# 1. Downgrade NumPy to a version compatible with scikit-surprise
!pip install "numpy<2"

# 2. Reinstall surprise to ensure it links correctly to the downgraded NumPy
!pip install scikit-surprise

Collecting numpy<2
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; pyth

