In [None]:
# Cell 0 — Install required packages
!pip install -q --upgrade numpy==1.26.4
!pip install -q --upgrade pandas
!pip install -q --upgrade scipy
!pip install -q --upgrade scikit-learn
!pip install -q kaggle gradio==3.40.1

print("✅ Packages installed. Proceed to next cell to upload kaggle.json (for books/anime/songs).")


✅ Packages installed. Proceed to next cell to upload kaggle.json (for books/anime/songs).


In [None]:
# Cell 1 — Upload kaggle.json and download datasets
from google.colab import files
import os, sys, time
import requests, zipfile, io

print("📂 Please upload your kaggle.json now.")
uploaded = files.upload()

os.makedirs("/root/.kaggle", exist_ok=True)
for filename in uploaded.keys():
    if 'kaggle' in filename:
        open('/root/.kaggle/kaggle.json','wb').write(uploaded[filename])
        os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Create data folders
os.makedirs("data/movies", exist_ok=True)
os.makedirs("data/books", exist_ok=True)
os.makedirs("data/anime", exist_ok=True)
os.makedirs("data/songs", exist_ok=True)

# Download Kaggle datasets
print("⬇️ Downloading Kaggle datasets...")
!kaggle datasets download -d arashnic/book-recommendation-dataset -p data/books --unzip -q || true
!kaggle datasets download -d cooperunion/anime-recommendations-database -p data/anime --unzip -q || true
!kaggle datasets download -d notshrirang/spotify-million-song-dataset -p data/songs --unzip -q || true

# MovieLens dataset direct download
movielens_url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_dir = "data/movies/ml-latest-small"
print(f"\n⬇️ Downloading MovieLens dataset from {movielens_url}...")
try:
    r = requests.get(movielens_url, stream=True)
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall("data/movies/")
    print("✅ MovieLens dataset downloaded and extracted.")
except Exception as e:
    print(f"❌ MovieLens download failed: {e}")


📂 Please upload your kaggle.json now.


Saving kaggle.json to kaggle.json
⬇️ Downloading Kaggle datasets...
Dataset URL: https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset
License(s): CC0-1.0
Dataset URL: https://www.kaggle.com/datasets/cooperunion/anime-recommendations-database
License(s): CC0-1.0
Dataset URL: https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
License(s): CC0-1.0

⬇️ Downloading MovieLens dataset from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip...
✅ MovieLens dataset downloaded and extracted.


In [None]:
# Cell 2 — Load datasets
import pandas as pd, os

def safe_load_csv(path, nrows=None):
    """Read a CSV into a DataFrame, reporting the outcome instead of raising.

    Args:
        path: Location of the CSV file.
        nrows: Optional cap on the number of rows read (None reads everything).

    Returns:
        The loaded DataFrame, or None when reading failed for any reason
        (the error is printed).
    """
    try:
        frame = pd.read_csv(path, nrows=nrows, low_memory=False)
    except Exception as err:
        print(f"❌ Could not load {path}: {err}")
        return None
    else:
        print(f"✅ Loaded {path}: {frame.shape}")
        return frame

# MovieLens — fetched by direct download in the previous cell.
# Every loader call yields None (and prints why) if its file is missing.
_movielens_root = "data/movies/ml-latest-small"
movies_ratings = safe_load_csv(f"{_movielens_root}/ratings.csv")
movies_items = safe_load_csv(f"{_movielens_root}/movies.csv")

# Books (Kaggle)
_books_root = "data/books"
books_items = safe_load_csv(f"{_books_root}/Books.csv")
books_ratings = safe_load_csv(f"{_books_root}/Ratings.csv")

# Anime (Kaggle)
_anime_root = "data/anime"
anime_items = safe_load_csv(f"{_anime_root}/anime.csv")
anime_ratings = safe_load_csv(f"{_anime_root}/rating.csv")

# Songs (Kaggle) — read at most 100000 rows
songs_raw = safe_load_csv("data/songs/spotify_millsongdata.csv", nrows=100000)


✅ Loaded data/movies/ml-latest-small/ratings.csv: (100836, 4)
✅ Loaded data/movies/ml-latest-small/movies.csv: (9742, 3)
✅ Loaded data/books/Books.csv: (271360, 8)
✅ Loaded data/books/Ratings.csv: (1149780, 3)
✅ Loaded data/anime/anime.csv: (12294, 7)
✅ Loaded data/anime/rating.csv: (7813737, 3)
✅ Loaded data/songs/spotify_millsongdata.csv: (57650, 4)


In [None]:
# Cell 3 — Normalize column names into consistent schema
def ensure_cols_items(df, item_col_candidates, title_candidates, genre_candidates):
    """Return a copy of an items table exposing `itemId`, `title` and `genres`.

    For each target column, the first candidate name present in the frame is
    renamed to the target. When no candidate matches, a default is created:
      * `itemId`  -> 1..len(df)
      * `title`   -> the item id rendered as text
      * `genres`  -> empty string

    Args:
        df: Source items DataFrame, or None.
        item_col_candidates: Column names to try for the item id, in priority order.
        title_candidates: Column names to try for the title, in priority order.
        genre_candidates: Column names to try for the genre text, in priority order.

    Returns:
        A new DataFrame (the input is not modified), or None when `df` is None.
        Columns other than the renamed ones are kept as they are.
    """
    if df is None:
        return None

    out = df.copy()
    # Processed in this order because the title default depends on `itemId`.
    schema = (
        ("itemId", item_col_candidates),
        ("title", title_candidates),
        ("genres", genre_candidates),
    )
    for target, candidates in schema:
        match = next((name for name in candidates if name in out.columns), None)
        if match is not None:
            out = out.rename(columns={match: target})
        if target in out.columns:
            continue
        # No candidate matched: synthesize the column.
        if target == "itemId":
            out[target] = range(1, len(out) + 1)
        elif target == "title":
            out[target] = out["itemId"].astype(str)
        else:
            out[target] = ""
    return out

def ensure_cols_ratings(df, user_candidates, item_candidates, rating_candidates):
    """Return a three-column ratings table: `userId`, `itemId`, `rating`.

    The first candidate name found in the frame is renamed to each target.
    Missing targets are filled with placeholders:
      * `userId` -> the constant "default_user"
      * `itemId` -> the `title` column if there is one, otherwise 1..len(df)
      * `rating` -> the constant 1

    Args:
        df: Source DataFrame, or None.
        user_candidates: Column names to try for the user id, in priority order.
        item_candidates: Column names to try for the item id, in priority order.
        rating_candidates: Column names to try for the rating, in priority order.

    Returns:
        A new DataFrame holding only the three standard columns, or None when
        `df` is None.
    """
    if df is None:
        return None

    frame = df.copy()

    def first_present(names):
        # First candidate that is an existing column, else None.
        return next((col for col in names if col in frame.columns), None)

    # --- user id ---
    user_col = first_present(user_candidates)
    if user_col is not None:
        frame = frame.rename(columns={user_col: "userId"})
    if "userId" not in frame.columns:
        frame["userId"] = "default_user"

    # --- item id ---
    item_col = first_present(item_candidates)
    if item_col is not None:
        frame = frame.rename(columns={item_col: "itemId"})
    if "itemId" not in frame.columns:
        if "title" in frame.columns:
            frame["itemId"] = frame["title"]
        else:
            # Last resort: number the rows starting from 1.
            frame["itemId"] = range(1, len(frame) + 1)

    # --- rating ---
    rating_col = first_present(rating_candidates)
    if rating_col is not None:
        frame = frame.rename(columns={rating_col: "rating"})
    if "rating" not in frame.columns:
        frame["rating"] = 1

    return frame[["userId", "itemId", "rating"]]

# Both helpers return None for a None input, so a dataset that failed to
# load simply stays None here.

# Movies
movies_items = ensure_cols_items(
    movies_items,
    item_col_candidates=["id", "movieId", "itemId"],
    title_candidates=["title", "name"],
    genre_candidates=["genres", "genre"],
)
movies_ratings = ensure_cols_ratings(
    movies_ratings,
    user_candidates=["userId", "user_id", "User-ID"],
    item_candidates=["movieId", "itemId", "id"],
    rating_candidates=["rating"],
)

# Books — the author column stands in for "genres"
books_items = ensure_cols_items(
    books_items,
    item_col_candidates=["ISBN", "book_id", "itemId"],
    title_candidates=["Book-Title", "title"],
    genre_candidates=["Book-Author", "authors", "genres"],
)
books_ratings = ensure_cols_ratings(
    books_ratings,
    user_candidates=["User-ID", "userId", "user_id"],
    item_candidates=["ISBN", "book_id", "itemId"],
    rating_candidates=["Book-Rating", "rating"],
)

# Anime
anime_items = ensure_cols_items(
    anime_items,
    item_col_candidates=["anime_id", "itemId", "id"],
    title_candidates=["name", "title"],
    genre_candidates=["genre", "genres"],
)
anime_ratings = ensure_cols_ratings(
    anime_ratings,
    user_candidates=["user_id", "userId"],
    item_candidates=["anime_id", "itemId"],
    rating_candidates=["rating"],
)

# Songs — items and "ratings" both come from the same raw table; the artist
# column stands in for "genres". Any of user/item/rating columns that are not
# found are filled with the helper's placeholder values.
songs_items = ensure_cols_items(
    songs_raw,
    item_col_candidates=["track_id", "id", "itemId"],
    title_candidates=["song", "track_name", "title", "name"],
    genre_candidates=["artist", "genres"],
)
songs_ratings = ensure_cols_ratings(
    songs_raw,
    user_candidates=["userId", "user_id"],
    item_candidates=["track_id", "itemId"],
    rating_candidates=["popularity", "rating"],
)

print("Standardization done. Quick shapes:")
_standardized = {
    "movies_items": movies_items, "movies_ratings": movies_ratings,
    "books_items": books_items, "books_ratings": books_ratings,
    "anime_items": anime_items, "anime_ratings": anime_ratings,
    "songs_items": songs_items, "songs_ratings": songs_ratings,
}
for name, df in _standardized.items():
    print(f"{name}: {None if df is None else df.shape}")

Standardization done. Quick shapes:
movies_items: (9742, 3)
movies_ratings: (100836, 3)
books_items: (271360, 8)
books_ratings: (1149780, 3)
anime_items: (12294, 7)
anime_ratings: (7813737, 3)
songs_items: (57650, 5)
songs_ratings: (57650, 3)


In [None]:
# Cell 4 — Reduce & sample to memory-safe sizes
import numpy as np

def reduce_and_sample(df, cols, nrows=None):
    """Project a frame onto selected columns and optionally down-sample rows.

    Args:
        df: Source DataFrame, or None.
        cols: Desired column names; names absent from `df` are skipped.
        nrows: Maximum number of rows to keep. When the frame is longer, a
            random sample of exactly `nrows` rows is drawn with a fixed seed
            (42). None keeps every row.

    Returns:
        A new DataFrame (a copy, never a view of `df`), or None when `df` is None.
    """
    if df is None:
        return None

    subset = df.loc[:, [name for name in cols if name in df.columns]].copy()
    too_long = nrows is not None and len(subset) > nrows
    return subset.sample(nrows, random_state=42) if too_long else subset

# Row caps for the ratings tables (kept small to avoid RAM crashes)
MOVIE_MAX = 50000
BOOK_MAX  = 50000
ANIME_MAX = 50000
SONG_MAX  = 50000

_RATING_COLS = ["userId", "itemId", "rating"]
_ITEM_COLS = ["itemId", "title", "genres"]

# Ratings are down-sampled to the cap; item catalogues are only trimmed to
# the three standard columns.
movies_ratings_small = reduce_and_sample(movies_ratings, _RATING_COLS, nrows=MOVIE_MAX)
movies_items_small = reduce_and_sample(movies_items, _ITEM_COLS, nrows=None)

books_ratings_small = reduce_and_sample(books_ratings, _RATING_COLS, nrows=BOOK_MAX)
books_items_small = reduce_and_sample(books_items, _ITEM_COLS, nrows=None)

anime_ratings_small = reduce_and_sample(anime_ratings, _RATING_COLS, nrows=ANIME_MAX)
anime_items_small = reduce_and_sample(anime_items, _ITEM_COLS, nrows=None)

songs_ratings_small = reduce_and_sample(songs_ratings, _RATING_COLS, nrows=SONG_MAX)
songs_items_small = reduce_and_sample(songs_items, _ITEM_COLS, nrows=None)


def _shape_or_none(frame):
    """Shape tuple of a frame, or None for a dataset that is missing."""
    return None if frame is None else frame.shape


print("Reduced shapes:")
for _label, _ratings_part, _items_part in (
    ("movies", movies_ratings_small, movies_items_small),
    ("books", books_ratings_small, books_items_small),
    ("anime", anime_ratings_small, anime_items_small),
    ("songs", songs_ratings_small, songs_items_small),
):
    print(f"{_label}:", _shape_or_none(_ratings_part), _shape_or_none(_items_part))


Reduced shapes:
movies: (50000, 3) (9742, 3)
books: (50000, 3) (271360, 3)
anime: (50000, 3) (12294, 3)
songs: (50000, 3) (57650, 3)


In [None]:
# Cell 5 — Memory-safe recommender
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

class SmartRecommender:
    """
    Memory-safe hybrid recommender:
      - Content: TF-IDF on genres (titles are used when no item has genre text)
      - Lightweight CF: item co-occurrence via sparse ops (no dense user-user sim matrix)

    Two different index spaces are in play and must not be mixed:
      * row position in ``self.items``  -> rows of ``self.tfidf_matrix`` and
        entries of the score vector built by ``recommend``
      * ``iidx`` (position in ``self.item_map``) -> columns of ``self.R``
    ``self._pos_to_iidx`` translates the former into the latter.
    """

    def __init__(self, items_df, ratings_df, max_users=5000, random_state=42):
        """
        Args:
            items_df: DataFrame with ``itemId``, ``title`` and ``genres`` columns,
                or None for an empty catalogue.
            ratings_df: DataFrame with ``userId``, ``itemId`` and ``rating``
                columns, or None for no ratings.
            max_users: If the ratings contain more distinct users than this,
                a random subset of ``max_users`` users is kept.
            random_state: Seed for that user subsampling, so that repeated
                runs keep the same users.
        """
        self.items = (items_df.reset_index(drop=True).copy() if items_df is not None
                      else pd.DataFrame(columns=["itemId", "title", "genres"]))
        self.ratings = (ratings_df.copy() if ratings_df is not None
                        else pd.DataFrame(columns=["userId", "itemId", "rating"]))

        # All ids are compared as strings so ratings and items can be matched.
        self.ratings['userId'] = self.ratings['userId'].astype(str)
        self.ratings['itemId'] = self.ratings['itemId'].astype(str)
        self.items['itemId'] = self.items['itemId'].astype(str)

        # Optionally keep only a (seeded) random subset of users to save memory.
        all_users = np.asarray(self.ratings['userId'].unique(), dtype=object)
        if len(all_users) > max_users:
            rng = np.random.default_rng(random_state)
            keep_users = rng.choice(all_users, max_users, replace=False)
            # .copy() so the column assignments below do not write into a slice.
            self.ratings = self.ratings[self.ratings['userId'].isin(keep_users)].copy()

        # Factorize ids to 0..n-1 for the sparse matrix.
        self.user_map, uidx = np.unique(
            self.ratings['userId'].to_numpy(dtype=object), return_inverse=True)
        self.item_map, iidx = np.unique(
            self.ratings['itemId'].to_numpy(dtype=object), return_inverse=True)
        self.ratings['uidx'] = uidx
        self.ratings['iidx'] = iidx

        # Sparse user-item matrix (users x rated items).
        if len(self.ratings) > 0:
            self.R = csr_matrix(
                (self.ratings['rating'].astype(float).to_numpy(), (uidx, iidx)),
                shape=(len(self.user_map), len(self.item_map)))
        else:
            self.R = csr_matrix((0, 0))

        # Text used for TF-IDF: genres, or titles when every genre is blank.
        self.items['genres_str'] = self.items['genres'].fillna("").astype(str)
        if self.items['genres_str'].str.strip().eq("").all():
            self.items['genres_str'] = self.items['title'].fillna("").astype(str)
        self.tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_features=10000)
        try:
            self.tfidf_matrix = self.tfidf.fit_transform(self.items['genres_str'].values)
        except ValueError:
            # e.g. no usable tokens at all: fall back to an all-zero matrix so
            # content scores are simply 0 for every item.
            self.tfidf_matrix = csr_matrix((len(self.items), 1))

        # Lookup tables between ids and matrix indices.
        self.itemid_to_iidx = {iid: idx for idx, iid in enumerate(self.item_map)}
        self._user_to_uidx = {uid: idx for idx, uid in enumerate(self.user_map)}
        # For every row of self.items: its column in R, or -1 if nobody rated it.
        self._pos_to_iidx = np.fromiter(
            (self.itemid_to_iidx.get(iid, -1) for iid in self.items['itemId']),
            dtype=np.int64, count=len(self.items))
        # When True, CF scores are scaled to a maximum magnitude of 1 so they
        # are on a scale comparable with the content scores in hybrid mode.
        self._norm = True

    def _content_scores(self, seen_mask):
        """TF-IDF similarity of every item to the mean profile of the seen items.

        ``seen_mask`` is a boolean array over rows of ``self.items``. Returns a
        float array of the same length (all zeros if nothing was seen).
        """
        rated_pos = np.flatnonzero(seen_mask)
        if rated_pos.size == 0:
            return np.zeros(len(self.items), dtype=float)
        profile = np.asarray(self.tfidf_matrix[rated_pos].mean(axis=0)).ravel()
        return np.asarray(self.tfidf_matrix @ profile).ravel().astype(float)

    def _cf_scores(self, user_id, seen_mask):
        """Co-occurrence scores for ``user_id``, aligned to rows of ``self.items``.

        Computes user_vec · Rᵀ · R with two sparse products, then scatters the
        result from R's column order into items-frame order. Items nobody rated
        and items the user has already seen get 0.
        """
        cf = np.zeros(len(self.items), dtype=float)
        uidx = self._user_to_uidx.get(user_id)
        if uidx is None:
            return cf
        user_vec = self.R.getrow(uidx)
        if user_vec.nnz == 0:
            return cf
        user_sim = user_vec @ self.R.T                    # (1 x users)
        by_iidx = (user_sim @ self.R).toarray().ravel()   # (rated items,)
        in_matrix = self._pos_to_iidx >= 0
        cf[in_matrix] = by_iidx[self._pos_to_iidx[in_matrix]]
        # Seen items are never recommended; zero them so they do not dominate
        # the scaling below.
        cf[seen_mask] = 0.0
        if self._norm and cf.size:
            peak = np.abs(cf).max()
            if peak > 0:
                cf /= peak
        return cf

    def recommend(self, user_id, top_n=10, model='hybrid', genre_pref=None):
        """
        Rank items the user has not rated yet.

        Args:
            user_id: User identifier (str or int; compared as a string).
            top_n: Maximum number of recommendations to return.
            model: 'cf', 'content' or 'hybrid' (content + cf). Any other value
                yields all-zero scores.
            genre_pref: Optional genre name or iterable of genre names. Each one
                found (case-insensitive, literal substring) in an item's
                ``genres`` adds 1.0 to that item's score. Applied for the
                'content' and 'hybrid' models only.

        Returns:
            List of ``(itemId, title, genres, score)`` tuples, best first. Ties
            keep catalogue order. Items with a score of 0 may be included when
            there is not enough signal.
        """
        user_id = str(user_id)
        top_n = int(top_n)
        n_items = len(self.items)
        if top_n <= 0 or n_items == 0:
            return []
        scores = np.zeros(n_items, dtype=float)

        # Items already rated by this user, as a mask over rows of self.items.
        seen_ids = set(self.ratings.loc[self.ratings['userId'] == user_id, 'itemId'])
        seen_mask = self.items['itemId'].isin(seen_ids).to_numpy()

        # CONTENT SCORES
        if model in ('content', 'hybrid'):
            scores += self._content_scores(seen_mask)
            if genre_pref:
                if isinstance(genre_pref, str):
                    # A bare string would otherwise be iterated per character.
                    genre_pref = [genre_pref]
                genres_lower = self.items['genres'].fillna("").astype(str).str.lower()
                for g in genre_pref:
                    g = str(g).strip().lower()
                    if not g:
                        continue
                    # regex=False: genre names are literals, not patterns.
                    scores += genres_lower.str.contains(g, regex=False).to_numpy(dtype=float)

        # COLLABORATIVE SCORES (lightweight, item co-occurrence)
        if model in ('cf', 'hybrid'):
            scores += self._cf_scores(user_id, seen_mask)

        # Rank everything, drop seen items, then cut to top_n — this way a user
        # with many seen items still gets a full list.
        order = np.argsort(-scores, kind="stable")
        picks = order[~seen_mask[order]][:top_n]
        ids = self.items['itemId'].to_numpy()
        titles = self.items['title'].to_numpy()
        genres = self.items['genres'].to_numpy()
        return [(ids[p], titles[p], genres[p], float(scores[p])) for p in picks]

    def explain(self, user_id, item_id):
        """Placeholder: returns an explanation dict with no computed values."""
        return {"cf_score": None, "content_score": None, "similar_rated_items": []}

print("SmartRecommender defined.")


SmartRecommender defined.


In [None]:
# Cell 6 — Initialize recommender instances one-by-one to avoid spikes
# Registry of recommenders keyed by domain name; a value of None marks a
# domain that was skipped or failed to initialize.
RECOMMENDERS = {}

def safe_init(name, items_df, ratings_df, max_users=4000):
    """Build a SmartRecommender for one domain without letting errors escape.

    Args:
        name: Human-readable domain label used in the printed status line.
        items_df: Items table, or None if the dataset is unavailable.
        ratings_df: Ratings table, or None if the dataset is unavailable.
        max_users: Forwarded to SmartRecommender.

    Returns:
        The recommender instance, or None when an input is missing or the
        constructor raised (the reason is printed in both cases).
    """
    inputs_missing = items_df is None or ratings_df is None
    if inputs_missing:
        print(f"Skipping {name}: items or ratings missing.")
        return None

    try:
        recommender = SmartRecommender(items_df, ratings_df, max_users=max_users)
    except Exception as exc:
        print(f"Failed to init {name}: {exc}")
        return None

    print(f"{name} initialized.")
    return recommender

def _sample_at_most(frame, limit=30000):
    """Seeded random sample of min(len(frame), limit) rows; None stays None."""
    if frame is None:
        return None
    return frame.sample(min(len(frame), limit), random_state=42)


# Domains are initialized one after another. Anime and songs ratings are
# additionally re-sampled to at most 30000 rows before being handed over.
RECOMMENDERS['movies'] = safe_init("Movies", movies_items_small, movies_ratings_small, max_users=3000)
RECOMMENDERS['books'] = safe_init("Books", books_items_small, books_ratings_small, max_users=3000)
RECOMMENDERS['anime'] = safe_init(
    "Anime", anime_items_small, _sample_at_most(anime_ratings_small), max_users=3000)
RECOMMENDERS['songs'] = safe_init(
    "Songs", songs_items_small, _sample_at_most(songs_ratings_small), max_users=3000)

print("Initialization attempted for all domains. Check above for any skipped/failed domains.")

Movies initialized.
Books initialized.
Anime initialized.
Songs initialized.
Initialization attempted for all domains. Check above for any skipped/failed domains.


In [None]:
# Cell 7 — Quick local tests (use IDs that exist in your small datasets)
# For each domain, request five recommendations for the user found in the
# first row of that recommender's ratings table.
for domain in ('movies', 'books', 'anime', 'songs'):
    rec = RECOMMENDERS.get(domain)
    if rec is None:
        print(domain, "not initialized")
        continue
    if len(rec.ratings) == 0:
        print(domain, "has no ratings in sampled set.")
        continue
    test_user = rec.ratings['userId'].iloc[0]
    print(f"\nTop 5 for {domain}, user {test_user}:")
    print(rec.recommend(test_user, top_n=5))



Top 5 for movies, user 432:
[('34540', 'Pretty Persuasion (2005)', 'Comedy|Drama', 0.4014223215119023), ('1292', 'Being There (1979)', 'Comedy|Drama', 0.4014223215119023), ('1300', 'My Life as a Dog (Mitt liv som hund) (1985)', 'Comedy|Drama', 0.4014223215119023), ('1318', 'Blue Juice (1995)', 'Comedy|Drama', 0.4014223215119023), ('59118', 'Happy-Go-Lucky (2008)', 'Comedy|Drama', 0.4014223215119023)]

Top 5 for books, user 12110:
[('0821745727', 'The Secret Scribbler (Zebra Regency Romance)', 'Cynthia Richey', 1.0), ('1570713383', 'Unstoppable: 45 Powerful Stories of Perseverance and Triumph from People Just Like You', 'Cynthia Kersey', 1.0), ('0671792857', 'TAINTED TRUTH : The Manipulation of Fact in America', 'Cynthia Crossen', 1.0), ('155553046X', 'Machinery of Dominance: Women, Men, and Technical Know-How', 'Cynthia Cockburn', 1.0), ('1567311806', 'The Secret Prophecies of Nostradamus', 'Cynthia Sternau', 1.0)]

Top 5 for anime, user 777:
[('3603', 'JoJo no Kimyou na Bouken: Phant

  uidx = int(np.where(self.user_map == user_id)[0])
  uidx = int(np.where(self.user_map == user_id)[0])
  uidx = int(np.where(self.user_map == user_id)[0])
  uidx = int(np.where(self.user_map == user_id)[0])


In [None]:
 # Cell 8 — Gradio interface (with fixed genre dropdown)
import gradio as gr

# Fixed genre options offered in the UI's multi-select dropdown (you can expand
# this list as needed). Selected values are passed to `recommend` as
# `genre_pref`, where each one is matched case-insensitively as a substring of
# an item's `genres` text — so an option only has an effect for a domain whose
# `genres` values actually contain that wording.
GENRE_OPTIONS = [
    "Action", "Adventure", "Animation", "Comedy", "Crime",
    "Documentary", "Drama", "Family", "Fantasy", "History",
    "Horror", "Music", "Mystery", "Romance", "Science Fiction",
    "Thriller", "War", "Western"
]

def gradio_recommend(domain, user_id, top_n, model_choice, genre_pref):
    """Gradio callback: fetch recommendations and render them as numbered text.

    Args:
        domain: Key into RECOMMENDERS.
        user_id: User identifier, passed through to the recommender unchanged.
        top_n: Number of results requested; converted to int before use.
        model_choice: Scoring model name forwarded as `model`.
        genre_pref: Selected genres (list) forwarded as `genre_pref`.

    Returns:
        One line per recommendation ("rank. title — genres — score"), or a
        short message when the domain has no recommender or nothing came back.
    """
    engine = RECOMMENDERS.get(domain)
    if engine is None:
        return f"Recommender for {domain} not initialized."

    recs = engine.recommend(user_id, top_n=int(top_n), model=model_choice, genre_pref=genre_pref)
    if not recs:
        return "No recommendations — try a different user id or domain."

    # The item id (first tuple element) is not shown in the listing.
    return "\n".join(
        f"{rank}. {title}  —  {genres}  —  score: {score:.3f}"
        for rank, (_item_id, title, genres, score) in enumerate(recs, 1)
    )

# Only domains whose recommender was built successfully are offered.
available_domains = [k for k in RECOMMENDERS.keys() if RECOMMENDERS[k] is not None]
# Default to "movies" when it is available; otherwise fall back to the first
# usable domain so the preselected value is always one of the choices.
if "movies" in available_domains:
    default_domain = "movies"
else:
    default_domain = available_domains[0] if available_domains else None

domain_input = gr.Dropdown(
    choices=available_domains,
    label="Domain", value=default_domain
)
user_input = gr.Textbox(label="User ID", value="1", placeholder="Enter a user id from dataset (as string or number)")
top_n = gr.Slider(1, 20, value=5, step=1, label="Top N")
model_choice = gr.Dropdown(["hybrid","cf","content"], value="hybrid", label="Model")

# Genre selection dropdown (multi-select allowed)
genre_pref = gr.Dropdown(
    choices=GENRE_OPTIONS,
    multiselect=True,
    label="Preferred Genres"
)

# Input order must match the parameter order of gradio_recommend.
demo = gr.Interface(
    fn=gradio_recommend,
    inputs=[domain_input, user_input, top_n, model_choice, genre_pref],
    outputs=gr.Textbox(label="Recommendations"),
    title="Multi-Domain Smart Recommender",
    description="Choose domain and user to get recommendations (memory-safe)."
)

demo.launch(share=False)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

