In [None]:
!pip install surprise


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2461563 sha256=d579779bbd232f70f6fc0c2acdc68ae01c4b02883af98472fcc0596f78f8b523
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installi

In [None]:
!pip install numpy==1.24.4 --force-reinstall


Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.4 which is incompatible.
pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.4 which is incompatible.
jaxlib 0.5.1 requires 

In [None]:
# 1) Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_split
from surprise.accuracy import rmse, mae
from collections import defaultdict
import ipywidgets as widgets
from IPython.display import display

# 2) Load & preprocess
df = pd.read_csv("movielens_post2010_only_with_tags.csv")
df['userId']  = df['userId'].astype(int)
df['movieId'] = df['movieId'].astype(int)
df['rating']  = df['rating'].astype(float)
df['genres']  = df['genres'].fillna('')
df['tag']     = df['tag'].fillna('')
df['year']    = df['year'].fillna(df['year'].median()).astype(int)

# 3) Prepare train/test for CF (Surprise)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
trainset, testset = surprise_split(data, test_size=0.3, random_state=42)

# 4) SVD Collaborative Filtering (Surprise)
svd = SVD()
svd.fit(trainset)
predictions = svd.test(testset)
print(f"CF (SVD) → RMSE: {rmse(predictions):.4f}, MAE: {mae(predictions):.4f}")

# 5) Improved Content-Based Filtering (TF-IDF with rating-weighted profiles)
movies = df.drop_duplicates('movieId').reset_index(drop=True)
movies['combined'] = movies['genres'] + " " + movies['tag'] + " " + movies['year'].astype(str)
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=3)
tfidf_matrix = tfidf.fit_transform(movies['combined'])
mid_to_idx = {mid:i for i,mid in enumerate(movies['movieId'])}

# User history
df_unique = df.drop_duplicates(['userId','movieId'])
train_df = pd.DataFrame(trainset.build_testset(), columns=['userId','movieId','rating'])
test_df  = pd.DataFrame(testset, columns=['userId','movieId','rating'])

holdout = defaultdict(set)
hist    = defaultdict(list)
for u, m, _ in test_df.itertuples(index=False):
    holdout[u].add(m)
for u, m, r in train_df.itertuples(index=False):
    if m in mid_to_idx:
        hist[u].append((mid_to_idx[m], r))

def precision_at_k(actual, preds, k=10):
    return len(set(preds[:k]) & actual) / k

precisions = []
for u, actuals in holdout.items():
    if not hist[u]: continue
    # Weighted profile
    profile = np.zeros(tfidf_matrix.shape[1])
    total_rating = sum(r for _, r in hist[u])
    for idx, rating in hist[u]:
        profile += (rating / total_rating) * tfidf_matrix[idx].toarray().flatten()
    profile = profile.reshape(1, -1)
    sims = cosine_similarity(profile, tfidf_matrix).flatten()
    unseen = [(sims[i], movies.at[i,'movieId']) for i in range(len(movies)) if movies.at[i,'movieId'] not in dict(hist[u])]
    unseen = list(dict.fromkeys(unseen))
    unseen = [item for item in unseen if item[0] > 0.1]  # filter low similarity
    unseen.sort(reverse=True)
    recs = [mid for _,mid in unseen[:10]]
    precisions.append(precision_at_k(actuals, recs))

print(f"Content-Based → Mean Precision@10: {np.mean(precisions):.4f}\n")





RMSE: 0.3527
MAE:  0.1938
CF (SVD) → RMSE: 0.3527, MAE: 0.1938
Content-Based → Mean Precision@10: 0.1202



In [None]:
# 6) Interactive Hybrid Recommender
genres = sorted(set(g for g in '|'.join(df['genres']).split('|') if g))
all_genres_lower = set(g.lower() for g in genres)
top_tags_raw = df['tag'].dropna().str.lower().value_counts().index.tolist()
top_tags = [tag for tag in top_tags_raw if tag not in all_genres_lower][:20]

# Widgets
genre_select = widgets.SelectMultiple(options=genres, description='Genres', rows=10)
tag_select = widgets.SelectMultiple(options=top_tags, description='Tags', rows=10)
min_year = int(df['year'].min())
max_year = int(df['year'].max())
year_range = widgets.IntRangeSlider(value=[2010, 2023], min=min_year, max=max_year, step=1, description='Years:', continuous_update=False)
generate_button = widgets.Button(description="Get Recommendations")
output = widgets.Output()
display(genre_select, tag_select, year_range, generate_button, output)

# Button click handler
def on_generate_click(b):
    with output:
        output.clear_output()
        selected_genres = list(genre_select.value)
        selected_tags = list(tag_select.value)
        year_min, year_max = year_range.value

        print("Fetching recommendations...\n")

        filtered = df.copy()
        if selected_genres:
            filtered = filtered[filtered['genres'].str.contains('|'.join(selected_genres))]
        if selected_tags:
            filtered = filtered[filtered['tag'].str.lower().isin(selected_tags)]
        filtered = filtered[(filtered['year'] >= year_min) & (filtered['year'] <= year_max)]
        filtered = filtered.drop_duplicates('movieId')

        if filtered.empty:
            print("No movies found matching your filters.")
            return

        user_id = 78213  # Can be dynamic later

        # Get all movies user has rated in training data
        user_rated = df[(df['userId'] == user_id) & (df['movieId'].isin(mid_to_idx.keys()))]
        if user_rated.empty:
            print("No rating history found for this user.")
            return

        # Build content profile from past ratings
        user_hist = user_rated[['movieId', 'rating']].values
        profile = np.zeros(tfidf_matrix.shape[1])
        total_rating = sum(r for _, r in user_hist)
        for mid, r in user_hist:
            idx = mid_to_idx.get(mid)
            if idx is not None:
                profile += (r / total_rating) * tfidf_matrix[idx].toarray().flatten()
        profile = profile.reshape(1, -1)

        # Filter out movies already rated by user
        seen_movie_ids = set(user_rated['movieId'])
        filtered = filtered[~filtered['movieId'].isin(seen_movie_ids)]

        if filtered.empty:
            print("You have already rated all movies in this filter.")
            return

        # Compute SVD and content scores
        filtered['svd_score'] = filtered['movieId'].apply(lambda x: svd.predict(user_id, x).est)
        filtered['content_score'] = filtered['movieId'].apply(
            lambda x: cosine_similarity(profile, tfidf_matrix[mid_to_idx[x]]).flatten()[0] if x in mid_to_idx else 0
        )

        # Hybrid scoring
        filtered['hybrid_score'] = 0.6 * filtered['svd_score'] + 0.4 * filtered['content_score']
        filtered = filtered.sort_values(by='hybrid_score', ascending=False)

        if filtered.empty:
            print("No recommendations found after scoring.")
            return

        print("Top 10 recommendations:\n")
        print(filtered[['title', 'genres', 'tag', 'year']].head(10).to_string(index=False))
        print("\n")

generate_button.on_click(on_generate_click)




SelectMultiple(description='Genres', options=('(no genres listed)', 'Action', 'Adventure', 'Animation', 'Child…

SelectMultiple(description='Tags', options=('visually appealing', 'funny', 'cinematography', 'atmospheric', 'p…

IntRangeSlider(value=(2010, 2023), continuous_update=False, description='Years:', max=2023, min=2010)

Button(description='Get Recommendations', style=ButtonStyle())

Output()

In [None]:
df['userId'].value_counts().head(10)


Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
78213,88384
17035,10908
159300,10600
119227,9763
147560,6912
34874,6798
34458,4965
6324,4793
151456,4469
144253,3891


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530025 entries, 0 to 530024
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   530025 non-null  int64  
 1   movieId  530025 non-null  int64  
 2   rating   530025 non-null  float64
 3   title    530025 non-null  object 
 4   genres   530025 non-null  object 
 5   year     530025 non-null  int64  
 6   tag      530025 non-null  object 
 7   imdbId   530025 non-null  int64  
 8   tmdbId   529976 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 36.4+ MB


In [None]:

from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae
import time
from surprise import SVD
# Load data
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.3, random_state=42)

# Train SVD++
model = SVD(n_factors=50, n_epochs=10)
start = time.time()
model.fit(trainset)
print("Training time (s):", time.time() - start)


# Evaluate
predictions = model.test(testset)
print(f"SVD++ → RMSE: {rmse(predictions):.4f}, MAE: {mae(predictions):.4f}")


Training time (s): 2.9471585750579834
RMSE: 0.4826
MAE:  0.3026
SVD++ → RMSE: 0.4826, MAE: 0.3026
