<a href="https://colab.research.google.com/github/0xflame-7/deepdata-archive/blob/main/FilmFlareContentBasedFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Raw GitHub URL of the cleaned movie dataset (CSV).
file_path = (
    "https://raw.githubusercontent.com/0xflame-7/Film-Flare/"
    "refs/heads/main/server/src/data/datasets/clean_data.csv"
)

In [2]:
import pandas as pd

In [3]:
# Read only the columns the recommender uses; everything else in the CSV is skipped.
df = pd.read_csv(
    file_path,
    usecols=[
        "movieId",
        "original_title",
        "genres",
        "actors",
        "directors",
        "overview",
    ],
)

In [4]:
# Preview the first row to confirm the selected columns loaded.
df.head(1)

Unnamed: 0,movieId,original_title,genres,actors,directors,overview
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,"Led by Woody, Andy's toys live happily in his ..."


In [5]:
def _split_pipe(value):
    """Split a pipe-delimited string into a list of its parts.

    A value that is already a list is returned unchanged, so re-running this
    cell on an already-processed frame is a no-op instead of an AttributeError.
    Any other value (e.g. a missing entry) becomes an empty list.
    """
    if isinstance(value, str):
        return value.split('|')
    if isinstance(value, list):
        return value
    return []


# Turn the '|'-delimited text columns into Python lists, one entry per label/name.
df['genres'] = df['genres'].apply(_split_pipe)
df['actors'] = df['actors'].apply(_split_pipe)
df['directors'] = df['directors'].apply(_split_pipe)

In [6]:
# Preview the first row again: genres, actors and directors should now be lists.
df.head(1)

Unnamed: 0,movieId,original_title,genres,actors,directors,overview
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"Led by Woody, Andy's toys live happily in his ..."


In [7]:
# Remove the spaces inside every actor and director name so each full name
# reads as one unbroken string (e.g. "Tom Hanks" -> "TomHanks").
df['actors'] = df['actors'].map(
    lambda names: [name.replace(" ", "") for name in names]
)
df['directors'] = df['directors'].map(
    lambda names: [name.replace(" ", "") for name in names]
)

In [8]:
# Preview the first row: actor and director names should now have their spaces removed.
df.head(1)

Unnamed: 0,movieId,original_title,genres,actors,directors,overview
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","[TomHanks, TimAllen, DonRickles, JimVarney, Wa...",[JohnLasseter],"Led by Woody, Andy's toys live happily in his ..."


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer configured with scikit-learn's built-in English stop-word list.
vector = CountVectorizer(stop_words='english')

In [10]:
def _list_analyzer(items):
    """Map one row's list of labels to vectorizer tokens, one token per label.

    Labels are trimmed and lower-cased; blank labels are dropped so that rows
    with an empty entry do not all share a spurious "" feature.
    """
    return [item.strip().lower() for item in items if item.strip()]


# Vectorise the list entries directly instead of joining them into text and
# re-tokenising. The default word tokeniser splits on punctuation, so labels
# such as "Sci-Fi" or hyphenated/dotted names were broken into fragments that
# could match unrelated movies. With a callable analyzer each genre, actor or
# director is exactly one feature.
list_vector = CountVectorizer(analyzer=_list_analyzer)

# fit_transform refits the vocabulary on each call, so the three matrices are
# independent even though the same vectorizer object is reused.
genre_matrix = list_vector.fit_transform(df['genres'])
actor_matrix = list_vector.fit_transform(df['actors'])
director_matrix = list_vector.fit_transform(df['directors'])

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise movie-to-movie cosine similarity for each categorical feature matrix.
genre_sim, actor_sim, director_sim = (
    cosine_similarity(feature_matrix)
    for feature_matrix in (genre_matrix, actor_matrix, director_matrix)
)

In [12]:
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
import torch

In [13]:
# Load the pretrained 'roberta-base' tokenizer and encoder.
# NOTE(review): `tokerizer` looks like a misspelling of "tokenizer"; roberta_embed
# refers to it by this exact name, so rename both together if cleaning it up.
tokerizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def roberta_embed(text):
  """Embed text with RoBERTa by mean-pooling the final hidden states.

  Padding positions are excluded from the average, so passing a list of
  strings (which the tokenizer pads to a common length) gives each string the
  same embedding it would get on its own. For a single string there is no
  padding and the result is the plain mean over its tokens.

  Args:
    text: A string, or a list of strings to embed as one batch. Inputs are
      truncated to 128 tokens.

  Returns:
    A NumPy array with one row per input text (a single string yields one row).
  """
  inputs = tokerizer(text, return_tensors='pt', truncation=True, max_length=128, padding=True)
  with torch.no_grad():
    outputs = model(**inputs)
  hidden = outputs.last_hidden_state
  # attention_mask is 1 for real tokens and 0 for padding; broadcast it over
  # the hidden dimension so padded positions contribute nothing to the sum.
  mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
  token_sum = (hidden * mask).sum(dim=1)
  # clamp guards against division by zero should a row ever have no tokens.
  token_count = mask.sum(dim=1).clamp(min=1)
  return (token_sum / token_count).numpy()

# tqdm.pandas() registers `progress_apply` on pandas objects so the run shows a progress bar.
tqdm.pandas()
# One roberta_embed call per overview; the recorded run took about 40 minutes for 9552 rows.
df['overview_embs'] = df['overview'].progress_apply(roberta_embed)

100%|██████████| 9552/9552 [39:58<00:00,  3.98it/s]


In [15]:
import numpy as np

# Stack the per-movie embedding arrays into one 2-D matrix (one row per movie),
# then compute pairwise cosine similarity between overviews.
overview_embs = np.vstack(df['overview_embs'].tolist())
overview_sim = cosine_similarity(overview_embs)

In [16]:
# Relative contribution of each signal to the blended similarity score.
w_genre, w_actor, w_director, w_overview = 0.30, 0.20, 0.15, 0.35

In [17]:
# Weighted blend of the four per-feature similarity matrices.
final_sim = (
    w_genre * genre_sim
    + w_actor * actor_sim
    + w_director * director_sim
    + w_overview * overview_sim
)

In [22]:
def recommend_movies(title, top_n=5):
    """Return the movies most similar to `title` according to `final_sim`.

    Args:
        title: Exact `original_title` of the movie to look up. If several rows
            share the title, the first one is used.
        top_n: Maximum number of recommendations to return. Values of zero or
            less yield an empty result.

    Returns:
        A one-column DataFrame (`original_title`) of the best matches ordered
        from most to least similar, or the string "Movie '<title>' not found!"
        when the title is absent.
    """
    is_match = (df['original_title'] == title).to_numpy()
    if not is_match.any():
        return f"Movie '{title}' not found!"

    # Row *position* of the first match: final_sim and df.iloc are positional,
    # so this stays correct even if df does not carry a default 0..n-1 index.
    idx = int(is_match.argmax())
    scores = final_sim[idx]

    # Exclude the query movie explicitly rather than assuming it sorts first;
    # a tie at the top (e.g. a duplicate entry) would otherwise return the
    # query itself and drop a genuine neighbour. sorted() is stable, so equal
    # scores keep their original row order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return df.iloc[ranked[:max(top_n, 0)]][['original_title']]

In [25]:
# Smoke-test the recommender with a single title, using the default top_n.
recommend_movies('Avengers: Age of Ultron')

Unnamed: 0,original_title
7618,The Avengers
8581,Avengers: Infinity War
5914,Serenity
8582,Thor: Ragnarok
8294,Captain America: The Winter Soldier


In [26]:
import pickle

# Bundle the processed frame and the blended similarity matrix into one object
# so both can be restored together from a single file.
model_data = dict(dataframe=df, similarity_matrix=final_sim)

# Serialise the bundle to the Colab working directory.
with open("/content/hybrid_content_model.pkl", mode="wb") as f:
    pickle.dump(model_data, f)

print("✅ Model saved as hybrid_content_model.pkl")

✅ Model saved as hybrid_content_model.pkl
