<a href="https://colab.research.google.com/github/Aya11ali/Shouf/blob/main/Hybrid_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
# !pip -q install camel-tools

In [52]:
# !pip install numpy==1.23.5

In [53]:
# !pip install -q gensim

In [1]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from typing import List, Iterable, Dict, Union,Tuple

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

import os
import scipy
import gensim
import gensim.downloader as api
import torch
from gensim.models import KeyedVectors


import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem import SnowballStemmer
# from camel_tools.tokenizers.word import simple_word_tokenize as arabic_tokenizer
# from camel_tools.morphology.analyzer import Analyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

import joblib
import logging

In [2]:
import warnings

warnings.filterwarnings("ignore")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# **Collaborative Filtering Recommendation System**

In [3]:
class ILoad_and_shuffle_Dataset(ABC):
  @abstractmethod
  def load_dataset(self):
    pass

  @abstractmethod
  def shuffle_data(self):
    pass

class Load_and_shuffle_Dataset(ILoad_and_shuffle_Dataset):
  def __init__(self,dataset_path:str,random_seed:int=42):
    self.random_seed = random_seed
    np.random.seed(self.random_seed) # Set seed for NumPy's random number generator to ensure reproducible results
    self.dataset_path = dataset_path
    self.df = None
    self.load_dataset()
    self.shuffle_data()


  def load_dataset(self):
    self.df = pd.read_csv(self.dataset_path)

  def shuffle_data(self):
    self.df = self.df.sample(frac=1, random_state=self.random_seed).reset_index(drop=True)

The following module defines a structure for calculating interaction scores in a dataset based on different user actions (e.g., "watch", "like", "comment", "subscribe"). It uses a column-matching strategy to identify relevant columns, applies weights to interaction types, and computes a final score for each record.

In [4]:
class IInteractionScorer(ABC):
    @abstractmethod
    def compute_interaction_score(self):
        pass


class ColumnMatcher(ABC):
    @abstractmethod
    def match_columns(self, df_columns, interaction_type):
        pass


class SubstringMatcher(ColumnMatcher):
    def match_columns(self, df_columns, interaction_type=None):
        if interaction_type is None:
            raise ValueError("interaction_type must be provided")
        return [col for col in df_columns if interaction_type in col]


class InteractionScorer(IInteractionScorer):
    def __init__(self, dataframe, weights:dict=None, matcher: ColumnMatcher = SubstringMatcher):
        self.df = dataframe
        self.weights = weights or {
            "watch": 1,
            "like": 2,
            "comment": 3,
            "subscribe": 5
        }
        self.matcher = matcher()
        self.compute_interaction_score()

    def compute_interaction_score(self):
        interaction_score = 0
        for interaction_type, weight in self.weights.items():
            matched_cols = self.matcher.match_columns(self.df.columns, interaction_type)
            for col in matched_cols:
                interaction_score += self.df[col] * weight
        self.df["interaction_score"] = interaction_score


In [5]:
class DataPreparingManager:
  def __init__(self, loader:ILoad_and_shuffle_Dataset, dataset_path:str,score:IInteractionScorer):
    self.loader = loader(dataset_path)
    self.df = self.loader.df
    self.interaction_score = score(self.df)

  def prepare_data(self):
    return self.df

In [7]:
class ISparseMatrixConverter(ABC):
    @abstractmethod
    def create_sparse_matrix(self)  -> csr_matrix:
        pass

class SparseMatrixConverter(ISparseMatrixConverter):
    def __init__(self, dense_matrix: pd.DataFrame):
        self.df = dense_matrix
        self._sparse_matrix = None

    def create_sparse_matrix(self)  -> csr_matrix:
        if self._sparse_matrix is None:
            self._sparse_matrix = csr_matrix(self.df.values)
        return self._sparse_matrix

    @property
    def sparse_matrix(self):
        return self.create_sparse_matrix()

In [9]:
class IKNNModelTrainer(ABC):
    @abstractmethod
    def train_model(self) -> NearestNeighbors:
        pass

class KNNModelTrainer(IKNNModelTrainer):
    def __init__(self, sparse_matrix: csr_matrix, metric: str = "cosine",
                 algorithm: str = "brute"):
        self.sparse_matrix = sparse_matrix
        self.metric = metric
        self.algorithm = algorithm
        self._model = None

    def train_model(self) -> NearestNeighbors:
      try:
          if self._model is None:
              self._model = NearestNeighbors(metric=self.metric,
                                             algorithm=self.algorithm)
              self._model.fit(self.sparse_matrix)

      except Exception as e:
          raise TrainingError(f"Model training failed: {e}")

      return self._model

    @property
    def model(self) -> NearestNeighbors:
        return self.train_model()


## User Based Filter

The following  module defines an abstract interface and a concrete implementation for creating a ***User-Item interaction*** matrix from a given dataset.

`UserItemMatrix` (Concrete Implementation)


Implements `IUserItemMatrix` to generate a matrix from a DataFrame using specified rows, columns, and values.

**Constructor Parameters:**

`dataframe` (pd.DataFrame): Input dataset.

`row` (str): Column to use as index (e.g., user_id).

`column` (str): Column to use as columns (e.g., video_id).

`values` (str): Column containing interaction values (e.g., interaction_score).

In [8]:
class IUserItemMatrix(ABC):
  @abstractmethod
  def create_user_item_matrix(self)->pd.DataFrame:
    pass

class UserItemMatrix(IUserItemMatrix):
  def __init__(self, dataframe:pd.DataFrame, row:str, column:str, values:str):
    self.df = dataframe
    self.row = row
    self.column = column
    self.values = values

  def create_user_item_matrix(self)->pd.DataFrame:
    if self.row not in self.df.columns or self.column not in self.df.columns:
        raise InvalidMatrixError(f"Invalid columns: {self.row} or {self.column} not found in the dataframe.")

    return self.df.pivot(index=self.row, columns=self.column, values=self.values).fillna(0)

  @property
  def matrix(self) -> pd.DataFrame:
      return self.create_user_item_matrix()

The following class Validates whether a given user_id exists in the user-item interaction matrix, ensuring the user has interacted with the system and that the matrix is properly initialized.

In [10]:
class IUserInteractionChecker(ABC):
    @abstractmethod
    def check_user_interaction(self, user_id: int, user_item_matrix: pd.DataFrame) -> bool:
        pass

class UserInteractionChecker(IUserInteractionChecker):
    def check_user_interaction(self, user_id: int, user_item_matrix: pd.DataFrame) -> bool:
        if user_item_matrix is None or user_item_matrix.empty:
            raise InvalidMatrixError("User-Item matrix is empty or not initialized.")
        if user_id not in user_item_matrix.index:
            raise InvalidUserIdError(user_id)
        return True


In [11]:
class IUserIndexFinder(ABC):
    @abstractmethod
    def _find_user_index(self) -> int:
        """Finds the index of the user in the user-item matrix."""
        pass

    @property
    @abstractmethod
    def index(self):
        pass

class UserIndexFinder(IUserIndexFinder):
    def __init__(self, user_id: int, user_item_matrix: pd.DataFrame, user_check: IUserInteractionChecker):
        self.user_id = user_id
        self.user_item_matrix = user_item_matrix
        self.user_check = user_check

    def _find_user_index(self) -> int:
        # Check user interaction when needed
        self.user_check.check_user_interaction(self.user_id, self.user_item_matrix)

        return self.user_item_matrix.index.get_loc(self.user_id)

    @property
    def index(self) -> int:
        return self._find_user_index()


In [22]:
class ISimilarUserFinder(ABC):
  @abstractmethod
  def find_similar_users(self, n_neighbors: int = 5) -> np.ndarray:
      """Returns indices of similar users for a given user ID."""
      pass

  @property
  @abstractmethod
  def similar_users(self) -> np.ndarray:
      """Returns cached similar users."""
      pass

class SimilarUserFinder(ISimilarUserFinder):
  def __init__(self, user_id: int, knn_model, user_item_matrix: pd.DataFrame,
              user_index: IUserIndexFinder, user_check: IUserInteractionChecker):
      self.user_id = user_id
      self.knn_model = knn_model
      self.user_item_matrix = user_item_matrix
      self.user_check = user_check
      self.user_index = user_index
      self._similar_users = None

      self.user_check.check_user_interaction(self.user_id, self.user_item_matrix)
      self._user_index = self.user_index.index

  def find_similar_users(self, n_neighbors: int = 5) -> np.ndarray:

      if self.knn_model is None:
          raise RecommendationSystemError("KNN model is not provided.")

      # Get the user vector
      user_vector = self.user_item_matrix.iloc[self._user_index].values.reshape(1, -1)

      # Find neighbors
      distances, indices = self.knn_model.kneighbors(user_vector,
                                                     n_neighbors=n_neighbors + 1)

      # Exclude the first index (it will be the user itself)
      self._similar_users = indices[0][1:]
      return self._similar_users

  @property
  def similar_users(self) -> np.ndarray:
    if self._similar_users is None:
        return self.find_similar_users()
    return self._similar_users

The following class provides user-based collaborative filtering recommendations. The class uses the similar users' interactions with items to recommend items to a given user, excluding items the user has already interacted with.

In [23]:
class IUserBasedRecommender(ABC):
    @abstractmethod
    def recommend(self, top_n: int = 5) -> List[int]:
        """Returns a list of item IDs recommended for the user."""
        pass

class UserBasedRecommender(IUserBasedRecommender):
    def __init__(self, user_id: int, user_item_matrix: pd.DataFrame,
                 similar_user_finder: ISimilarUserFinder):
        self.user_id = user_id
        self.user_item_matrix = user_item_matrix
        self.similar_user_finder = similar_user_finder

    def recommend(self, top_n: int = 5) -> List[int]:
        similar_user_indices = self.similar_user_finder.similar_users

        similar_users_matrix = self.user_item_matrix.iloc[similar_user_indices]

        item_scores = similar_users_matrix.sum(axis=0)

        user_interactions = self.user_item_matrix.loc[self.user_id]

        unseen_items = item_scores[user_interactions == 0]

        top_items = unseen_items.sort_values(ascending=False).head(top_n)

        return [(item, score) for item, score in top_items.items()]


## Item based recommendation system  

In [24]:
class IItemSimilarityFinder(ABC):
    @abstractmethod
    def find_similar_items(self, n_neighbors: int = 5) -> np.ndarray:
      """Finds and returns similar items to the given item ID."""
      pass

class ItemSimilarityFinder(IItemSimilarityFinder):
    def __init__(self, item_id: int, item_user_matrix: pd.DataFrame, knn_model: NearestNeighbors):
        self.item_user_matrix = item_user_matrix
        self.knn_model = knn_model
        self.item_id = item_id
        self._similar_videos = None

    def find_similar_items(self, item_id: int, n_neighbors: int = 5):
        if item_id not in self.item_user_matrix.index:
            raise InvalidVideoIdError(item_id)

        item_vector = self.item_user_matrix.loc[item_id].values.reshape(1, -1)
        distances, indices = self.knn_model.kneighbors(item_vector, n_neighbors=n_neighbors + 1)

        similar_items = self.item_user_matrix.index[indices[0][1:]].tolist()
        similarities = distances[0][1:].tolist()

        return similar_items, similarities

    @property
    def similar_videos(self) -> np.ndarray:
        if self._similar_videos is None:
            self._similar_videos = self.find_similar_items(self.item_id)
        return self._similar_videos


In [26]:
class IUserInteractionFetcher(ABC):
    @abstractmethod
    def get_interacted_items(self, user_id: int) -> List[int]:
        pass

class UserInteractionFetcher(IUserInteractionFetcher):
    def __init__(self, user_item_matrix: pd.DataFrame, user_checker: IUserInteractionChecker):
        self.user_item_matrix = user_item_matrix
        self.user_checker = user_checker

    def get_interacted_items(self, user_id: int) -> List[int]:
        self.user_checker.check_user_interaction(user_id, self.user_item_matrix)

        user_interactions = self.user_item_matrix.loc[user_id]
        return user_interactions[user_interactions > 0].index.tolist()

In [27]:
class IItemScoreAccumulator(ABC):
    @abstractmethod
    def reset(self) -> None:
        pass

    @abstractmethod
    def accumulate(self, similar_items: Iterable[int], interacted_items: Iterable[int]) -> None:
        pass

    @abstractmethod
    def get_scores(self) -> Dict[int, float]:
        pass

class ItemScoreAccumulator(IItemScoreAccumulator):
    def __init__(self):
        self._scores = {}

    def reset(self) -> None:
        self._scores.clear()


    def accumulate(self, similar_items: Iterable[int], interacted_items: Iterable[int], similarities: Iterable[float]) -> None:
        for item, similarity in zip(similar_items, similarities):
            if item not in interacted_items:
                self._scores[item] = self._scores.get(item, 0) + similarity

    def get_scores(self) -> Dict[int, float]:
        return self._scores


In [28]:
class IRecommendationRanker(ABC):
    @abstractmethod
    def rank(self, item_scores: Dict[int, float], top_n: int) -> List[int]:
        pass

class RecommendationRanker(IRecommendationRanker):
    def rank(self, item_scores: Dict[int, float], top_n: int) -> List[int]:
        sorted_items = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, _ in sorted_items[:top_n]]

In [29]:
class ItemBasedRecommender:
    def __init__(self, user_id: int, user_item_matrix: pd.DataFrame,
                 similarity_finder: IItemSimilarityFinder, score_accumulator: IItemScoreAccumulator,
                 interaction_fetcher: IUserInteractionFetcher, ranker: IRecommendationRanker):
        self.user_id = user_id
        self.user_item_matrix = user_item_matrix
        self.similarity_finder = similarity_finder
        self.score_accumulator = score_accumulator
        self.interaction_fetcher = interaction_fetcher
        self.ranker = ranker

    def recommend(self, top_n: int = 5) -> List[Tuple[int, float]]:
        if self.user_id not in self.user_item_matrix.index:
            raise ValueError(f"User ID {self.user_id} not found in matrix.")

        interacted_items = self.interaction_fetcher.get_interacted_items(self.user_id)
        self.score_accumulator.reset()

        for item in interacted_items:
            similar_items, similarities = self.similarity_finder.find_similar_items(item)
            self.score_accumulator.accumulate(similar_items, interacted_items, similarities)

        item_scores = self.score_accumulator.get_scores()
        ranked_items = self.ranker.rank(item_scores, top_n)
        recommended_items = [(item, item_scores.get(item, 0)) for item in ranked_items]
        return recommended_items


In [30]:
class ItemSimilarityAdapter(IItemSimilarityFinder):
    def __init__(self, item_user_matrix, knn_model):
        self.item_user_matrix = item_user_matrix
        self.knn_model = knn_model

    def find_similar_items(self, item_id: int, n_neighbors: int = 5):
        if item_id not in self.item_user_matrix.index:
            raise InvalidVideoIdError(item_id)

        item_vector = self.item_user_matrix.loc[item_id].values.reshape(1, -1)
        distances, indices = self.knn_model.kneighbors(item_vector, n_neighbors=n_neighbors + 1)

        similar_items = self.item_user_matrix.index[indices[0][1:]].tolist()
        similarities = distances[0][1:].tolist()
        return similar_items, similarities


## Collaborative Filter

In [31]:
def get_collaborative_filter(Dataset_path: str, target_user_id: int,top_n:int):

    df = DataPreparingManager(Load_and_shuffle_Dataset, Dataset_path, InteractionScorer)
    df = df.prepare_data()

    user_item_builder = UserItemMatrix(df, row="user_id", column="video_id", values="interaction_score")
    user_item_matrix = user_item_builder.matrix
    item_user_matrix = user_item_matrix.T

    user_sparse_converter = SparseMatrixConverter(user_item_matrix)
    user_sparse_matrix = user_sparse_converter.sparse_matrix
    item_sparse_converter = SparseMatrixConverter(item_user_matrix)
    item_sparse_matrix = item_sparse_converter.sparse_matrix

    user_knn_trainer = KNNModelTrainer(sparse_matrix=user_sparse_matrix, metric='cosine', algorithm='brute')
    user_knn_model = user_knn_trainer.model
    item_knn_trainer = KNNModelTrainer(sparse_matrix=item_sparse_matrix, metric='cosine', algorithm='brute')
    item_knn_model = item_knn_trainer.model

    user_checker = UserInteractionChecker()
    user_index_finder = UserIndexFinder(user_id=target_user_id,
                                        user_item_matrix=user_item_matrix,
                                        user_check=user_checker)

    similar_user_finder = SimilarUserFinder(user_id=target_user_id,
                                            knn_model=user_knn_model,
                                            user_item_matrix=user_item_matrix,
                                            user_index=user_index_finder,
                                            user_check=user_checker)

    user_recommender = UserBasedRecommender(user_id=target_user_id,
                                            user_item_matrix=user_item_matrix,
                                            similar_user_finder=similar_user_finder)

    user_recommendations = user_recommender.recommend(top_n)

    user_recs_dict = {rec[0]: rec[1] for rec in user_recommendations}

    interaction_checker = UserInteractionChecker()
    interaction_fetcher = UserInteractionFetcher(user_item_matrix=user_item_matrix,
                                                 user_checker=interaction_checker)
    score_accumulator = ItemScoreAccumulator()
    ranker = RecommendationRanker()

    similarity_finder = ItemSimilarityAdapter(item_user_matrix, item_knn_model)

    item_recommender = ItemBasedRecommender(user_id=target_user_id,
                                            user_item_matrix=user_item_matrix,
                                            similarity_finder=similarity_finder,
                                            score_accumulator=score_accumulator,
                                            interaction_fetcher=interaction_fetcher,
                                            ranker=ranker)

    item_recommendations = item_recommender.recommend(top_n)

    item_recs_dict = {rec[0]: rec[1] for rec in item_recommendations}

    return user_recs_dict, item_recs_dict


# **Content Filter**

In [32]:
class DataFramePreprocessor:
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def drop_columns(self, columns: List[str]):
        self.df = self.df.drop(columns=columns, axis=1)
        return self

    def remove_duplicates(self, subset: List[str] = ["video_id"]):
        self.df = self.df.drop_duplicates(subset=subset)
        return self

    def get(self):
        return self.df

In [33]:
class TextPreprocessor(ABC):
  @abstractmethod
  def _clean_text(self):
      pass

  @abstractmethod
  def _tokenize(self):
      pass

  @abstractmethod
  def text_processor(self, text):
      pass

class EnglishTextPreprocessor(TextPreprocessor):
  def __init__(self):
      self.stopwords = set(stopwords.words('english'))
      self.stemmer = SnowballStemmer('english')
      self.tokens = []
      self.text = ""

  def _clean_text(self):
      """Converts text to lowercase and tokenizes it."""
      self.tokens = word_tokenize(self.text.lower())

  def _tokenize(self):
      """Removes stopwords and punctuation and applies stemming."""
      self.tokens = [
          self.stemmer.stem(word)
          for word in self.tokens
          if word not in self.stopwords and word not in string.punctuation
      ]
      return self.tokens

  def text_processor(self, text):
      """Process the text: clean and tokenize."""
      self.text = text
      self._clean_text()
      return self._tokenize()

class ArabicTextPreprocessor(TextPreprocessor):
  def __init__(self):
      self.stopwords = set(stopwords.words('arabic'))
      self.stemmer = CamelStemmer()
      self.tokens = []
      self.text = ""

  def _clean_text(self):
      """Tokenizes Arabic text."""
      self.tokens = arabic_tokenizer(self.text)

  def _tokenize(self):
      """Removes stopwords and punctuation and applies stemming."""
      self.tokens = [
          self.stemmer.stem(word)
          for word in self.tokens
          if word not in self.stopwords and word not in string.punctuation
      ]
      return self.tokens

  def text_processor(self, text):
      """Process the text: clean and tokenize."""
      self.text = text
      self._clean_text()
      return self._tokenize()

In [35]:
class DataFrameTextProcessor:
    def __init__(self, preprocessor: TextPreprocessor):
        self.preprocessor = preprocessor

    def process(self, df: pd.DataFrame, title_col: str, genre_col: str, new_col_name: str = "tokenized_text") -> pd.DataFrame:
        df = df.copy()
        df["combined_text"] = df[title_col] + " " + df[genre_col]
        df[new_col_name] = df["combined_text"].apply(self.preprocessor.text_processor)
        return df

In [36]:
class Word2VecManager:

    @staticmethod
    def download_pretrained(name: str = "word2vec-google-news-300"):
        return api.load(name)

    @staticmethod
    def save_model(model, path: str):
        model.save(path)

    @staticmethod
    def load_model(path: str):
        return KeyedVectors.load(path)

    @staticmethod
    def load_gensim_binary(path: str):
        return KeyedVectors.load_word2vec_format(path, binary=True)


In [37]:
def compute_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer.vocabulary_

In [38]:
def build_corpus_from_tokens(token_column):
    return token_column.apply(lambda tokens: " ".join(tokens))

In [39]:
def get_weighted_word2vec(tokens, index, model, tfidf_matrix, tfidf_vocab, vector_size=300):
    vector = np.zeros(vector_size)
    weight_sum = 0

    for word in tokens:
        if word not in model or word not in tfidf_vocab:
            continue
        weight = tfidf_matrix[index, tfidf_vocab[word]]
        vector += model[word] * weight
        weight_sum += weight

    return vector / weight_sum if weight_sum > 0 else vector


In [40]:
def compute_video_vectors(df, model, tfidf_matrix, tfidf_vocab):
    df = df.reset_index(drop=True)
    df["video_vector"] = df.apply(
        lambda row: get_weighted_word2vec(
            row["tokenized_text"],
            row.name,
            model,
            tfidf_matrix,
            tfidf_vocab
        ),
        axis=1
    )
    return df

In [44]:
def compute_cosine_similarity(vectors):
    return cosine_similarity(np.vstack(vectors))

In [41]:
def recommend_similar_videos(video_indices, df, similarity_matrix, top_n=5):
    recommendations = []
    for video_index in video_indices:
        similarities = similarity_matrix[video_index]
        similar_indices = np.argsort(similarities)[::-1]
        top_similar = [i for i in similar_indices if i != video_index and i not in video_indices][:top_n]

        recommendations.extend([
            {
                "title": df.iloc[i]["video_title"],
                "genre": df.iloc[i]["video_genre"],
                "score": round(float(similarities[i]), 4)
            }
            for i in top_similar
        ])

    return recommendations


In [45]:
def get_content_based_recommendations(csv_path: str, watched_video_indices: List[int], top_n: int ):
    df = pd.read_csv(csv_path)
    df_preprocessed = (
        DataFramePreprocessor(df)
        .drop_columns(['user_id', 'watched', 'liked', 'commented', 'subscribed'])
        .remove_duplicates(subset=["video_title", "video_genre"])
        .get()
    )

    text_preprocessor = EnglishTextPreprocessor()
    text_processor = DataFrameTextProcessor(text_preprocessor)
    df_text_processed = text_processor.process(df_preprocessed,
                                               title_col="video_title",
                                               genre_col="video_genre"
                                               )

    corpus = build_corpus_from_tokens(df_text_processed["tokenized_text"])
    tfidf_matrix, tfidf_vocab = compute_tfidf(corpus)

    word2vec = Word2VecManager()
    if not os.path.exists("word2vec_model.bin"):
        model = word2vec.download_pretrained()
        word2vec.save_model(model, "word2vec_model.bin")
    else:
        model = word2vec.load_model("word2vec_model.bin")

    df_vectors = compute_video_vectors(df_text_processed, model, tfidf_matrix, tfidf_vocab)
    similarity_matrix = compute_cosine_similarity(df_vectors["video_vector"].tolist())

    agg_similarity = np.zeros(similarity_matrix.shape[0])
    for idx in watched_video_indices:
        agg_similarity += similarity_matrix[idx]

    for idx in watched_video_indices:
        agg_similarity[idx] = -1

    recommended_indices = np.argsort(agg_similarity)[::-1][:top_n]

    recommendations = [
        {
            "title": df_vectors.iloc[i]["video_title"],
            "genre": df_vectors.iloc[i]["video_genre"],
            "score": round(float(agg_similarity[i]), 4)
        }
        for i in recommended_indices
    ]

    return recommendations


In [49]:
import json

def get_hybrid_recommendations(data_set_path: str,
                                user_id: int,
                                user_weight=0.4,
                                item_weight=0.4,
                                content_weight=0.2,
                                k=10,
                                n_recommendations=10):

    df = pd.read_csv(data_set_path)
    df = df[pd.to_numeric(df['video_id'], errors='coerce').notna()]
    df['video_id'] = df['video_id'].astype(int)

    user_recs, item_recs = get_collaborative_filter(data_set_path, user_id,top_n = k)

    final_scores = {}
    for vid, score in user_recs.items():
        final_scores[vid] = final_scores.get(vid, 0) + score * user_weight

    for vid, score in item_recs.items():
        final_scores[vid] = final_scores.get(vid, 0) + score * item_weight

    watched_video_ids = df[df['user_id'] == user_id]['video_id'].tolist()

    valid_watched_video_ids = [video_id for video_id in watched_video_ids if video_id in df['video_id'].values]

    valid_watched_video_indices = [df[df['video_id'] == video_id].index[0] for video_id in valid_watched_video_ids]

    content_recs = get_content_based_recommendations(csv_path=data_set_path,
                                                     watched_video_indices=valid_watched_video_indices,
                                                     top_n=k)

    for rec in content_recs:
        video_title = rec["title"]
        video_id = df[df['video_title'] == video_title]['video_id'].values[0]
        score = rec["score"]
        final_scores[video_id] = final_scores.get(video_id, 0) + score * content_weight

    sorted_recs = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)

    top_recs = sorted_recs[:n_recommendations]

    final_recommendations = []
    for video_id, score in top_recs:
        video_id = int(video_id)
        title = df[df['video_id'] == video_id]['video_title'].values[0] if len(df[df['video_id'] == video_id]) > 0 else "Unknown"
        final_recommendations.append({
            "video_id": video_id,
            "title": title
        })

    output_filename = f"hybrid_recommendations_user_{user_id}.json"
    with open(output_filename, "w", encoding='utf-8') as f:
        json.dump(final_recommendations, f, indent=4, ensure_ascii=False)

    print(f"\nHybrid recommendations saved to '{output_filename}'.")

def main():
    Dataset_path = "/content/modified_video_recommendation_dataset.csv"
    hybrid_recommendations_file = get_hybrid_recommendations(Dataset_path,
                                                             user_id=10,
                                                             user_weight=0.4,
                                                             item_weight=0.4,
                                                             content_weight=0.2,
                                                             k=20,
                                                             n_recommendations=20)
    print(f"Recommendations written to: {hybrid_recommendations_file}")

if __name__ == '__main__':
    main()



Hybrid recommendations saved to 'hybrid_recommendations_user_10.json'.
Recommendations written to: None


# Recommendation System Error Exceptions

In [None]:
class RecommendationSystemError(Exception):
    """Base class for all exceptions in the recommendation system."""
    pass

class InvalidMatrixError(RecommendationSystemError):
    """Raised when a matrix is invalid or does not meet the required conditions."""

    def __init__(self, message="Matrix is invalid"):
        self.message = message
        super().__init__(self.message)

class InvalidUserIdError(RecommendationSystemError):
    """Raised when a user ID is not found in the system."""

    def __init__(self, user_id, message="User ID not found"):
        self.user_id = user_id
        self.message = f"{message}: {user_id}"
        super().__init__(self.message)

class TrainingError(RecommendationSystemError):
    """Raised when an error occurs during model training."""

    def __init__(self, message="Model training failed"):
        self.message = message
        super().__init__(self.message)

class InvalidVideoIdError(RecommendationSystemError):
  """Raised when a video ID is not found in the system."""

  def __init__(self, video_id, message="Video ID not found"):
      self.video_id = video_id
      self.message = f"{message}: {video_id}"
      super().__init__(self.message)

class MissingRequiredData(RecommendationSystemError):
    """Raised when required data is missing."""

    def __init__(self, message="Missing required data"):
        self.message = message
        super().__init__(self.message)

class InvalidInputError(RecommendationSystemError):
    """Raised when invalid input is provided."""

    def __init__(self, message="Invalid input"):
        self.message = message
        super().__init__(self.message)