In [None]:
# For Building recommender systems
!pip install scikit-surprise



In [None]:
import joblib
import random # For generating random numbers or making random selections
import numpy as np # For numerical operations, such as arrays and mathematical functions
import pickle # For serializing and deserializing Python objects
import pandas as pd # For data manipulation and analysis, especially with DataFrames
from surprise import SVD, Dataset, Reader # For building recommendation systems
from sklearn.preprocessing import MinMaxScaler # To scale features to a specified range, typically [0, 1]
from datetime import datetime, timedelta #For handling date and time operations

In [None]:
# Read every CSV export into its own DataFrame (files are read in the order listed).
news_df, rec_items_df, rec_feedback_df, users_df, rec_users_df = map(
    pd.read_csv,
    (
        '/content/news.csv',
        '/content/rec_items.csv',
        '/content/rec_feedback.csv',
        '/content/users.csv',
        '/content/rec_users.csv',
    ),
)

In [None]:
# Load the CBF model (TF-IDF vectorizer and cosine similarity matrix).
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that come from a trusted source.
tfidf = joblib.load('/content/tfidf_vectorizer.pkl')
cosine_sim = joblib.load('/content/cosine_similarity_matrix.pkl')

# Load the CF model (e.g., a pre-trained collaborative filtering model)
cf_model = joblib.load('/content/svd_recommender_model.pkl')

# Alias so that code referring to the CF model as `svd_model` resolves on a
# fresh kernel instead of raising NameError.
svd_model = cf_model

In [None]:
def get_recent_trending_news(rec_feedback_df, top_n=5, days=7):
    """Fetch trending articles based on recent engagement (last 'days').

    Args:
        rec_feedback_df: Feedback DataFrame with an 'item_id' column and,
            optionally, a 'timestamp' column. It is NOT modified.
        top_n: Maximum number of item ids to return.
        days: Size of the look-back window, in days, counted from now.

    Returns:
        A list of up to ``top_n`` item ids ordered by engagement count
        (most engaged first). Falls back to all-time counts when there is no
        'timestamp' column, no parseable timestamp, or no row in the window.
    """

    def _most_engaged(frame):
        # Most frequently occurring item ids, most frequent first.
        return frame["item_id"].value_counts().head(top_n).index.tolist()

    # Check if the 'timestamp' column exists in the DataFrame
    if "timestamp" not in rec_feedback_df.columns:
        print("No timestamp column found. Falling back to general trending news.")
        return _most_engaged(rec_feedback_df)

    # Parse into a local Series so the caller's DataFrame keeps its original
    # 'timestamp' column. Invalid timestamps become NaT (Not a Time).
    timestamps = pd.to_datetime(rec_feedback_df["timestamp"], errors='coerce')

    # Check for invalid timestamps
    if timestamps.isnull().all():
        print("All timestamps are invalid. Falling back to general trending news.")
        return _most_engaged(rec_feedback_df)

    # Start of the look-back window; NaT rows compare False and are dropped
    cutoff = datetime.now() - timedelta(days=days)
    recent_engagements = rec_feedback_df[timestamps >= cutoff]

    if recent_engagements.empty:
        print("No recent engagements found. Falling back to general trending news.")
        return _most_engaged(rec_feedback_df)

    # Top N most frequent items among the recent engagements
    return _most_engaged(recent_engagements)

In [None]:
def calculate_user_alpha(user_id, rec_feedback_df, saturation=100, default_alpha=0.5):
    """Calculate a dynamic "alpha" weight from a user's interaction history.

    Args:
        user_id: Id looked up in the 'user_id' column of ``rec_feedback_df``.
        rec_feedback_df: Feedback DataFrame with one row per interaction.
        saturation: Number of interactions at which alpha reaches 1.0
            (defaults to 100, the previously hard-coded value).
        default_alpha: Neutral value returned when the user has no
            interactions (defaults to 0.5).

    Returns:
        A float in [0, 1]: ``interactions / saturation`` capped at 1.0, or
        ``default_alpha`` when the user has no interactions.

    Raises:
        ValueError: If ``saturation`` is not positive.
    """
    if saturation <= 0:
        raise ValueError("saturation must be a positive number")

    # Count the rows belonging to this user without materialising a filtered frame
    total_interactions = int((rec_feedback_df["user_id"] == user_id).sum())

    if total_interactions == 0:
        return float(default_alpha)

    # The ratio is strictly positive here, so only the upper bound needs clamping
    return float(min(1.0, total_interactions / saturation))

In [None]:
def max_marginal_relevance(recommendations, hybrid_scores, cosine_sim, top_n=5,
                           penalty=0.7, id_to_index=None):
    """Greedily pick up to ``top_n`` items, penalising similarity to earlier picks.

    At each step the candidate with the highest
    ``hybrid_scores[item] - penalty * (sum of similarities to already selected items)``
    is selected. ``hybrid_scores`` is not modified.

    Args:
        recommendations: Iterable of candidate item ids (each must be a key of
            ``hybrid_scores``).
        hybrid_scores: Mapping of item id -> relevance score.
        cosine_sim: Either a nested mapping ``{item: {other_item: similarity}}``
            (anything exposing ``.get``), or a 2-D matrix indexed by position.
            A matrix is only consulted when ``id_to_index`` is supplied.
        top_n: Maximum number of items to return.
        penalty: Weight of the redundancy penalty (0.7 by default).
        id_to_index: Optional mapping of item id -> row/column position in a
            matrix-style ``cosine_sim``.

    Returns:
        List of selected item ids in selection order. Pairs whose similarity
        cannot be resolved contribute 0, so the result degrades to a plain
        ranking by score.
    """

    def _similarity(item, other_item):
        if hasattr(cosine_sim, "get"):
            return cosine_sim.get(item, {}).get(other_item, 0)
        if id_to_index is not None and item in id_to_index and other_item in id_to_index:
            return cosine_sim[id_to_index[item]][id_to_index[other_item]]
        return 0

    # De-duplicate while keeping the caller's order (ties resolve to the earliest)
    remaining = list(dict.fromkeys(recommendations))
    redundancy = {item: 0.0 for item in remaining}
    selected_items = []

    while remaining and len(selected_items) < top_n:
        best = max(remaining, key=lambda item: hybrid_scores[item] - penalty * redundancy[item])
        selected_items.append(best)
        remaining.remove(best)
        # Accumulate each remaining candidate's similarity to the new pick
        for item in remaining:
            redundancy[item] += _similarity(item, best)

    return selected_items

In [None]:
def normalize_scores(scores):
    """Rescale the values of a score dictionary to the range [0, 1].

    Args:
        scores: Mapping of key -> numeric score.

    Returns:
        A new dictionary with the same keys whose values were min-max scaled
        with ``MinMaxScaler``; an empty dictionary when ``scores`` is empty.
    """
    # Nothing to scale
    if not scores:
        return {}

    keys = list(scores.keys())

    # MinMaxScaler expects a 2-D input, so present the scores as one column
    column = np.array([scores[key] for key in keys]).reshape(-1, 1)

    # Fit and transform in one step, then go back to a flat 1-D array
    scaled = MinMaxScaler().fit_transform(column).flatten()

    # Pair every key with its rescaled score
    return dict(zip(keys, scaled))

In [None]:
def recommend_content_based(news_id, top_n=5, news=None, similarity=None):
    """Return the articles most similar to ``news_id``, most similar first.

    Args:
        news_id: Value looked up in the 'id' column of the news table.
        top_n: Maximum number of similar articles to return.
        news: Optional DataFrame with 'id' and 'title' columns; defaults to
            the notebook-level ``news_df``.
        similarity: Optional similarity matrix indexed by row position;
            defaults to the notebook-level ``cosine_sim``. Row/column ``i`` is
            assumed to describe the i-th row of ``news`` (TODO confirm against
            the cell that built the matrix).

    Returns:
        A DataFrame with columns ['id', 'title'] ordered by decreasing
        similarity, or an empty list when ``news_id`` is unknown.
    """
    news = news_df if news is None else news
    similarity = cosine_sim if similarity is None else similarity

    # Positional lookup: works regardless of the DataFrame's index labels
    item_ids = news['id'].to_numpy()
    matches = np.flatnonzero(item_ids == news_id)

    if matches.size == 0:
        print(f"Item ID {news_id} not found in cosine similarity matrix")
        return []

    # With duplicated ids the last occurrence is used
    index = int(matches[-1])

    # Similarity of the query article to every article (one matrix row)
    scores = np.asarray(similarity[index], dtype=float).ravel()

    # Stable descending sort; one extra slot leaves room to drop the query itself
    order = np.argsort(-scores, kind="stable")[: max(top_n, 0) + 1]

    # Exclude the query by position instead of assuming it sorts first
    top_positions = [int(pos) for pos in order if pos != index][: max(top_n, 0)]

    # iloc keeps the similarity ranking in the returned frame
    return news.iloc[top_positions][['id', 'title']]

In [None]:
def recommend_collaborative(user_id, model, rec_feedback_df, top_n=5):
    """Return the top N collaborative-filtering recommendations for a user.

    Args:
        user_id: Id that must appear in the 'user_id' column of
            ``rec_feedback_df``.
        model: Object whose ``predict(user_id, item_id)`` result exposes an
            ``est`` attribute (the predicted rating).
        rec_feedback_df: Feedback DataFrame with 'user_id' and 'item_id' columns.
        top_n: Maximum number of item ids to return.

    Returns:
        Item ids ordered by predicted rating, highest first; an empty list
        when the user does not occur in ``rec_feedback_df``.
    """
    # Users absent from the feedback data get no recommendations
    known_users = rec_feedback_df['user_id'].values
    if user_id not in known_users:
        return []

    # Predict a rating for every distinct item in the feedback data
    estimates = {}
    for candidate in rec_feedback_df['item_id'].unique():
        estimates[candidate] = model.predict(user_id, candidate).est

    # Highest predicted rating first; keep only the first top_n ids
    ranked = sorted(estimates.items(), key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in ranked[:top_n]]

In [None]:
def _first_position_by_id(news_df):
    """Map each article id to the position of its first row in ``news_df``."""
    positions = {}
    for position, item_id in enumerate(news_df["id"].tolist()):
        positions.setdefault(item_id, position)
    return positions


def _content_scores_for_history(seen_ids, id_to_pos, sim_matrix):
    """Score unseen articles by their mean similarity to the articles in ``seen_ids``."""
    seen_positions = [id_to_pos[item] for item in dict.fromkeys(seen_ids) if item in id_to_pos]
    if not seen_positions:
        return {}
    mean_similarity = np.asarray(np.asarray(sim_matrix)[seen_positions], dtype=float).mean(axis=0)
    already_seen = set(seen_positions)
    return {
        item_id: float(mean_similarity[position])
        for item_id, position in id_to_pos.items()
        if position not in already_seen
    }


def _pairwise_similarity(item_ids, id_to_pos, sim_matrix):
    """Build ``{item: {other_item: similarity}}`` for the given (small) set of ids."""
    known = [item for item in item_ids if item in id_to_pos]
    return {
        item: {
            other: float(sim_matrix[id_to_pos[item]][id_to_pos[other]])
            for other in known
            if other != item
        }
        for item in known
    }


def _titles_in_rank_order(news_df, item_ids):
    """Return an ['id', 'title'] frame whose rows follow the order of ``item_ids``."""
    catalog = news_df.drop_duplicates(subset="id").set_index("id")
    ordered = [item for item in dict.fromkeys(item_ids) if item in catalog.index]
    return catalog.loc[ordered, ["title"]].reset_index()


def recommend_all(user_id, rec_feedback_df, news_df, model=None, top_n=5):
    """Return CF, CBF, and Hybrid recommendation systems for the user with article titles.

    Args:
        user_id: Id of the user to recommend for.
        rec_feedback_df: Feedback DataFrame with 'user_id' and 'item_id' columns.
        news_df: News DataFrame with 'id' and 'title' columns. Its row order is
            assumed to match the rows/columns of the notebook-level
            ``cosine_sim`` matrix (TODO confirm against the cell that built it).
        model: Collaborative-filtering model exposing ``predict(uid, iid).est``.
            When omitted, the notebook-level ``cf_model`` is used.
        top_n: Maximum number of recommendations per list.

    Returns:
        A dict with keys "CF", "CBF" and "Hybrid", each an ['id', 'title']
        DataFrame in rank order (best first). Users without any interaction
        get recent trending articles under all three keys.

    Raises:
        KeyError: If 'id' is missing from ``news_df`` or 'item_id' from
            ``rec_feedback_df``.
    """
    # Ensure column names are correct
    if 'id' not in news_df.columns:
        raise KeyError("'id' column not found in news_df")
    if 'item_id' not in rec_feedback_df.columns:
        raise KeyError("'item_id' column not found in rec_feedback_df")

    # Resolve the default at call time so the definition does not depend on a
    # global that may not exist yet
    if model is None:
        model = cf_model

    # Filter the DataFrame for user-specific interactions
    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]

    # If the user has no interactions, trigger fallback to trending news
    if user_interactions.empty:
        print(f"No data for user {user_id}. Showing recent trending news...")
        trending_ids = get_recent_trending_news(rec_feedback_df, top_n=top_n)
        if not trending_ids:
            # Never ask random.sample for more items than exist
            pool = list(rec_feedback_df["item_id"].unique())
            trending_ids = random.sample(pool, min(top_n, len(pool)))
        trending = _titles_in_rank_order(news_df, trending_ids)
        return {"CF": trending, "CBF": trending.copy(), "Hybrid": trending.copy()}

    # Calculate dynamic alpha based on the user's interaction history
    alpha = calculate_user_alpha(user_id, rec_feedback_df)

    # Content-based (CBF): rank unseen articles by mean similarity to the
    # articles this user interacted with (the matrix is item-to-item, so it
    # must be queried with article positions, never with the user id)
    id_to_pos = _first_position_by_id(news_df)
    content_scores = _content_scores_for_history(
        user_interactions["item_id"].tolist(), id_to_pos, cosine_sim
    )
    cbf_recommendations = sorted(content_scores, key=content_scores.get, reverse=True)[:top_n]
    cbf_scores = {item: content_scores[item] for item in cbf_recommendations}

    # Collaborative filtering (CF): ids with the highest predicted rating
    cf_recommendations = list(recommend_collaborative(user_id, model, rec_feedback_df, top_n=top_n))
    cf_scores = {item: model.predict(user_id, item).est for item in cf_recommendations}

    # Normalize both CBF and CF scores to a range of [0, 1]
    cbf_scores = normalize_scores(cbf_scores)
    cf_scores = normalize_scores(cf_scores)

    # Combine the scores from both filtering approaches using the alpha value;
    # dict.fromkeys de-duplicates while keeping a deterministic order
    hybrid_scores = {}
    for item in dict.fromkeys(cbf_recommendations + cf_recommendations):
        cbf_score = cbf_scores.get(item, 0)  # Default to 0 if no CBF score
        cf_score = cf_scores.get(item, 0)  # Default to 0 if no CF score
        hybrid_scores[item] = alpha * cbf_score + (1 - alpha) * cf_score

    # Apply Maximal Marginal Relevance (MMR) to diversify the recommendations.
    # Similarities are handed over as an id-keyed nested dict, and the scores
    # as a copy so the helper cannot alter them.
    candidates = list(hybrid_scores)
    pairwise = _pairwise_similarity(candidates, id_to_pos, cosine_sim)
    hybrid_ranking = max_marginal_relevance(candidates, dict(hybrid_scores), pairwise, top_n=top_n)

    # Return a dictionary with CF, CBF, and Hybrid recommendations (rank order kept)
    return {
        "CF": _titles_in_rank_order(news_df, cf_recommendations),
        "CBF": _titles_in_rank_order(news_df, cbf_recommendations),
        "Hybrid": _titles_in_rank_order(news_df, hybrid_ranking),
    }

In [None]:
# Example: build all three recommendation lists for one user
user_id = 2329
recommendations = recommend_all(user_id, rec_feedback_df, news_df)

# Output the recommendations, each list under its own heading
print(f"Top CF recommendations for User {user_id}:", recommendations["CF"], sep="\n")

print(f"Top CBF recommendations for User {user_id}:", recommendations["CBF"], sep="\n")

print(f"Top Hybrid recommendations for User {user_id}:", recommendations["Hybrid"], sep="\n")

In [None]:
# Example ids: one user and one article
user_id, news_id = 2329, 23937

# Articles most similar to the chosen article (content-based only)
cbf_recommendations = recommend_content_based(news_id)

# Output the recommendations
print("Top CBF recommendations:", cbf_recommendations, sep="\n")