## Content-Based Filtering

In [1140]:
import pandas as pd # Data manipulation and analysis
import numpy as np # Numerical operations
import pickle # save and load serialized Python objects
import re # (regular expressions) Text processing and pattern matching
import nltk # (Natural Language Toolkit) Natural language processing
from sklearn.feature_extraction.text import TfidfVectorizer # convert text into numerical form (TF-IDF representation)
from sklearn.metrics.pairwise import cosine_similarity # Measure similarity between text data
from nltk.tokenize import word_tokenize # Split text into individual words

In [1141]:
news_df = pd.read_csv('/content/news.csv')
rec_items_df = pd.read_csv('/content/rec_items.csv')
rec_feedback_df = pd.read_csv('/content/rec_feedback.csv')
users_df = pd.read_csv('/content/users.csv')
rec_users_df = pd.read_csv('/content/rec_users.csv')

In [1142]:
# Displays the first 5 rows of the news dataset
news_df.head()

Unnamed: 0,id,title,description,published_date,breaking_news,blob_image,source_url,created_at,updated_at,published_at,created_by_id,updated_by_id,shares,comment_count,type
0,23996,අද ඩොලරයේ අගය,ශ්‍රී ලංකා මහ බැංකුව විසින් අද (13) දින නිකුත්...,2025-02-13 12:10:32.919,False,,https://www.hirunews.lk/396721/%E0%B6%85%E0%B6...,2025-02-13 09:43:07.843,2025-02-13 09:43:12.403,2025-02-13 09:43:12.131,7.0,7.0,0,0,News
1,23995,‘ක්ලීන් ශ‍්‍රී ලංකා ලිඛිතව තියෙනවා.’ මාලිමා මන...,‘ක්‍ලීන් ශ්‍රී ලංකා’ වැඩපිළිවෙල යනු කුමක්දැයි ...,2025-02-13 09:30:00,False,,https://lankacnews.com/%e0%b6%9a%e0%b7%8a%e0%b...,2025-02-13 09:42:37.114,2025-02-13 09:42:47.003,,,,0,0,News
2,23994,පාපන්දු ගෝල කණුවක් කඩා වැටී පාසැල් සිසුවෙකු ජී...,පාසැල් ක්‍රීඩාගංනයක තිබූ පාපන්දු ගෝල කණුවක් කඩ...,2025-02-13 11:10:47.133,False,,https://www.hirunews.lk/396719/%E0%B6%B4%E0%B7...,2025-02-13 09:33:20.185,2025-02-13 09:33:23.796,2025-02-13 09:33:23.747,7.0,7.0,0,0,News
3,23993,සුජීව සේනසිංහගේ මූලික අයිතිවාසිකම් පෙත්සම විභා...,තමන්ට එරෙහිව අපරාධ පරීක්ෂණ දෙපාර්තමේන්තුව විසි...,2025-02-13 10:10:29.598,False,,https://www.hirunews.lk/396716/%E0%B7%83%E0%B7...,2025-02-13 09:31:55.442,2025-02-13 09:31:59.501,2025-02-13 09:31:59.41,7.0,7.0,0,0,News
4,23992,නීති විරෝධී ධීවර දැල් දෙසීය පනහක් නීතියේ රැහැනට,"ශ්‍රී ලංකා නාවික හමුදාව, කිලිනොච්චිය මුද්දලම්ප...",2025-02-13 10:58:56.639,False,,https://www.dinamina.lk/2025/02/13/lawnorder/1...,2025-02-13 09:29:23.368,2025-02-13 09:29:26.299,2025-02-13 09:29:26.258,7.0,7.0,0,0,News


In [1143]:
# Displays the data types of each column
news_df.dtypes

Unnamed: 0,0
id,int64
title,object
description,object
published_date,object
breaking_news,bool
blob_image,float64
source_url,object
created_at,object
updated_at,object
published_at,object


In [1144]:
news_df.columns

Index(['id', 'title', 'description', 'published_date', 'breaking_news',
       'blob_image', 'source_url', 'created_at', 'updated_at', 'published_at',
       'created_by_id', 'updated_by_id', 'shares', 'comment_count', 'type'],
      dtype='object')

In [1145]:
# Prints the number of missing (null) values in each column
print(news_df.isnull().sum())

id                   0
title                0
description          0
published_date       0
breaking_news        0
blob_image        1000
source_url           0
created_at           0
updated_at           0
published_at        35
created_by_id      382
updated_by_id       35
shares               0
comment_count        0
type                 0
dtype: int64


In [1146]:
# Removes the specified columns from the DataFrame
news_df.drop(columns=["blob_image", "updated_by_id", "created_by_id", "shares", "comment_count", "type"], inplace=True)

In [1147]:
# Fills missing (NaN) values in a column with the most frequently occurring value (mode).

news_df['published_at'] = news_df['published_at'].fillna(news_df['published_at'].mode()[0])

In [1148]:
print(news_df.isnull().sum())

id                0
title             0
description       0
published_date    0
breaking_news     0
source_url        0
created_at        0
updated_at        0
published_at      0
dtype: int64


In [1149]:
# Helpful for text analysis or natural language processing tasks
news_df['content'] = news_df['title'] + " " + news_df['description']

In [1150]:
# Stop words which are don't carry significant meaning

sinhala_stop_words = [
    "අතර", "ඉන්", "එක", "එය", "ඔබ", "ඔයා", "ඔහු", "ඔවුන්",
    "ඕනෑ", "ආදිය", "ඇයි", "ඇතුලත", "ඉන්පසු",
    "ඉස්සර", "එක්", "ඔය", "ඔයාලා", "ඔයාව", "උදාහරණ", "උපුටා",
    "ඉදිරියට","එදා", "ඒ", "ඒවා", "ඕන", "ඔක්කෝම", "ඔවුන්",
    "අප", "අපේ", "ඇතුළත්", "ඇත", "ඊයේ", "ඔබට", "ඔබගේ", "එහි",
    "එන්න","ඕක", "ඉහළ", "ඔබේ", "අය", "ඔව්",
    "අද", "ඉතා", "ඉතාම", "ඉහත", "අවශ්‍ය", "ඔයාලට"
]

In [1151]:
def preprocess_sinhala_text(text):
    text = str(text) # Ensure the input is a string
    text = re.sub(r'[^\u0D80-\u0DFF\s]', '', text) # Remove non-Sinhala characters except spaces
    text = text.strip() # Remove leading and trailing spaces
    tokens = word_tokenize(text) # Tokenize the text into words
    tokens = [word for word in tokens if word not in sinhala_stop_words] # Remove stop words
    return " ".join(tokens) # Join tokens back into a cleaned string

In [1152]:
# Download tokenizer models
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1153]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [1154]:
# Apply text preprocessing to each row in the 'content' column
news_df['content'] = news_df['content'].apply(preprocess_sinhala_text)

In [1155]:
tfidf_vectorizer = TfidfVectorizer() # Create a TF-IDF vectorizer instance
tfidf_matrix = tfidf_vectorizer.fit_transform(news_df['content']) # Convert the 'content' column into a TF-IDF matrix

In [1156]:
# Compute cosine similarity between all text entries
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) # Measures how similar each text entry in news_df['content'] is to every other entry

In [1157]:
# Create a DataFrame to store cosine similarity scores with 'id' as index and columns
cosine_sim_df = pd.DataFrame(cosine_sim, index=news_df['id'], columns=news_df['id'])

In [1158]:
with open('cbf_model_cosine_sim.pkl', 'wb') as f: # Open a file in write-binary mode to save the cosine similarity model
    pickle.dump(cosine_sim, f) # Serialize and save the cosine similarity matrix to the file

In [1159]:
with open('cbf_model_tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)  # Serialize and save the TF-IDF vectorizer

In [1160]:
# Function to recommend articles based on content similarity
def recommend_content_based(news_id, cosine_sim=cosine_sim, top_n=5):
    if news_id not in news_df['id'].values:  # Check if the provided news_id exists in the 'id' column of the DataFrame
        return []  # Return an empty list if the news_id does not exist

    idx = news_df[news_df['id'] == news_id].index[0]  # Get the index of the article with the given news_id
    sim_scores = list(enumerate(cosine_sim[idx]))  # Retrieve the cosine similarity scores for the article at the given index
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Sort the similarity scores in descending order
    sim_scores = sim_scores[1:top_n+1]  # Get the top N most similar articles (excluding the article itself)
    recommended_indices = [x[0] for x in sim_scores]  # Extract the indices of the most similar articles

    return news_df.iloc[recommended_indices]['id'].tolist()  # Return the 'id' of the recommended articles as a list

In [1161]:
import ast  # To safely evaluate the string as a literal

def recommend_for_user(user_id, top_n=5):
    # Fetch user preferences (interests) from rec_users dataset
    user_preferences_str = rec_users_df.loc[rec_users_df['user_id'] == user_id, 'labels'].values

    if len(user_preferences_str) == 0:
        print(f"No preferences found for user {user_id}")
        return pd.DataFrame()  # Return an empty DataFrame if no preferences are found

    # Convert string representation of list to an actual list
    user_preferences = ast.literal_eval(user_preferences_str[0])  # Safely evaluate the string into a list

    if not user_preferences:  # Handle case where the list is empty
        print(f"User {user_id} has no preferences.")
        return pd.DataFrame()  # Return empty if no preferences are available

    # Debugging: Check user preferences
    print(f"User {user_id} preferences: {user_preferences}")

    # Filter news articles based on these preferences
    recommended_articles = news_df[news_df['content'].apply(lambda x: any(label in x for label in user_preferences))]

    if recommended_articles.empty:
        print(f"No articles match the preferences for user {user_id}")
        return pd.DataFrame()  # Return empty if no match is found

    # Sort the recommended articles based on relevance
    recommended_articles = recommended_articles.sort_values(by='published_date', ascending=False).head(top_n)

    # Debugging: Check the recommended articles for the user
    print(f"Recommended articles for user {user_id}:")

    # Return the recommended article details (ID, title)
    return recommended_articles[['id', 'title']]

In [1162]:
user_id = 2329
recommended_articles_for_user = recommend_for_user(user_id, top_n=5)
print(recommended_articles_for_user)

User 2329 preferences: ['විදේශීය පුවත්', 'ගොසිප්', 'ක්\u200dරීඩා ', 'දේශීය පුවත්', 'දේශපාලන', 'විශේෂාංග', 'විද්\u200dයාත්මක', 'තාක්ෂණික', 'විශේෂ පුවත්', 'පාරිසරික', 'කාටූන්', 'කලා', 'ආර්ථික']
Recommended articles for user 2329:
       id                                              title
49  23947                   DeepSeek දැන් Huawei දුරකතන වලත්
15  23981     අඛණ්ඩ විදුලි සැපයුම සඳහා සජිත් ස්විස් සහය පතයි
51  23945                මොනවද මේ රත්තරන් ව​ල කැරට් කියන්නේ?
16  23980  එලොන් මස්ක් සිය එක්ස් පුතා වඩාගෙන ට්‍රම්ප්ගේ ඔ...
4   23992    නීති විරෝධී ධීවර දැල් දෙසීය පනහක් නීතියේ රැහැනට


In [1163]:
user_id_1 = 2045
recommended_articles_for_user = recommend_for_user(user_id_1, top_n=5)
print(recommended_articles_for_user)

User 2045 preferences: ['විදේශීය පුවත්', 'ක්\u200dරීඩා ', 'දේශීය පුවත්', 'දේශපාලන', 'විශේෂ පුවත්']
Recommended articles for user 2045:
       id                                              title
16  23980  එලොන් මස්ක් සිය එක්ස් පුතා වඩාගෙන ට්‍රම්ප්ගේ ඔ...
1   23995  ‘ක්ලීන් ශ‍්‍රී ලංකා ලිඛිතව තියෙනවා.’ මාලිමා මන...
55  23941  ජනපති ලෝක නායකයන් ඇමතූ දේශණයේ ‘ඉන්දීය මහද්වීපය...
38  23958   නැගෙනහිර බහලු පර්යන්තයේ මාර්ග සැලැස්ම වෙනස් කරලා
31  23965         යාපනයේ තිස්ස විහාරය ඉවත් කරන්නැයි විරෝධතා.


In [1164]:
user_id = 2033
recommended_articles_for_user = recommend_for_user(user_id, top_n=5)
print(recommended_articles_for_user)

User 2033 has no preferences.
Empty DataFrame
Columns: []
Index: []


## Collaborative Filtering

In [1165]:
# For Building recommender systems
!pip install scikit-surprise



In [1166]:
import pandas as pd # For handling data in DataFrame format
import numpy as np # For numerical operations and array handling
import pickle # For serializing and saving Python objects
from sklearn.neighbors import NearestNeighbors # For building nearest neighbor models
from surprise import SVD, Dataset, Reader # (Singular Value Decomposition) Dataset/Reader classes from Surprise for collaborative filtering-based recommender systems
from surprise.model_selection import train_test_split # For splitting data into training and test sets
import datetime # For working with dates and times (Helpful for timestamping data or filtering by dates)

In [1167]:
rec_feedback_df = pd.read_csv('rec_feedback.csv')

In [1168]:
rec_feedback_df.head() # Display the first 5 rows

Unnamed: 0,id,feedback_type,time_stamp,comment,user_id,item_id,created_at,updated_at,created_by_id,updated_by_id
0,49554,click,2025-02-13 10:03:47.440412,,1182,23942,,,,
1,49553,read,2025-02-13 10:03:42.411734,,1182,23942,,,,
2,49552,click,2025-02-13 10:01:54.947236,,1058,23942,,,,
3,49550,click,2025-02-13 10:01:31.955569,,1058,23930,,,,
4,49549,click,2025-02-13 10:01:10.809916,,1058,23941,,,,


In [1169]:
# Display the data types of each column
rec_feedback_df.dtypes

Unnamed: 0,0
id,int64
feedback_type,object
time_stamp,object
comment,float64
user_id,int64
item_id,int64
created_at,float64
updated_at,float64
created_by_id,float64
updated_by_id,float64


In [1170]:
# Print the number of missing (null) values
print(rec_feedback_df.isnull().sum())

id                  0
feedback_type       0
time_stamp          0
comment          1000
user_id             0
item_id             0
created_at       1000
updated_at       1000
created_by_id    1000
updated_by_id    1000
dtype: int64


In [1171]:
# Drop unnecessary columns from the DataFrame
rec_feedback_df.drop(columns=["comment", "created_at", "updated_at", "created_by_id", "updated_by_id"], inplace=True)

In [1172]:
print(rec_feedback_df.isnull().sum())

id               0
feedback_type    0
time_stamp       0
user_id          0
item_id          0
dtype: int64


In [1173]:
interaction_weights = {'click': 1, 'read': 2, 'like': 3} # Define a dictionary mapping interaction types to numerical weights
rec_feedback_df['weight'] = rec_feedback_df['feedback_type'].map(interaction_weights) # Map the interaction type to its corresponding weight and create a new 'weight' column

In [1174]:
# Convert 'time_stamp' column to datetime format
rec_feedback_df["time_stamp"] = pd.to_datetime(rec_feedback_df["time_stamp"], errors='coerce')

# Handle missing values if necessary (drop rows with NaN in the 'time_stamp' column)
rec_feedback_df = rec_feedback_df.dropna(subset=["time_stamp"])

In [1175]:
from datetime import datetime

current_time = datetime.now() # Get the current date and time as a datetime object
print(current_time)

2025-02-24 11:14:40.003760


In [1176]:
current_time = datetime.now() # Get the current date and time
time_decay_factor = 0.9 # Set the time decay factor, which determines how quickly the weight decays over time
rec_feedback_df['time_weight'] = rec_feedback_df['time_stamp'].apply(
    lambda x: time_decay_factor ** ((current_time - x).days)
) # Apply a time decay to each timestamp (Useful for give more importance to recent interactions)

In [1177]:
# Calculate the final weight by multiplying the interaction weight with the time decay weight
rec_feedback_df['final_weight'] = rec_feedback_df['weight'] * rec_feedback_df['time_weight']

print(rec_feedback_df[['weight', 'time_weight', 'final_weight']])

     weight  time_weight  final_weight
0       1.0     0.313811      0.313811
1       2.0     0.313811      0.627621
2       1.0     0.313811      0.313811
3       1.0     0.313811      0.313811
4       1.0     0.313811      0.313811
..      ...          ...           ...
995     1.0     0.228768      0.228768
996     1.0     0.228768      0.228768
997     2.0     0.228768      0.457536
998     1.0     0.228768      0.228768
999     1.0     0.228768      0.228768

[1000 rows x 3 columns]


In [1178]:
from surprise import accuracy

# Ensure there are no NaN values in final_weight
rec_feedback_df['final_weight'] = rec_feedback_df['final_weight'].fillna(0)

# Check the number of zero weights and inspect if needed
print(f"Zero weights count: {(rec_feedback_df['final_weight'] == 0).sum()}")

# Check if any weights are within the expected range
print(f"Max final_weight: {rec_feedback_df['final_weight'].max()}, Min final_weight: {rec_feedback_df['final_weight'].min()}")

# Re-load data and create train-test split
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(rec_feedback_df[['user_id', 'item_id', 'final_weight']], reader)

# Split the data into train and test
trainset, testset = train_test_split(data, test_size=0.2)

# Train the SVD model
svd = SVD()
svd.fit(trainset)

# Evaluate model performance on the test set
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# Output the evaluation metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

Zero weights count: 1
Max final_weight: 0.9414317882700002, Min final_weight: 0.0
RMSE: 0.1494
MAE:  0.1252
RMSE: 0.14940331267371362
MAE: 0.12515617241399274


In [1179]:
# Save the trained model to a file using pickle
with open('collaborative_model.pkl', 'wb') as f:
    pickle.dump(svd, f)

In [1180]:
# Load the trained model
with open('collaborative_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [1181]:
user_id = 1058
item_id = 23942

# Predict the rating for the given user-item pair
predicted_rating = loaded_model.predict(user_id, item_id).est
print(f"Predicted rating for user {user_id} on item {item_id}: {predicted_rating}")

Predicted rating for user 1058 on item 23942: 0.3381088522413136


In [1182]:
user_id = 2329
item_id = 23901

# Predict the rating for the given user-item pair
predicted_rating = loaded_model.predict(user_id, item_id).est
print(f"Predicted rating for user {user_id} on item {item_id}: {predicted_rating}")

Predicted rating for user 2329 on item 23901: 0.42865543141437823


In [1197]:
# Get all item IDs (make sure these correspond to the items in your dataset)
all_item_ids = rec_feedback_df['item_id'].unique()

# Get the top N predictions for user 2329
def recommend_top_n(user_id, top_n=5):
    predictions = []
    for item_id in all_item_ids:
        predicted_rating = loaded_model.predict(user_id, item_id).est
        predictions.append((item_id, predicted_rating))

    # Sort the predictions by predicted rating in descending order
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N recommended items
    return predictions[:top_n]

# Get the top 5 recommended items for user 2329
top_n_recommendations = recommend_top_n(2329, top_n=5)
for idx, (item_id, predicted_rating) in enumerate(top_n_recommendations):
    print(f"Rank {idx+1}: Item ID: {item_id}, Predicted Rating: {predicted_rating:.2f}")

Rank 1: Item ID: 23340, Predicted Rating: 0.59
Rank 2: Item ID: 23940, Predicted Rating: 0.57
Rank 3: Item ID: 22904, Predicted Rating: 0.55
Rank 4: Item ID: 23764, Predicted Rating: 0.54
Rank 5: Item ID: 22641, Predicted Rating: 0.54


## Hybrid Filtering

In [1183]:
import random # For generating random numbers or making random selections
import numpy as np # For numerical operations, such as arrays and mathematical functions
import pickle # For serializing and deserializing Python objects
import pandas as pd # For data manipulation and analysis, especially with DataFrames
from surprise import SVD, Dataset, Reader # For building recommendation systems
from sklearn.preprocessing import MinMaxScaler # To scale features to a specified range, typically [0, 1]
from datetime import datetime, timedelta #For handling date and time operations

In [1184]:
with open('cbf_model_cosine_sim.pkl', 'rb') as f:
    cosine_sim = pickle.load(f)

with open('cbf_model_tfidf.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

with open('collaborative_model.pkl', 'rb') as f:
    knn = pickle.load(f)

In [1185]:
def get_recent_trending_news(rec_feedback_df, top_n=5, days=7):
    """Fetch trending articles based on recent engagement (last 'days')."""

    # Check if the 'timestamp' column exists in the DataFrame
    if "timestamp" not in rec_feedback_df.columns:
        print("No timestamp column found. Falling back to general trending news.")
        return rec_feedback_df["item_id"].value_counts().head(top_n).index.tolist()

    # Convert the 'timestamp' column to datetime format. Invalid timestamps are set to NaT (Not a Time)
    rec_feedback_df["timestamp"] = pd.to_datetime(rec_feedback_df["timestamp"], errors='coerce')

    # Check for invalid timestamps
    if rec_feedback_df["timestamp"].isnull().all():
        print("All timestamps are invalid. Falling back to general trending news.")
        return rec_feedback_df["item_id"].value_counts().head(top_n).index.tolist()

    # Calculate the date for 'days' ago from the current date
    recent_date = datetime.now() - timedelta(days=days)

    # Filter the DataFrame to include only rows where the 'timestamp' is within the last 'days' period
    recent_engagements = rec_feedback_df[rec_feedback_df["timestamp"] >= recent_date]

    if recent_engagements.empty:
        print("No recent engagements found. Falling back to general trending news.")
        return rec_feedback_df["item_id"].value_counts().head(top_n).index.tolist()

    # Get the top N most frequent items from the recent engagements
    trending = (
        recent_engagements["item_id"]
        .value_counts()
        .head(top_n)
        .index.tolist()
    )

    # Return the trending items if found, otherwise return an empty list
    return trending if trending else []

In [1186]:
print(rec_feedback_df["time_stamp"].head())

0   2025-02-13 10:03:47.440412
1   2025-02-13 10:03:42.411734
2   2025-02-13 10:01:54.947236
3   2025-02-13 10:01:31.955569
4   2025-02-13 10:01:10.809916
Name: time_stamp, dtype: datetime64[ns]


In [1187]:
# Defines a function to calculate a dynamic "alpha" value based on a user's interaction history in rec_feedback_df
def calculate_user_alpha(user_id, rec_feedback_df):


    # Filters the DataFrame to get all rows where the 'user_id' matches the given user_id, representing all the user's interactions
    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]

    # Counts the total number of interactions (rows) for that user in the DataFrame
    total_interactions = len(user_interactions)

    if total_interactions == 0:
        return 0.5  # Returns a neutral alpha value (0.5) if no interactions are found

    # If the user has interactions, calculates the alpha as the ratio of interactions to 100, constrained between 0 and 1
    return min(1, max(0, total_interactions / 100))

In [1188]:
def max_marginal_relevance(recommendations, hybrid_scores, cosine_sim, top_n=5):
    selected_items = []
    for item in recommendations:
        # Penalize items that are too similar to those already selected
        similarity_score = sum(cosine_sim.get(item, {}).get(other_item, 0) for other_item in selected_items)
        adjusted_score = hybrid_scores[item] - similarity_score * 0.7  # Increase the penalty to 0.7
        hybrid_scores[item] = adjusted_score

    # Sort by adjusted hybrid score and return the top N items
    return sorted(hybrid_scores.keys(), key=lambda x: hybrid_scores[x], reverse=True)[:top_n]

In [1189]:
# Defines a function to normalize the scores to a range of 0-1 for fair weighting
def normalize_scores(scores):

    if not scores:
        return {} # If there are no scores, returns an empty dictionary

    # Converts the values of the 'scores' dictionary to a NumPy array and reshapes it into a column vector (for scaling)
    values = np.array(list(scores.values())).reshape(-1, 1)

    # Creates an instance of the MinMaxScaler, which scales values to the range [0, 1]
    scaler = MinMaxScaler()

    # Fits the scaler to the values and transforms them to the range [0, 1]
    normalized_values = scaler.fit_transform(values).flatten()

    # Returns a dictionary where each original score is mapped to its normalized value
    return {key: norm_score for key, norm_score in zip(scores.keys(), normalized_values)}

In [1190]:
 # Defines a function to get the top N content-based recommendations using cosine similarity
def recommend_content_based(news_id, top_n=5):

    # Checks if the given news_id exists in the cosine_sim dictionary
    if news_id not in cosine_sim:
        return [] # If the news_id is not found, return an empty list

    # Creates a list of tuples, where each tuple contains an index (item ID) and its cosine similarity value to the given news_id
    similar_items = list(enumerate(cosine_sim[news_id]))

    # Sorts the similar items by their cosine similarity value in descending order (most similar first)
    sorted_items = sorted(similar_items, key=lambda x: x[1], reverse=True)

    # Returns a list of the top N most similar item IDs, based on the sorted cosine similarity values
    return [item[0] for item in sorted_items[:top_n]]

In [1191]:
# Defines a function to get the top N collaborative filtering recommendations using Singular Value Decomposition (SVD)
def recommend_collaborative(user_id, model, rec_feedback_df, top_n=5):

     # Checks if the user_id exists in the 'user_id' column of the rec_feedback_df DataFrame
    if user_id not in rec_feedback_df['user_id'].values:
        return []

    # Retrieves all unique item IDs from the 'item_id' column in the rec_feedback_df DataFrame
    all_items = rec_feedback_df['item_id'].unique()

    # Uses the SVD to predict the user's rating for each item
    predictions = {item: model.predict(user_id, item).est for item in all_items}

    # Sorts the items based on the predicted rating (est) in descending order (highest predicted rating first)
    # Returns the top N items with the highest predictions
    return sorted(predictions, key=predictions.get, reverse=True)[:top_n]

In [1192]:
def recommend_hybrid(user_id, news_id, rec_feedback_df, model=svd, top_n=5):
    """Hybrid recommendation system combining content-based and collaborative filtering."""

    # Filter the DataFrame for user-specific interactions
    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]

    # If the user has no interactions, trigger fallback to trending news
    if user_interactions.empty:
        print(f"No data for user {user_id}. Showing recent trending news...")
        return get_recent_trending_news(rec_feedback_df, top_n=top_n) or random.sample(list(rec_feedback_df["item_id"].unique()), top_n)

    # Calculate dynamic alpha based on the user's interaction history
    alpha = calculate_user_alpha(user_id, rec_feedback_df)

    # Fetch content-based recommendations (CBF) for the given news_id
    cbf_recommendations = recommend_content_based(news_id, top_n=top_n)

    # Fetch collaborative filtering recommendations (CF) for the given user
    cf_recommendations = recommend_collaborative(user_id, model, rec_feedback_df, top_n=top_n)

    # Assign cosine similarity scores for the CBF recommendations
    cbf_scores = {item: cosine_sim[news_id][item] for item in cbf_recommendations}

    # Assign predicted scores for the CF recommendations
    cf_scores = {item: model.predict(user_id, item).est for item in cf_recommendations}

    # Normalize both CBF and CF scores to a range of [0, 1]
    cbf_scores = normalize_scores(cbf_scores)
    cf_scores = normalize_scores(cf_scores)

    # Dictionary to store hybrid scores (CBF + CF)
    hybrid_scores = {}

    # Combine the scores from both filtering approaches using the alpha value
    for item in set(cbf_recommendations + cf_recommendations):
        cbf_score = cbf_scores.get(item, 0)  # Default to 0 if no CBF score
        cf_score = cf_scores.get(item, 0)  # Default to 0 if no CF score
        hybrid_scores[item] = alpha * cbf_score + (1 - alpha) * cf_score

    # Apply Maximal Marginal Relevance (MMR) to diversify the recommendations
    top_recommendations = max_marginal_relevance(list(hybrid_scores.keys()), hybrid_scores, cosine_sim, top_n=top_n)

    # If no recommendations were found, trigger fallback to trending news
    if not top_recommendations:
        print(f"No recommendations found for user {user_id}. Showing recent trending news...")
        return get_recent_trending_news(rec_feedback_df, top_n=top_n) or random.sample(list(rec_feedback_df["item_id"].unique()), top_n)

    # Return the top N hybrid recommendations
    return top_recommendations

In [1193]:
with open('hybrid_recommendation_model.pkl', 'wb') as f:
    pickle.dump(svd, f)

In [1194]:
user_id_1 = 2329
news_id_1 = 20100

recommendations_1 = recommend_hybrid(user_id_1, news_id_1, rec_feedback_df, top_n=5)
print(f"User {user_id_1} Recommendations:", recommendations_1)

User 2329 Recommendations: [23940, 23340, 22641, 23764, 22904]


In [1195]:
user_id_2 = 700
news_id_2 = 23770

recommendations_2 = recommend_hybrid(user_id_2, news_id_2, rec_feedback_df, top_n=5)
print(f"User {user_id_2} Recommendations:", recommendations_2)

User 700 Recommendations: [16519, 23377, 23849, 23959, 23469]


In [1196]:
# New user with no history
user_id_new = 9999
news_id_new = 23980
recommendations_new = recommend_hybrid(user_id_new, news_id_new, rec_feedback_df, top_n=5)
print(f"User {user_id_new} (no history) - Fallback Recommendations:", recommendations_new)

No data for user 9999. Showing recent trending news...
No timestamp column found. Falling back to general trending news.
User 9999 (no history) - Fallback Recommendations: [23749, 23769, 23820, 23930, 23786]
