In [341]:
# For Building recommender systems
!pip install scikit-surprise



In [342]:
import pandas as pd
from datetime import datetime
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
from surprise.accuracy import rmse
import joblib

In [343]:
news_df = pd.read_csv('/content/news.csv')
rec_items_df = pd.read_csv('/content/rec_items.csv')
rec_feedback_df = pd.read_csv('/content/rec_feedback.csv')
users_df = pd.read_csv('/content/users.csv')
rec_users_df = pd.read_csv('/content/rec_users.csv')

In [344]:
rec_feedback_df.head() # Display the first 5 rows

Unnamed: 0,id,feedback_type,time_stamp,comment,user_id,item_id,created_at,updated_at,created_by_id,updated_by_id
0,49554,click,2025-02-13 10:03:47.440412,,1182,23942,,,,
1,49553,read,2025-02-13 10:03:42.411734,,1182,23942,,,,
2,49552,click,2025-02-13 10:01:54.947236,,1058,23942,,,,
3,49550,click,2025-02-13 10:01:31.955569,,1058,23930,,,,
4,49549,click,2025-02-13 10:01:10.809916,,1058,23941,,,,


In [345]:
# Display the data types of each column
rec_feedback_df.dtypes

Unnamed: 0,0
id,int64
feedback_type,object
time_stamp,object
comment,float64
user_id,int64
item_id,int64
created_at,float64
updated_at,float64
created_by_id,float64
updated_by_id,float64


In [346]:
# Print the number of missing (null) values
print(rec_feedback_df.isnull().sum())

id                  0
feedback_type       0
time_stamp          0
comment          1000
user_id             0
item_id             0
created_at       1000
updated_at       1000
created_by_id    1000
updated_by_id    1000
dtype: int64


In [347]:
# Drop unnecessary columns from the DataFrame
rec_feedback_df.drop(columns=["comment", "created_at", "updated_at", "created_by_id", "updated_by_id"], inplace=True)

In [348]:
print(rec_feedback_df.isnull().sum())

id               0
feedback_type    0
time_stamp       0
user_id          0
item_id          0
dtype: int64


In [349]:
interaction_weights = {'click': 1, 'read': 2, 'like': 3} # Define a dictionary mapping interaction types to numerical weights
rec_feedback_df['weight'] = rec_feedback_df['feedback_type'].map(interaction_weights) # Map the interaction type to its corresponding weight and create a new 'weight' column

In [350]:
# Convert 'time_stamp' column to datetime format
rec_feedback_df["time_stamp"] = pd.to_datetime(rec_feedback_df["time_stamp"], errors='coerce')

In [351]:
# Handle missing values by dropping rows with NaN timestamps
rec_feedback_df.dropna(subset=["time_stamp"], inplace=True)

In [352]:
from datetime import datetime

current_time = datetime.now() # Get the current date and time as a datetime object
print(current_time)

2025-02-25 11:05:53.486589


In [353]:
# Apply time decay factor
current_time = datetime.now()
time_decay_factor = 0.9
rec_feedback_df['time_weight'] = rec_feedback_df['time_stamp'].apply(
    lambda x: time_decay_factor ** ((current_time - x).days)
)

In [354]:
# Calculate the final weight by multiplying the interaction weight with the time decay weight
rec_feedback_df['final_weight'] = rec_feedback_df['weight'] * rec_feedback_df['time_weight']

print(rec_feedback_df[['weight', 'time_weight', 'final_weight']])

     weight  time_weight  final_weight
0       1.0     0.282430      0.282430
1       2.0     0.282430      0.564859
2       1.0     0.282430      0.282430
3       1.0     0.282430      0.282430
4       1.0     0.282430      0.282430
..      ...          ...           ...
995     1.0     0.205891      0.205891
996     1.0     0.205891      0.205891
997     2.0     0.205891      0.411782
998     1.0     0.205891      0.205891
999     1.0     0.205891      0.205891

[1000 rows x 3 columns]


In [355]:
# Prepare data for Surprise library
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rec_feedback_df[['user_id', 'item_id', 'final_weight']], reader)

In [356]:
# Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [357]:
# Train an SVD model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e288b17e990>

In [358]:
# Function to get top-N recommendations ensuring unseen items only
def get_top_n_recommendations(model, user_id, n=5):
    unique_items = set(rec_feedback_df['item_id'].unique())
    user_items = set(rec_feedback_df[rec_feedback_df['user_id'] == user_id]['item_id'].tolist())
    unseen_items = unique_items - user_items  # Exclude already interacted items

    # If the user is new or has interacted with all items, return popular items
    if not unseen_items:
        popular_items = rec_feedback_df.groupby('item_id')['weight'].sum().sort_values(ascending=False).index[:n].tolist()
        return user_items, unseen_items, popular_items

    # Predict ratings for unseen items only
    predictions = [(item, model.predict(user_id, item).est) for item in unseen_items]

    # Sort by estimated rating and return top-N
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    recommended_items = [item for item, _ in top_n]

    return user_items, unseen_items, recommended_items

In [359]:
# Save the model using joblib
joblib.dump(model, 'svd_recommender_model.pkl')

['svd_recommender_model.pkl']

In [360]:
# Load the model from the saved file
loaded_model = joblib.load('svd_recommender_model.pkl')

In [361]:
target_user_id = 2329
seen_items, unseen_items, recommended_items = get_top_n_recommendations(model, target_user_id, n=5)
print("Seen items for user", target_user_id, ":", seen_items)
print("Unseen items for user", target_user_id, ":", unseen_items)
print("Recommended items for user", target_user_id, ":", recommended_items)

Seen items for user 2329 : {22528, 23042, 23557, 23559, 23572, 22551, 23580, 16418, 23590, 23595, 15921, 23601, 23094, 21564, 23103, 23625, 23628, 22604, 23124, 23637, 16983, 23640, 23647, 23650, 23660, 23152, 23680, 23681, 21634, 20100, 18566, 20102, 21638, 20628, 22167, 22683, 21687, 19652, 19663, 21726, 23263, 23776, 16609, 23777, 23784, 23785, 20717, 20719, 23291, 23803, 21757, 22784, 20224, 21250, 23811, 22788, 23812, 23835, 20252, 19744, 23844, 23849, 23850, 16686, 23350, 16697, 23360, 16721, 18771, 16726, 23901, 23912, 23916, 22894, 22895, 22393, 20349, 22398, 22401, 22927, 22928, 22929, 15257, 23452, 23454, 23461, 23467, 23468, 23473, 21948, 23501, 22999, 23519, 23522, 23524, 23530, 16888, 20473, 22526, 22527}
Unseen items for user 2329 : {23565, 16432, 20016, 23603, 23609, 22090, 23118, 22612, 23639, 22618, 18530, 23140, 23652, 23653, 17513, 22307, 23149, 23150, 23662, 23664, 22641, 17521, 23666, 23668, 23679, 16519, 23689, 23692, 23693, 23695, 17557, 23702, 23708, 23711, 2371

In [371]:
target_user_id = 704
seen_items, unseen_items, recommended_items = get_top_n_recommendations(model, target_user_id, n=5)
print("Seen items for user", target_user_id, ":", seen_items)
print("Unseen items for user", target_user_id, ":", unseen_items)
print("Recommended items for user", target_user_id, ":", recommended_items)

Seen items for user 704 : {23930, 23796}
Unseen items for user 704 : {22528, 23557, 23559, 23565, 23572, 22551, 23580, 16418, 23590, 23595, 16432, 23601, 23603, 23609, 21564, 23625, 22604, 23628, 22612, 23637, 23639, 23640, 22618, 23647, 18530, 23650, 23652, 23653, 17513, 23660, 23662, 23664, 22641, 17521, 23666, 23668, 23679, 23680, 23681, 21634, 18566, 16519, 21638, 23689, 23692, 23693, 23695, 20628, 17557, 23702, 22683, 23708, 23711, 23712, 23713, 23715, 23718, 18599, 23721, 23722, 23723, 23732, 23733, 21687, 23735, 17596, 23740, 23742, 23744, 23745, 23747, 17604, 23749, 19652, 23750, 21705, 23754, 23753, 23756, 22733, 23759, 23760, 23761, 19663, 23763, 23764, 23766, 22743, 23767, 23769, 23770, 23771, 23774, 21726, 23776, 16609, 23777, 23778, 23780, 23781, 23782, 23783, 23784, 23785, 23786, 16620, 20717, 23788, 20719, 23789, 23791, 23794, 23795, 23797, 23799, 23800, 23801, 23803, 23804, 19709, 23805, 23806, 22784, 23808, 21757, 23810, 22788, 23811, 23812, 23813, 21762, 21766, 23816,