In [1]:
import pandas as pd

# Load CSV files
viewed_posts = pd.read_csv("viewed_posts.csv")
liked_posts = pd.read_csv("liked_posts.csv")
inspired_posts = pd.read_csv("inspired_posts.csv")
rated_posts = pd.read_csv("rated_posts.csv")
all_posts = pd.read_csv("all_posts.csv")
all_users = pd.read_csv("all_users.csv")



In [2]:

# Inspect data
print(viewed_posts.info())
print(liked_posts.info())
print(rated_posts.info())
print(viewed_posts.info())
print(inspired_posts.info())
print(all_users.info())
print(all_posts.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6380 entries, 0 to 6379
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         6380 non-null   int64 
 1   post_id    6380 non-null   int64 
 2   user_id    6380 non-null   int64 
 3   viewed_at  6380 non-null   object
dtypes: int64(3), object(1)
memory usage: 199.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1259 non-null   int64 
 1   post_id   1259 non-null   int64 
 2   user_id   1259 non-null   int64 
 3   liked_at  1259 non-null   object
dtypes: int64(3), object(1)
memory usage: 39.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2877 entries, 0 to 2876
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id            

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert date columns to datetime
viewed_posts['viewed_at'] = pd.to_datetime(viewed_posts['viewed_at'])
liked_posts['liked_at'] = pd.to_datetime(liked_posts['liked_at'])
rated_posts['rated_at'] = pd.to_datetime(rated_posts['rated_at'])
inspired_posts['inspired_at'] = pd.to_datetime(inspired_posts['inspired_at'])

# Combine interaction datasets
interactions = pd.concat([
    viewed_posts.assign(interaction_type='viewed'),
    liked_posts.assign(interaction_type='liked'),
    rated_posts.assign(interaction_type='rated'),
    inspired_posts.assign(interaction_type='inspired')
])

# Merge with post metadata
interactions = interactions.merge(all_posts, left_on='post_id', right_on='id', how='left')

# Drop irrelevant columns
interactions = interactions[['user_id', 'post_id', 'interaction_type', 'rating_percent', 'average_rating', 'category']]

# Handle missing values
interactions['rating_percent'].fillna(0, inplace=True)
interactions['average_rating'].fillna(interactions['average_rating'].mean(), inplace=True)

# Normalize ratings
interactions['normalized_rating'] = interactions['rating_percent'] / 100


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  interactions['rating_percent'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  interactions['average_rating'].fillna(interactions['average_rating'].mean(), inplace=True)


In [4]:
# Combine relevant text features for posts
all_posts['text_features'] = all_posts['title'].fillna('') + " " + all_posts['category']

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(all_posts['text_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a reverse lookup for post IDs
post_indices = pd.Series(all_posts.index, index=all_posts['id'])


In [5]:
def recommend_posts(post_id, num_recommendations=5):
    # Get index of the given post ID
    idx = post_indices[post_id]

    # Fetch similarity scores for the post
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Fetch top N similar posts
    sim_indices = [i[0] for i in sim_scores[1:num_recommendations+1]]
    return all_posts.iloc[sim_indices][['id', 'title', 'category']]


In [6]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Prepare data for collaborative filtering
reader = Reader(rating_scale=(0, 1))  # Assuming normalized ratings
cf_data = Dataset.load_from_df(interactions[['user_id', 'post_id', 'normalized_rating']], reader)

# Build collaborative filtering model
algo = SVD()
cross_validate(algo, cf_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1757  0.1714  0.1756  0.1774  0.1778  0.1756  0.0023  
MAE (testset)     0.1116  0.1122  0.1135  0.1123  0.1129  0.1125  0.0007  
Fit time          0.12    0.12    0.13    0.13    0.14    0.13    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([0.17573661, 0.1714018 , 0.1756331 , 0.1773955 , 0.17775882]),
 'test_mae': array([0.11161238, 0.11222905, 0.11353719, 0.11234276, 0.11294038]),
 'fit_time': (0.12497878074645996,
  0.12499523162841797,
  0.12501907348632812,
  0.12500977516174316,
  0.13549566268920898),
 'test_time': (0.023152589797973633,
  0.01563286781311035,
  0.01562976837158203,
  0.015634775161743164,
  0.015624046325683594)}

In [10]:
from surprise import Dataset, Reader, SVD, KNNBasic, NMF
from surprise.model_selection import GridSearchCV, cross_validate
import pandas as pd

# Prepare data for collaborative filtering
reader = Reader(rating_scale=(0, 1))  # Assuming normalized ratings
cf_data = Dataset.load_from_df(interactions[['user_id', 'post_id', 'normalized_rating']], reader)

# --- 1. Hyperparameter Tuning for SVD ---
# Define parameter grid for tuning SVD hyperparameters
param_grid = {
    'n_factors': [50, 100, 150],  # Number of latent factors
    'n_epochs': [5, 10, 20],  # Number of epochs for training
    'lr_all': [0.002, 0.005, 0.01],  # Learning rate
    'reg_all': [0.2, 0.4, 0.6]  # Regularization term
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(SVD, param_grid, measures=['RMSE', 'MAE'], cv=5)
grid_search.fit(cf_data)

# Correctly access the best parameters and scores
print("Best Parameters for SVD (RMSE):", grid_search.best_params['rmse'])
print("Best RMSE for SVD:", grid_search.best_score['rmse'])

print("Best Parameters for SVD (MAE):", grid_search.best_params['mae'])
print("Best MAE for SVD:", grid_search.best_score['mae'])

# --- 2. Model Comparison ---
# Define models for comparison
models = {
    'SVD': SVD(n_factors=grid_search.best_params['rmse']['n_factors'], 
               n_epochs=grid_search.best_params['rmse']['n_epochs'], 
               lr_all=grid_search.best_params['rmse']['lr_all'], 
               reg_all=grid_search.best_params['rmse']['reg_all']),
    'KNNBasic': KNNBasic(),
    'NMF': NMF()
}

# Evaluate each model using cross-validation
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    results = cross_validate(model, cf_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    print(f"Mean RMSE for {model_name}: {results['test_rmse'].mean()}")
    print(f"Mean MAE for {model_name}: {results['test_mae'].mean()}")


Best Parameters for SVD (RMSE): {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}
Best RMSE for SVD: 0.1633905542316305
Best Parameters for SVD (MAE): {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.2}
Best MAE for SVD: 0.10031156183482823

Evaluating SVD...
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1696  0.1683  0.1573  0.1581  0.1659  0.1639  0.0052  
MAE (testset)     0.1040  0.1026  0.0992  0.0995  0.1025  0.1016  0.0019  
Fit time          0.08    0.06    0.09    0.08    0.09    0.08    0.01    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Mean RMSE for SVD: 0.1638516434476464
Mean MAE for SVD: 0.1015851116143042

Evaluating KNNBasic...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix

- Best RMSE: SVD (0.1637) outperformed both KNNBasic (0.1835) and NMF (0.1730).
- Best MAE: NMF (0.0884) outperformed SVD (0.1015) and KNNBasic (0.1109).
- SVD has a good balance of RMSE and MAE, making it a strong choice for recommendation systems

# Mean RMSE for SVD: 0.1636347943787098
# Mean MAE for SVD: 0.10135708789520166