In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
import pickle

### Load the Data

In [2]:

# Directory holding the pre-processed CSV files (relative to the notebook).
processed_path = "../data/processed/"

# Read the four processed tables in one pass; the file names are listed in
# the same order as the variables they are unpacked into.
interactions_train, interactions_test, user_features, video_metadata = (
    pd.read_csv(os.path.join(processed_path, csv_name))
    for csv_name in (
        "interactions_train.csv",
        "interactions_test.csv",
        "user_features_engineered.csv",
        "video_metadata.csv",
    )
)


### Collaborative Filtering using Matrix Factorization

In [3]:
# Build the dense user x video matrix of average watch ratios. (user, video)
# pairs that do not occur in the training interactions are filled with 0.
user_item_matrix = interactions_train.pivot(index='user_id', columns='video_id', values='user_video_avg_watch_ratio').fillna(0)

# Work on the raw numpy values for the factorisation.
R = user_item_matrix.values

# Centre each user's row on that user's mean. The mean is taken over every
# column of the row, i.e. it includes the zero-filled entries.
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Truncated SVD. svds (default ARPACK solver) requires k < min(n_users, n_videos),
# so cap the requested 50 latent factors instead of failing on small matrices.
n_latent_factors = min(50, min(R_demeaned.shape) - 1)
U, sigma, Vt = svds(R_demeaned, k=n_latent_factors)

# svds returns the singular values as a 1-D array; turn it into a diagonal
# matrix so it can be used in the matrix product below.
sigma = np.diag(sigma)

# Low-rank reconstruction, with the per-user mean added back.
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# Label rows with user_id and columns with video_id. Without the explicit
# index the rows would be numbered 0..n-1 and the user ids would be lost.
predicted_ratings_df = pd.DataFrame(
    all_user_predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)


### Content-Based Filtering using TF-IDF and Cosine Similarity

In [None]:
# Create a TF-IDF vectorizer
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
# two or more characters. If 'feat' holds short ids (e.g. single digits),
# those are silently dropped from the vocabulary -- confirm against the data.
tfidf = TfidfVectorizer(stop_words='english')

# Assume 'feat' column contains the video tags/categories as text.
# Missing values become empty documents.
video_metadata['feat'] = video_metadata['feat'].fillna('')
print("Creating TF-IDF matrix...")
# fit_transform already returns a scipy sparse matrix, so no extra
# conversion is needed before densifying it for PCA below.
tfidf_matrix = tfidf.fit_transform(video_metadata['feat'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Apply PCA for dimensionality reduction.
# Keep at most 10 principal components, capped by the matrix dimensions so a
# small vocabulary (or a small sample) does not make PCA raise.
n_components = min(10, *tfidf_matrix.shape)
# Fixed random_state so the projection is reproducible when PCA picks a
# randomized solver.
pca = PCA(n_components=n_components, random_state=42)
tfidf_matrix = pca.fit_transform(tfidf_matrix.toarray())
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Compute the cosine similarity matrix
# NOTE(review): this builds a dense N x N float64 matrix (N*N*8 bytes). At
# roughly 343k videos that is about 880 GiB, which will not fit in memory on
# a typical machine. Consider a top-k nearest-neighbour index, or computing
# similarities per query in chunks, if this cell cannot complete.
print("Computing cosine similarity...")
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


Creating TF-IDF matrix...
TF-IDF matrix shape: (343341, 21)
TF-IDF matrix shape: (343341, 10)
Computing cosine similarity...


### Save the Models

In [None]:
# Save the collaborative filtering model.
# Besides the SVD factors and the reconstructed predictions, also persist
# what is needed to interpret them after loading:
#   - user_ratings_mean: the per-user means that were subtracted before the
#     SVD and must be added back to U @ sigma @ Vt;
#   - user_ids / video_ids: the row / column labels of the user-item matrix.
# These are extra keys only; the original keys are unchanged.
collab_model_path = os.path.join(processed_path, "collab_filtering_model.pkl")
with open(collab_model_path, 'wb') as f:
    pickle.dump(
        {
            'U': U,
            'sigma': sigma,
            'Vt': Vt,
            'predicted_ratings_df': predicted_ratings_df,
            'user_ratings_mean': user_ratings_mean,
            'user_ids': user_item_matrix.index.to_numpy(),
            'video_ids': user_item_matrix.columns.to_numpy(),
        },
        f,
    )

# Save the content-based filtering model.
# The fitted PCA is stored next to the vectorizer because cosine_sim was
# computed in the PCA-reduced space; without it, new text cannot be projected
# into that same space.
content_model_path = os.path.join(processed_path, "content_filtering_model.pkl")
with open(content_model_path, 'wb') as f:
    pickle.dump({'tfidf': tfidf, 'pca': pca, 'cosine_sim': cosine_sim}, f)

print("Models trained and saved successfully!")
