In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from src.models.content_handler_v1 import ContentHandler
from sklearn.decomposition import PCA
from src.pipeline.data_processor import DataProcessor, CleanedData
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the three cleaned tables (bookmarks, categories, shiurim) through a
# single DataProcessor, in that order.
dp = DataProcessor()
bookmarks_df, cat_df, shiur_df = (
    dp.load_table(table)
    for table in (CleanedData.BOOKMARKS, CleanedData.CATEGORIES, CleanedData.SHIURIM)
)

In [None]:
# Keep the bookmark rows that are either flagged as played (played == 1) or
# whose 'bookmark' value is 'queue'.
listened_mask = bookmarks_df['played'].eq(1) | bookmarks_df['bookmark'].eq('queue')
df_played = bookmarks_df.loc[listened_mask]
user_grouped = df_played.groupby('user')

# Distinct shiurim per user, followed by summary statistics of that count.
unique_listens_per_user = (
    df_played.groupby('user')['shiur']
    .nunique()
    .reset_index()
)
unique_listens_per_user['shiur'].describe()

In [None]:
unique_listens_per_user.columns = ['user', 'unique_listens']

# Tukey-style fences (1.5 x spread) over the per-user unique-listen counts.
# NOTE(review): the lower quantile is 0.00 (i.e. the minimum) rather than the
# usual 0.25, so "IQR" here is the min-to-Q3 spread -- confirm this is intended.
listen_counts = unique_listens_per_user['unique_listens']
Q1, Q3 = listen_counts.quantile(0.00), listen_counts.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
upper_bound

In [None]:
# Partition users at the upper fence: counts above it form the "top" group,
# counts at or below it form the "majority" group.
listen_count = unique_listens_per_user['unique_listens']
majority_listeners = unique_listens_per_user[listen_count.le(upper_bound)]
top_listeners = unique_listens_per_user[listen_count.gt(upper_bound)]

# Distinct user ids of each group.
majority_listeners_list, top_listeners_list = (
    group['user'].unique() for group in (majority_listeners, top_listeners)
)

In [None]:
# ContentHandler provides the user embeddings loaded here and the title
# vectors used by generate_shiur_embeddings below.
ch = ContentHandler()
# Later cells read the 'user' and 'embedding' columns of this frame.
user_embeddings_df = ch.get_user_embedding()


In [None]:
# Split the embedding table by listener group. .copy() gives each subset its
# own frame, so columns added later do not touch user_embeddings_df.
embedding_user = user_embeddings_df['user']
majority_user_embeddings_df = user_embeddings_df.loc[embedding_user.isin(majority_listeners_list)].copy()
top_users_embeddings_df = user_embeddings_df.loc[embedding_user.isin(top_listeners_list)].copy()

In [None]:
def generate_shiur_embeddings(shiur_df):
    """Add an 'embedding' column computed from each row's 'full_details'.

    Every 'full_details' value is passed to ``ch.get_title_vector`` (``ch`` is
    the notebook-level ContentHandler) and the results are stored, in row
    order, in a new 'embedding' column.

    Note:
        ``shiur_df`` is modified in place and the same object is returned.
    """
    shiur_df['embedding'] = [
        ch.get_title_vector(details) for details in shiur_df['full_details']
    ]
    return shiur_df

# generate_shiur_embeddings adds the 'embedding' column to shiur_df in place
# and returns that same frame, so all_shiur_embeddings and shiur_df are one object.
all_shiur_embeddings = generate_shiur_embeddings(shiur_df)

In [None]:
# Per-user mean of the 'duration' column over all of that user's bookmark rows.
average_length = (
    bookmarks_df.groupby('user')['duration']
    .mean()
    .reset_index()
    .set_axis(['user', 'average_length'], axis=1)
)

In [None]:
# Number of played rows (played == 1) per user.
played_rows = bookmarks_df.loc[bookmarks_df['played'].eq(1)]
total_listens = played_rows.groupby('user').size().reset_index(name='total_listens')

In [None]:
# Preview the first rows of the per-user played-row counts.
total_listens.head()

In [None]:
# Attach the category columns of every bookmarked shiur to its user, then
# discard the shiur id so only 'user' plus the category columns remain.
user_categories = (
    bookmarks_df[['user', 'shiur']]
    .merge(cat_df, on='shiur')
    .drop(columns=['shiur'])
)
# The per-user aggregation of these category columns is done in the next cell.

In [None]:
# Per-user totals of every category column; 'user' becomes the index.
category_preferences = user_categories.groupby('user').sum()

In [None]:
top_n = 5


def _top_category_names(user_row):
    """Return the column labels of the `top_n` largest values in `user_row`."""
    return user_row.nlargest(top_n).index.tolist()


# One list of category names per user; 'user' moves from the index to a column.
top_categories = category_preferences.apply(_top_category_names, axis=1).reset_index()
top_categories.columns = ['user', 'top_categories']

In [None]:
# One row per (user, top category) pair.
top_categories_exploded = top_categories.explode('top_categories')

# Frequency-encode the categories: each category is replaced by the number of
# exploded rows that carry it (a popularity count, not a one-hot vector).
category_counts = top_categories_exploded['top_categories'].value_counts().to_dict()
top_categories_exploded = top_categories_exploded.assign(
    top_categories_encoded=top_categories_exploded['top_categories'].map(category_counts)
)

# Collapse back to one row per user by averaging the encoded values.
top_category_features = (
    top_categories_exploded.groupby('user')['top_categories_encoded']
    .mean()
    .reset_index()
)

In [None]:
from scipy.stats import entropy

def calculate_entropy(row):
    """Return the entropy of the strictly positive entries of `row`.

    Args:
        row: A pandas Series (or array supporting boolean-mask indexing) of
            non-negative weights. Zero and NaN entries are ignored.

    Returns:
        The value of ``scipy.stats.entropy`` over the positive entries, or
        0.0 when the row has no positive entry (for example an all-zero row,
        or a row that became all-NaN after dividing by a zero row sum).
    """
    row_non_zero = row[row > 0]
    if len(row_non_zero) == 0:
        # Nothing to measure: define the diversity of an empty distribution as
        # 0 instead of handing an empty input to scipy, which may yield NaN.
        return 0.0
    return entropy(row_non_zero)

# Divide each user's category totals by that user's row sum, then score every
# user with calculate_entropy. 'user' is the index of category_preferences, so
# reset_index() brings it back as a column.
row_totals = category_preferences.sum(axis=1)
category_preferences = category_preferences.div(row_totals, axis=0)
diversity_df = (
    category_preferences.apply(calculate_entropy, axis=1)
    .reset_index()
    .set_axis(['user', 'diversity'], axis=1)
)

In [None]:
def plot_all_users_diversity(diversity_df):
    """Show a 30-bin histogram of the 'diversity' column of `diversity_df`."""
    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(diversity_df['diversity'], bins=30, edgecolor='k', alpha=0.7)
    ax.set_title('Distribution of Diversity (Entropy) Scores for All Users')
    ax.set_xlabel('Diversity (Entropy)')
    ax.set_ylabel('Number of Users')
    plt.show()

# Plot the entropy distribution for every user in diversity_df.
plot_all_users_diversity(diversity_df)

In [None]:
# Start from the majority listeners' embeddings and left-join each per-user
# feature table on 'user', so users missing from a table are still kept.
new_features_df = majority_user_embeddings_df.copy()
for per_user_features in (average_length, top_category_features, diversity_df):
    new_features_df = new_features_df.merge(per_user_features, on='user', how='left')

# Features that were missing for a user (NaN after the left joins) become 0.
new_features_df = new_features_df.fillna(0)

new_features_df

In [None]:
from sklearn.preprocessing import StandardScaler

# Reduce the raw user embeddings to 5 principal components. random_state pins
# the result when scikit-learn selects a randomized SVD solver.
embedding_pca = PCA(n_components=5, random_state=42)  # Adjust the number of components as needed
embeddings = embedding_pca.fit_transform(np.stack(new_features_df['embedding'].values))

# Every column that is not the user id, the raw embedding, or a 'Cluster'
# label from an earlier clustering run is used as an additional feature.
# Excluding 'Cluster' keeps a re-run of this cell from feeding the previous
# cluster assignment back into the clustering input.
non_feature_columns = [
    column for column in ('user', 'embedding', 'Cluster')
    if column in new_features_df.columns
]
additional_features = new_features_df.drop(columns=non_feature_columns)

# Standardise the additional features, then place them next to the PCA components.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(additional_features)
X = np.hstack((embeddings, normalized_features))

In [None]:
# Sanity check: one row per row of new_features_df, 5 columns (PCA components).
embeddings.shape

In [None]:
# Sanity check: same row count as `embeddings`, one column per additional feature.
normalized_features.shape

In [None]:
# Remove the label column left behind by a previous clustering run.
# errors='ignore' makes this a no-op on a fresh kernel, where 'Cluster' has not
# been assigned yet and a plain drop would raise KeyError.
new_features_df.drop(columns=['Cluster'], inplace=True, errors='ignore')

In [None]:
# Partition the feature matrix X into `cluster_number` groups with k-means
# (seeded via random_state) and store each row's cluster id on the frame.
cluster_number = 30
kmeans = KMeans(n_clusters=cluster_number, random_state=42).fit(X)
labels = kmeans.labels_
new_features_df.loc[:, 'Cluster'] = labels

In [None]:
def get_cluster_recommendations(cluster_label, user_embeddings_df, shiur_df, top_n=500):
    """Return the shiur ids most similar to a cluster's mean user embedding.

    Args:
        cluster_label: Value of the 'Cluster' column that selects the cluster.
        user_embeddings_df: Frame with 'Cluster' and 'embedding' columns.
        shiur_df: Frame with 'shiur' (id) and 'embedding' columns.
        top_n: Maximum number of ids to return.

    Returns:
        A list of at most `top_n` values from ``shiur_df['shiur']``, ordered
        from most to least cosine-similar to the cluster's mean embedding.
        Empty when `top_n` is not positive.

    Raises:
        ValueError: If no row of `user_embeddings_df` has `cluster_label`.
    """
    if top_n <= 0:
        # Guard: a slice of [-0:] would select every shiur instead of none.
        return []

    in_cluster = user_embeddings_df['Cluster'] == cluster_label
    member_embeddings = list(user_embeddings_df.loc[in_cluster, 'embedding'].values)
    if not member_embeddings:
        raise ValueError(f"No users found in cluster {cluster_label}")

    cluster_avg_embedding = np.mean(np.vstack(member_embeddings), axis=0).reshape(1, -1)
    shiur_ids = shiur_df['shiur'].values
    shiur_embeddings = np.vstack(list(shiur_df['embedding'].values))
    similarities = cosine_similarity(cluster_avg_embedding, shiur_embeddings).flatten()

    # argsort is ascending: take the last top_n positions and reverse them so
    # the most similar shiur comes first.
    top_similar_indices = similarities.argsort()[-top_n:][::-1]
    return [shiur_ids[i] for i in top_similar_indices]

In [None]:
def fine_tune_recommendations(user_id, user_embeddings_df, cluster_recommendations, shiur_df, top_n=5):
    """Re-rank candidate shiurim by similarity to one user's own embedding.

    Args:
        user_id: Value of the 'user' column identifying the user.
        user_embeddings_df: Frame with 'user' and 'embedding' columns.
        cluster_recommendations: Sequence of candidate shiur ids.
        shiur_df: Frame with 'shiur' (id) and 'embedding' columns.
        top_n: Maximum number of ids to return.

    Returns:
        A list of at most `top_n` entries of `cluster_recommendations`,
        ordered from most to least cosine-similar to the user's embedding.
        Empty when `top_n` is not positive or there are no candidates.

    Raises:
        IndexError: If `user_id`, or any candidate id, has no matching row.
    """
    if top_n <= 0 or len(cluster_recommendations) == 0:
        # Guard: a slice of [-0:] would return every candidate, and an empty
        # candidate list cannot be scored.
        return []

    user_rows = user_embeddings_df.loc[user_embeddings_df['user'] == user_id, 'embedding']
    user_embedding = np.array(user_rows.values[0]).reshape(1, -1)

    # First matching row per candidate id, kept in candidate order.
    recommended_shiur_embeddings = np.array([
        shiur_df.loc[shiur_df['shiur'] == shiur, 'embedding'].values[0]
        for shiur in cluster_recommendations
    ])
    similarities = cosine_similarity(user_embedding, recommended_shiur_embeddings).flatten()

    # Ascending argsort: the last top_n positions, reversed, are the best matches.
    top_similar_indices = similarities.argsort()[-top_n:][::-1]
    return [cluster_recommendations[i] for i in top_similar_indices]

In [None]:
def get_final_recommendations(user_id, user_embeddings_df, shiur_df, top_n=5):
    """Return ``{shiur id: full_details}`` recommendations for one user.

    The user's cluster is looked up, ``top_n * 2`` candidates are taken from
    that cluster via get_cluster_recommendations, and the candidates are then
    narrowed to `top_n` with fine_tune_recommendations.

    Raises:
        ValueError: If `user_id` is not present in ``user_embeddings_df['user']``.
    """
    known_users = user_embeddings_df['user'].values
    if user_id not in known_users:
        raise ValueError(f"User ID {user_id} not found in user_embeddings_df")

    is_user = user_embeddings_df['user'] == user_id
    cluster_label = user_embeddings_df.loc[is_user, 'Cluster'].values[0]

    # Over-fetch at cluster level so the per-user re-ranking has a choice.
    candidates = get_cluster_recommendations(cluster_label, user_embeddings_df, shiur_df, top_n*2)
    chosen = fine_tune_recommendations(user_id, user_embeddings_df, candidates, shiur_df, top_n)

    final_recommendations = {}
    for shiur_id in chosen:
        details = shiur_df.loc[shiur_df['shiur'] == shiur_id, 'full_details']
        final_recommendations[shiur_id] = details.values[0]
    return final_recommendations


In [None]:
# Join every bookmark row to its shiur's metadata and keep only the columns
# needed to show a user's history.
info_df = bookmarks_df.merge(shiur_df, on='shiur')[['user', 'shiur', 'title', 'full_details']]

In [None]:
def get_user_shiurs(user_id):
    """Return the 'full_details' values of every info_df row for `user_id`."""
    belongs_to_user = info_df['user'] == user_id
    return info_df.loc[belongs_to_user, 'full_details'].values

In [None]:
# Example user to inspect. The id must appear in new_features_df['user'];
# otherwise get_final_recommendations raises ValueError.
user_id = 224576

In [None]:
# Recommendations for the example user (default top_n=5), returned as
# {shiur id: full_details}.
get_final_recommendations(user_id, new_features_df, all_shiur_embeddings)

In [None]:
# full_details of the same user's bookmark rows, for comparison with the
# recommendations above.
get_user_shiurs(user_id)

In [None]:
# Empty table with one row per cluster id: a 'Cluster' column followed by every
# cat_df column after the first one.
category_columns = list(cat_df.columns[1:])
cluster_category_counts = pd.DataFrame(columns=['Cluster', *category_columns])
cluster_category_counts['Cluster'] = range(cluster_number)

def get_category_breakdown_for_cluster(cluster_id):
    """Return ``{category: total}`` over the shiurim bookmarked by a cluster's users.

    Users are taken from new_features_df rows whose 'Cluster' equals
    `cluster_id`; their shiur ids come from bookmarks_df. Each matching cat_df
    row is counted once, however many bookmark rows refer to it.
    """
    in_cluster = new_features_df['Cluster'] == cluster_id
    cluster_users = new_features_df.loc[in_cluster, 'user']

    bookmarked_by_cluster = bookmarks_df['user'].isin(cluster_users)
    cluster_shiurs = bookmarks_df.loc[bookmarked_by_cluster, 'shiur']

    cluster_categories = cat_df.loc[cat_df['shiur'].isin(cluster_shiurs)]
    return cluster_categories.drop(columns='shiur').sum().to_dict()

# Write each cluster's per-category totals into its row of the table.
for cluster_id in range(cluster_number):
    for category, count in get_category_breakdown_for_cluster(cluster_id).items():
        cluster_category_counts.at[cluster_id, category] = count

# Cells that were never written are NaN; treat them as zero.
cluster_category_counts = cluster_category_counts.fillna(0)

# Coerce every column after 'Cluster' to a numeric dtype so that nlargest can
# be used on the rows later.
for column in cluster_category_counts.columns[1:]:
    cluster_category_counts[column] = pd.to_numeric(cluster_category_counts[column], errors='coerce')

# Persist the table in the current working directory.
cluster_category_counts.to_csv("results.csv")

# For every cluster keep its five largest category totals; the 'Cluster' id
# column is excluded from the ranking.
category_only = cluster_category_counts.columns != 'Cluster'
top_categories = {
    cluster: cluster_category_counts.loc[cluster, category_only].nlargest(5)
    for cluster in cluster_category_counts.index
}

# Rows are clusters, columns are the categories that reached some cluster's
# top five; a cell is NaN where the category is not in that cluster's top five.
top_categories_df = pd.DataFrame(top_categories).T

# Plotting: one horizontal bar chart of the top-5 categories per cluster,
# laid out on a grid with `cols` panels per row.
sns.set(style="whitegrid")

num_clusters = len(top_categories_df)
cols = 3
rows = (num_clusters // cols) + (num_clusters % cols > 0)

# squeeze=False keeps `axes` two-dimensional even when a single row of panels
# is enough, so the panels can always be addressed through the flattened array.
fig, axes = plt.subplots(rows, cols, figsize=(15, 20), constrained_layout=True, squeeze=False)
flat_axes = axes.flatten()

for i, cluster in enumerate(top_categories_df.index):
    ax = flat_axes[i]

    # NaN entries (categories outside this cluster's top five) sort last.
    top_categories_cluster = top_categories_df.loc[cluster].sort_values(ascending=False)

    sns.barplot(x=top_categories_cluster.values[:5], y=top_categories_cluster.index[:5], ax=ax, hue=top_categories_cluster.index[:5], palette="viridis", legend=False)
    # Count members by cluster id rather than by panel position.
    num_users = int((new_features_df['Cluster'] == cluster).sum())
    ax.set_title(f'Cluster {cluster} - {num_users} Users')
    ax.set_xlabel('Count')
    ax.set_ylabel('Category')

# Remove any empty subplots left at the end of the grid.
for unused_ax in flat_axes[num_clusters:]:
    fig.delaxes(unused_ax)

plt.suptitle('Top 5 Categories by Cluster', fontsize=16)
plt.show()

In [None]:
from sklearn.manifold import TSNE
# Cluster ids and raw embeddings, row-aligned with new_features_df.
# Note: this rebinds `embeddings`, which earlier held the PCA-reduced matrix.
cluster_labels = new_features_df['Cluster'].values
embeddings = np.stack(new_features_df['embedding'].values)

# Project the embeddings to two dimensions with t-SNE (seeded via random_state).
tsne = TSNE(n_components=2, random_state=42)
tsne_embeddings = tsne.fit_transform(embeddings)

def plot_tsne_clusters(tsne_embeddings, cluster_labels, clusters_to_plot, cluster_category_counts, title='TSNE Visualization'):
    """Scatter-plot the t-SNE points of selected clusters with their top category.

    Args:
        tsne_embeddings: 2-D array; columns 0 and 1 are used as x and y.
        cluster_labels: NumPy array of cluster ids, row-aligned with
            `tsne_embeddings`.
        clusters_to_plot: Iterable of cluster ids to include.
        cluster_category_counts: Frame of per-category counts, looked up by
            index label with each cluster id. A 'Cluster' column, if present,
            is ignored when picking the top category.
        title: Figure title.
    """
    plt.figure(figsize=(12, 10))
    clusters_to_plot = list(clusters_to_plot)
    mask = np.isin(cluster_labels, clusters_to_plot)
    subset_embeddings = tsne_embeddings[mask]
    subset_labels = cluster_labels[mask]

    scatter = plt.scatter(subset_embeddings[:, 0], subset_embeddings[:, 1], c=subset_labels, cmap='viridis', s=50)

    # num=None yields one handle per distinct plotted value, in sorted order,
    # so labelling from np.unique keeps handles and labels aligned even when a
    # requested cluster has no points.
    present_clusters = np.unique(subset_labels)
    handles, _ = scatter.legend_elements(num=None)
    plt.legend(handles=handles, labels=[f'Cluster {i}' for i in present_clusters], title="Clusters")

    # The cluster id column is not a category; leave it out of the ranking so a
    # large id cannot be reported as the "top category".
    category_counts = cluster_category_counts.drop(columns='Cluster', errors='ignore')

    # Adding text labels for each cluster at the mean position of its points.
    # Only clusters that actually have points are labelled (no NaN means).
    for cluster_id in present_clusters:
        cluster_mean = subset_embeddings[subset_labels == cluster_id].mean(axis=0)
        categories = category_counts.loc[cluster_id].nlargest(1).index[0]
        plt.text(cluster_mean[0], cluster_mean[1], f'Cluster {cluster_id}\nTop: {categories}', fontsize=9, ha='center')

    plt.colorbar(scatter, label='Cluster')
    plt.title(title)
    plt.xlabel('TSNE Dimension 1')
    plt.ylabel('TSNE Dimension 2')
    plt.show()

# Number of clusters
# NOTE(review): the loop below assumes the labels are exactly 0..num_clusters-1.
num_clusters = len(np.unique(cluster_labels))

# Number of clusters to visualize at a time
clusters_per_plot = 5

for i in range(0, num_clusters, clusters_per_plot):
    # Clamp the chunk so the last figure's title names only clusters that exist.
    chunk_end = min(i + clusters_per_plot, num_clusters)
    clusters_to_plot = range(i, chunk_end)
    plot_tsne_clusters(tsne_embeddings, cluster_labels, clusters_to_plot, cluster_category_counts, title=f'TSNE Visualization for Clusters {i}-{chunk_end - 1}')
