In [None]:
import os
import sys
from pathlib import Path
import ast
import random
import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, hstack
# Setting the working directory to the root of the project.
# The location can be overridden with the KFHX_PROJECT_DIR environment variable so
# the notebook is not tied to a single machine; the original path stays the default.
# NOTE(review): the chdir presumably exists so that the `Configs` package imported
# below resolves from the project root - confirm before removing it.
project_dir = Path(os.environ.get("KFHX_PROJECT_DIR", "C:/Users/adbou/source/repos/KFHXRelatedAi/"))
os.chdir(project_dir)

from Configs.GeneralPaths import SOURCEDATA
import pandas as pd
import numpy as np
from collections import defaultdict
pd.set_option('display.max_columns', None)

In [None]:
def load_user_profiles(cache_path):
    """Deserialize and return the cached user profiles stored at ``cache_path``.

    The file is read with ``joblib.load``. No existence check is performed
    here, so a missing or unreadable file surfaces as the exception raised by
    joblib itself.

    NOTE(review): joblib caches are pickle-based; only load files that come
    from a trusted source.
    """
    return joblib.load(cache_path)

In [None]:
# Load the cached user profiles and print a summary of their columns and dtypes.
# (A mid-cell `.head()` call was dropped: only the last expression of a cell is
# displayed, so its result was silently discarded.)
user_profiles = load_user_profiles(Path(SOURCEDATA / "cached_user_profiles.pkl"))
user_profiles.info()


In [None]:
# Preview the first rows of the loaded user profiles.
user_profiles.head()

In [None]:
# --- Deals: cleaned deal table, content profiles and text embeddings ---------
deals_data = (
    pd.read_excel(Path(SOURCEDATA / "Rdepemtion_Cleaned_Deals.xlsx"))
    .drop(columns=['Unnamed: 0.1'])
)
deals_profiles = pd.read_excel(Path(SOURCEDATA / "Updated_Content_Profiles.xlsx"))

# Embeddings are stored in the CSV as text; parse each cell back into a Python object.
deals_embeddings = pd.read_csv(Path(SOURCEDATA / "Deals_Embeddings.csv"))
deals_embeddings['ada_embedding'] = deals_embeddings['ada_embedding'].apply(ast.literal_eval)

# --- Transactions: drop the transaction id and attach each deal's category/type
user_transactions = pd.read_excel(Path(SOURCEDATA / "Transaction_User.xlsx"))
new_user_transaction = (
    user_transactions
    .drop(columns=['TrxId'])
    .merge(
        deals_data[['ContentId', 'Categories','Deal Type']],
        left_on='FK_ContentId',
        right_on='ContentId',
        how='left',
    )
    .drop(columns=['ContentId'])
)

# --- MCC reference data ------------------------------------------------------
mcc_mapping = pd.read_excel(Path(SOURCEDATA / "MCC_Details.xlsx"))
mcc_embeddings = pd.read_csv(Path(SOURCEDATA / "Unique_MCCs_Embeddings.csv"))

# MCC codes are used as string join keys further down, so store them as str.
mcc_mapping['MCC'] = mcc_mapping['MCC'].astype(str)

In [None]:
# Preview the first rows of the deal content profiles.
deals_profiles.head()

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Single-feature matrix (one row per user) for clustering on total spend.
spending_data = user_profiles[['total_amount_spent']].values.reshape(-1, 1)

# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit.
k_values = range(1, 11)
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(spending_data)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests the cluster count.
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, 'bx-')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Reshaping data for clustering
spending_data = user_profiles[['total_amount_spent']].values.reshape(-1, 1)

# Applying KMeans clustering with k=3, chosen from the elbow plot
kmeans = KMeans(n_clusters=3 , random_state=0).fit(spending_data)
user_profiles['spender_category'] = kmeans.labels_

# KMeans assigns label ids in no particular order, so label 0 is not guaranteed
# to be the lowest-spending cluster. Rank the clusters by their centroid value
# and name them in ascending order of spend.
spender_names = ['low', 'medium', 'high']
centroid_order = np.argsort(kmeans.cluster_centers_.ravel())
cluster_map = {int(label): name for label, name in zip(centroid_order, spender_names)}
user_profiles['spender_category'] = user_profiles['spender_category'].map(cluster_map)

print(user_profiles[['FK_BusinessUserId', 'total_amount_spent', 'spender_category']])

In [None]:
# Preview user_profiles again; it now carries the spender_category column.
user_profiles.head()

In [None]:
def is_numeric_column(col_name):
    """Return True when ``col_name`` can be converted with ``int()``.

    Used to pick out the columns whose label is an MCC code. Labels that
    ``int()`` rejects - either because the text is not an integer literal
    (ValueError) or because the label's type cannot be converted at all,
    e.g. ``None`` or a tuple (TypeError) - are reported as non-numeric
    instead of raising.
    """
    try:
        int(col_name)
    except (TypeError, ValueError):
        return False
    return True


# Frequency columns: those whose label parses as an integer MCC code.
mcc_columns = list(filter(is_numeric_column, user_profiles.columns))

# Amount columns: labels (compared as strings) carrying the per-MCC amount prefix.
mcc_amounts_cols = [
    name
    for name in user_profiles.columns.astype(str)
    if name.startswith('total_amount_mcc_')
]

def calculate_user_interest_score(user_id):
    """Score how strongly one user is associated with each MCC.

    Reads the module-level ``user_profiles``, ``mcc_columns`` and
    ``mcc_amounts_cols``. For every MCC that has both a frequency column and a
    matching ``total_amount_mcc_<mcc>`` column with positive values, the score
    is ``(frequency / total frequency) * (amount / total amount)``.

    Parameters
    ----------
    user_id : value compared against ``user_profiles['FK_BusinessUserId']``.

    Returns
    -------
    dict
        Maps MCC column label -> score. Empty when the user's total frequency
        or total amount is not positive.

    Raises
    ------
    IndexError
        When no row matches ``user_id`` (the first-element lookup fails).
    """
    profile_row = user_profiles[user_profiles['FK_BusinessUserId'] == user_id]

    frequency_total = profile_row[mcc_columns].sum(axis=1).values[0]
    amount_total = profile_row[mcc_amounts_cols].sum(axis=1).values[0]

    scores = {}
    # Without positive totals the ratios below are meaningless.
    if not (frequency_total > 0 and amount_total > 0):
        return scores

    for mcc in mcc_columns:
        amount_col = f'total_amount_mcc_{mcc}'
        if amount_col not in profile_row.columns:
            continue

        frequency = profile_row[mcc].values[0]
        amount = profile_row[amount_col].values[0]
        if frequency > 0 and amount > 0:
            scores[mcc] = (frequency / frequency_total) * (amount / amount_total)

    return scores

In [None]:
def convert_to_array(x):
    """Coerce an embedding value into a ``numpy.ndarray``.

    Parameters
    ----------
    x : str, list, tuple or numpy.ndarray
        A string is parsed with ``ast.literal_eval`` (e.g. ``"[0.1, 0.2]"``);
        lists, tuples and arrays are passed to ``np.array`` directly. Tuples
        are accepted because a tuple written as a string was already accepted.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If a string cannot be parsed (the parser's error is attached as the
        cause), or if ``x`` is of any other type.
    """
    if isinstance(x, str):  # Serialized form: parse the literal first
        try:
            return np.array(ast.literal_eval(x))
        except (ValueError, SyntaxError) as exc:
            # Chain the original error so the parse failure stays visible.
            raise ValueError(f"Cannot convert to array: {x}") from exc
    if isinstance(x, (list, tuple, np.ndarray)):  # Already a sequence/array
        return np.array(x)
    raise ValueError(f"Unexpected format: {x}")

In [None]:
def get_top_n_mccs(user_profiles, n=2, mcc_details=None):
    """Rank MCCs across all users and return the top ``n`` with their details.

    Each MCC's score is the equally weighted sum of its share of the total
    amount spent and its share of the total transaction frequency.

    Parameters
    ----------
    user_profiles : pandas.DataFrame
        Must contain frequency columns whose label is all digits (the MCC
        code) and amount columns named ``total_amount_mcc_<MCC>``.
    n : int, default 2
        Number of top-scoring MCCs to return.
    mcc_details : pandas.DataFrame, optional
        Lookup table with a string ``MCC`` column that is left-joined onto the
        result. Defaults to the module-level ``mcc_mapping``.

    Returns
    -------
    pandas.DataFrame
        Columns ``MCC`` (str) and ``Score``, followed by the columns of the
        details table, ordered by descending score.
    """
    amount_prefix = 'total_amount_mcc_'
    if mcc_details is None:
        mcc_details = mcc_mapping

    # Identifying MCC columns for amount and frequency in user_profiles
    amount_columns = [col for col in user_profiles.columns if str(col).startswith(amount_prefix)]
    frequency_columns = [col for col in user_profiles.columns if str(col).isdigit()]

    # Each MCC's share of the overall amount and of the overall frequency
    amount_sums = user_profiles[amount_columns].sum()
    frequency_sums = user_profiles[frequency_columns].sum()
    normalized_amount = amount_sums / amount_sums.sum()
    normalized_frequency = frequency_sums / frequency_sums.sum()

    # Key both series by the MCC code so they are combined per MCC, not by
    # column position: the two column groups are not guaranteed to be in the
    # same order or to have the same length.
    normalized_amount.index = [str(col)[len(amount_prefix):] for col in amount_columns]
    normalized_frequency.index = [str(col) for col in frequency_columns]
    # An MCC with a frequency column but no amount column contributes 0 amount.
    normalized_amount = normalized_amount.reindex(normalized_frequency.index, fill_value=0.0)

    # Equal-weight sum of the two shares
    combined_score = 0.5 * normalized_amount + 0.5 * normalized_frequency

    # Rank and keep the top n MCCs
    top_scores = combined_score.sort_values(ascending=False).head(n)
    top_mccs_df = top_scores.rename_axis('MCC').reset_index(name='Score')
    top_mccs_df['MCC'] = top_mccs_df['MCC'].astype(str)

    return pd.merge(top_mccs_df, mcc_details, on='MCC', how='left')

# Top-2 MCCs across all users. recommend_based_on_profiles reads this as a
# module-level variable and halves the score of deals in these categories.
top_mccs = get_top_n_mccs(user_profiles , 2)

In [None]:
def recommend_based_on_profiles(user_id, deal_embeddings, deal_data, user_profiles, n_similar_items=10, isDf=False):
    """Recommend deals to a user from their spending profile alone.

    The user is represented by the mean embedding of the MCCs they have an
    interest score for. Each deal is scored by cosine similarity to that
    embedding, then adjusted:

    * x0.5 when the deal's category is one of the global ``top_mccs``;
    * x(1.2 + interest score) when the user has a score for the category,
      otherwise x0.8;
    * x1.5 when the deal's value segment matches the user's spender category.

    Finally a per-category cap is enforced while picking the best deals.

    Besides its parameters the function reads the module-level ``mcc_mapping``,
    ``mcc_embeddings``, ``deals_profiles`` and ``top_mccs``, and calls
    ``calculate_user_interest_score`` (which reads the module-level
    ``user_profiles``, not the ``user_profiles`` argument).

    Side effects: prints the spender category, and parses
    ``mcc_embeddings['ada_embedding']`` in place so that later calls do not
    have to parse the strings again.

    Parameters
    ----------
    user_id : value matched against ``user_profiles['FK_BusinessUserId']``.
    deal_embeddings : pandas.DataFrame with ``ContentId`` and ``ada_embedding``.
    deal_data : pandas.DataFrame with ``ContentId`` and ``Categories``.
    user_profiles : pandas.DataFrame with ``FK_BusinessUserId`` and
        ``spender_category``.
    n_similar_items : int, default 10
        Selection stops once this many deals have been picked.
    isDf : bool, default False
        Return a DataFrame instead of two lists.

    Returns
    -------
    pandas.DataFrame
        When ``isDf`` is true: columns ``ContentId``, ``Score``, ``Category``.
    tuple of (list, list)
        Otherwise: content ids and their scores, best first.

    Raises
    ------
    IndexError
        If ``user_id`` is not present in ``user_profiles``.
    ValueError
        If none of the user's MCCs has an embedding, so no user embedding can
        be built.
    """
    user_profile = user_profiles[user_profiles['FK_BusinessUserId'] == user_id]
    if user_profile.empty:
        # Same exception type the bare first-element lookup produced, clearer message.
        raise IndexError(f"user_id {user_id} not found in user_profiles")
    spender_category = user_profile['spender_category'].values[0]
    print(f"Spender Category {spender_category}")

    # Per-MCC interest scores of the user
    mcc_scores = calculate_user_interest_score(user_id)

    # Attach the MCC descriptions to the scores, best score first
    mcc_scores_df = pd.DataFrame.from_dict(mcc_scores, orient='index', columns=['Score']).reset_index()
    mcc_scores_df.columns = ['MCC', 'Score']
    mcc_scores_df['MCC'] = mcc_scores_df['MCC'].astype(str)  # mcc_mapping['MCC'] is str
    mcc_scores_df = (
        mcc_scores_df
        .merge(mcc_mapping, on='MCC', how='left')
        .sort_values(by='Score', ascending=False)
    )

    # Interest score keyed by the MCC description, which is what deals carry.
    # NOTE(review): if several MCCs share one 'Detailed MCC', only the last
    # (lowest-scoring) one survives in this dict - confirm that is intended.
    user_mcc_scores = mcc_scores_df.set_index('Detailed MCC')['Score'].to_dict()

    spender_ranges = {
        'low': ['Low-Budget Deal'],
        'medium': ['Medium Budget Deal','Low-Budget Deal'],
        'high': ['High-End Deal']
    }

    # Parse the MCC embeddings (in place, see docstring) and index them by MCC
    mcc_embeddings['ada_embedding'] = mcc_embeddings['ada_embedding'].apply(convert_to_array)
    mcc_embedding_dict = mcc_embeddings.set_index('MCC')['ada_embedding'].to_dict()

    # Embeddings of the MCCs the user has a score for
    matched_embeddings = [mcc_embedding_dict[mcc] for mcc in mcc_scores if mcc in mcc_embedding_dict]
    if not matched_embeddings:
        # Fail with a clear message instead of averaging an empty list into NaN.
        raise ValueError(f"No MCC embeddings found for user_id {user_id}; cannot build a user embedding")
    user_embedding = np.mean(matched_embeddings, axis=0)

    # Per-deal lookups built once instead of filtering the frames on every row.
    # The first occurrence of an id wins, as with the row-wise `.values[0]` lookup.
    segment_by_deal = (
        deals_profiles
        .drop_duplicates(subset='FK_ContentId')
        .set_index('FK_ContentId')['Deal Value Segment']
        .to_dict()
    )
    embedding_by_deal = (
        deal_embeddings
        .drop_duplicates(subset='ContentId')
        .set_index('ContentId')['ada_embedding']
        .to_dict()
    )
    popular_categories = top_mccs["Detailed MCC"].tolist()
    boosted_segments = spender_ranges.get(spender_category, [])

    recommendations = []
    for _, row in deal_data.iterrows():
        content_id = row['ContentId']
        category = row['Categories']

        # A deal without an embedding cannot be scored; skip it rather than
        # aborting the whole recommendation.
        if content_id not in embedding_by_deal:
            continue
        deal_embedding = np.array(embedding_by_deal[content_id])
        score = cosine_similarity([user_embedding], [deal_embedding])[0][0]

        # Dampen categories that are popular across all users
        if category in popular_categories:
            score *= 0.5

        # Adjusting the score based on MCC interest scores of the user
        if category in user_mcc_scores:
            score *= (1.2 + user_mcc_scores[category])
        else:
            score *= 0.8

        # Boost deals whose value segment fits the spender category
        # (a deal with no known segment simply gets no boost)
        if segment_by_deal.get(content_id) in boosted_segments:
            score *= 1.5

        # Recency weighting is currently disabled:
        # recency = 1 / (1 + user_profile['recency'].values[0])
        # score *= (1 + 0.3 * recency)

        recommendations.append((content_id, score, category))

    # Sorting recommendations by score
    sorted_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

    # Enforcing diversity: cap the number of deals per category. Users with
    # few scored categories get a looser cap.
    max_per_category = 8 if len(user_mcc_scores) < 4 else 2
    category_count = defaultdict(int)
    final_recommendations = []

    for content_id, score, category in sorted_recommendations:
        # Deals beyond the cap are dropped. (A score penalty used to be applied
        # to them first, but the penalised score was never used.)
        if category_count[category] < max_per_category:
            final_recommendations.append((content_id, score, category))
            category_count[category] += 1

        if len(final_recommendations) >= n_similar_items:
            break

    if isDf:
        return pd.DataFrame(final_recommendations, columns=['ContentId', 'Score', 'Category'])

    similar_item_ids = [content_id for content_id, _, _ in final_recommendations]
    similar_item_scores = [score for _, score, _ in final_recommendations]
    return similar_item_ids, similar_item_scores

In [None]:
# Run the profile-based recommender for one user and print the top 20 deals.
user_id = 978282
recommendations = recommend_based_on_profiles(
    user_id,
    deals_embeddings,
    deals_data,
    user_profiles,
    n_similar_items=20,
    isDf=False,
)
# `recommendations` is (ids, scores); zip(*...) pairs each id with its score.
for content_id, score in zip(*recommendations):
    print(f"Content-Based Recommended deal: {content_id}, Score: {score}, Category: {deals_data.loc[deals_data['ContentId'] == content_id, 'Categories'].values[0]}")

In [None]:
# Look up a single deal by its ContentId.
# (A trailing comma previously turned the id into a 1-element tuple.)
specific_content_id = 113896
result = deals_data[deals_data['ContentId'] == specific_content_id]

result

In [None]:
# Interest scores of the user selected above, keyed by MCC column label.
mcc_scores = calculate_user_interest_score(user_id)

# One row per MCC, with the code as a string so it joins onto mcc_mapping.
mcc_scores_df = pd.DataFrame.from_dict(mcc_scores, orient='index', columns=['Score']).reset_index()
mcc_scores_df.columns = ['MCC', 'Score']
mcc_scores_df = (
    mcc_scores_df
    .assign(MCC=mcc_scores_df['MCC'].astype(str))
    .merge(mcc_mapping, on='MCC', how='left')
)

# Human-readable label (code plus description), then order by descending score.
mcc_scores_df = (
    mcc_scores_df
    .assign(Label=mcc_scores_df['MCC'] + ' - ' + mcc_scores_df['Detailed MCC'])
    .sort_values(by='Score', ascending=False)
)

mcc_scores_df