In [None]:
import os
import sys
from pathlib import Path
# Setting the working directory to the root of the project.
# The root can be overridden with the KFHX_PROJECT_DIR environment variable so the
# notebook is not tied to a single machine; when the variable is unset, the original
# hard-coded checkout location is used, so existing setups keep working unchanged.
project_dir = Path(os.environ.get("KFHX_PROJECT_DIR", "C:/Users/adbou/source/repos/KFHXRelatedAi/"))
# os.chdir raises FileNotFoundError if the directory does not exist, which stops the
# notebook before the project-relative import below can fail in a less obvious way.
os.chdir(project_dir)

from Configs.GeneralPaths import SOURCEDATA
import pandas as pd
import numpy as np

In [None]:
#load user transactions data
# Reads the "Transaction_User.xlsx" workbook located under SOURCEDATA.
user_transactions = pd.read_excel(Path(SOURCEDATA / "Transaction_User.xlsx"))
# Working copy without the TrxId column; `user_transactions` keeps the raw sheet.
new_user_tranaction = user_transactions.drop(columns=['TrxId'])
# .info() prints its report itself, so it can run mid-cell. .head() only renders when it
# is the final expression of the cell, so it has to come last (it was previously
# evaluated and silently discarded).
new_user_tranaction.info()
new_user_tranaction.head()

In [None]:
#load deals data
# Reads the "Cleaned_Deals.xlsx" workbook located under SOURCEDATA.
deals_data = pd.read_excel(Path(SOURCEDATA / "Cleaned_Deals.xlsx"))
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an earlier export — confirm.
deals_data = deals_data.drop(columns=['Unnamed: 0'])
# .info() prints its report itself; .head() must be the final expression of the cell to
# be rendered (it was previously evaluated and silently discarded).
deals_data.info()
deals_data.head()

In [None]:
#merging deals data with user transaction data :
# Attach each deal's Categories to its transactions (left join, so transactions without a
# matching deal are kept with a missing Categories value).
# NOTE(review): if ContentId is not unique in deals_data, this join multiplies transaction
# rows and inflates every per-user / per-item count computed later — confirm uniqueness.
new_user_tranaction = (
    new_user_tranaction
    # Make the cell safe to re-run: a Categories column left by a previous execution would
    # otherwise be suffixed into Categories_x / Categories_y and break later lookups of
    # the 'Categories' column.
    .drop(columns=['Categories'], errors='ignore')
    .merge(deals_data[['ContentId', 'Categories']], left_on='FK_ContentId', right_on='ContentId', how='left')
    # The join key from the deals side duplicates FK_ContentId, so it is removed again.
    .drop(columns=['ContentId'])
)
new_user_tranaction

In [None]:
# Number of transaction rows recorded for each FK_BusinessUserId
user_interaction_counts = new_user_tranaction['FK_BusinessUserId'].value_counts()

# Summary statistics of the per-user counts
print(user_interaction_counts.describe())

# Histogram of the per-user counts, drawn on an explicit 5x5 inch figure/axes pair
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(user_interaction_counts, bins=50, edgecolor='k')
ax.set_xlabel('Number of Interactions per User')
ax.set_ylabel('Count of Users')
ax.set_title('Distribution of Interactions per User')
plt.show()

In [None]:
# Redemptions per user: how many transaction rows each FK_BusinessUserId has.
user_redemption_counts = new_user_tranaction.loc[:, 'FK_BusinessUserId'].value_counts()
user_redemption_counts
# 1012893 — NOTE(review): unexplained value left by the author; confirm what it refers to or remove it.

In [None]:
# Restrict the transactions to users that have at least 3 redemptions.
meets_minimum = user_redemption_counts.ge(3)
users_with_3_plus_redemptions = user_redemption_counts.loc[meets_minimum].index
from_qualifying_user = new_user_tranaction['FK_BusinessUserId'].isin(users_with_3_plus_redemptions)
filtered_user_transactions = new_user_tranaction.loc[from_qualifying_user]
filtered_user_transactions

In [None]:
# User-item matrix: one row per FK_BusinessUserId, one column per FK_ContentId, each cell
# holding the mean PointsRedeemed for that pair and 0 where the pair has no value.
user_item_matrix = (
    new_user_tranaction
    .pivot_table(
        values='PointsRedeemed',
        index='FK_BusinessUserId',
        columns='FK_ContentId',
        aggfunc='mean',
        fill_value=0,
    )
    .fillna(0)
)

In [None]:
# Show the user-item matrix (FK_BusinessUserId rows x FK_ContentId columns) as the cell output.
user_item_matrix

In [None]:
# Text report on the merged transactions, printed one after another:
#   1. descriptive statistics
#   2. number of missing values in each column
#   3. number of distinct values in each column
print(
    new_user_tranaction.describe(),
    new_user_tranaction.isna().sum(),
    new_user_tranaction.nunique(),
    sep='\n',
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of transaction rows per user and per item; both stay available as notebook
# variables after this cell.
user_redemptions = new_user_tranaction.groupby('FK_BusinessUserId').size()
item_redemptions = new_user_tranaction.groupby('FK_ContentId').size()

# One (values, title, x-axis label) entry per histogram. Driving the three plots from a
# single table keeps their settings identical and gives every figure explicit axis
# labels, so each one can be read on its own.
distribution_specs = [
    (new_user_tranaction['PointsRedeemed'], 'Distribution of PointsRedeemed', 'PointsRedeemed'),
    (user_redemptions, 'Distribution of Redemptions per User', 'Redemptions per user'),
    (item_redemptions, 'Distribution of Redemptions per Item', 'Redemptions per item'),
]

for values, title, xlabel in distribution_specs:
    # A fresh figure per distribution; passing ax explicitly avoids relying on pyplot's
    # implicit "current axes".
    fig, ax = plt.subplots()
    sns.histplot(values, bins=50, kde=True, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel='Count')
    plt.show()


In [None]:
# Create the user-item matrix
# Cells hold the mean PointsRedeemed per (user, item) pair (pivot_table's default
# aggregation); pairs without a value are filled with 0.
user_item_matrix = new_user_tranaction.pivot_table(index='FK_BusinessUserId', columns='FK_ContentId', values='PointsRedeemed', fill_value=0)

# Calculate sparsity
# Sparsity is the share of (user, item) cells with no interaction. Counting non-zero
# cells would also treat a real interaction whose mean PointsRedeemed is 0 as "empty",
# so the observed pairs are counted from the transactions instead. Rows with a missing
# key or missing PointsRedeemed are excluded because they do not populate a cell in the
# pivot either.
pair_columns = ['FK_BusinessUserId', 'FK_ContentId']
observed_pairs = (
    new_user_tranaction
    .dropna(subset=pair_columns + ['PointsRedeemed'])
    .drop_duplicates(subset=pair_columns)
    .shape[0]
)
total_cells = user_item_matrix.size
# An empty matrix has no defined sparsity; report NaN rather than dividing by zero.
sparsity = (1.0 - observed_pairs / total_cells) if total_cells else float('nan')
print(f"Sparsity of the user-item matrix: {sparsity:.4f}")


In [None]:
# Bar chart of how many transactions fall into each Categories value
category_counts = new_user_tranaction['Categories'].value_counts()
category_axis = sns.barplot(x=category_counts.index, y=category_counts.values)
category_axis.set_title('Category Distribution')
# Rotate the category names so they stay legible on the x axis
plt.xticks(rotation=90)
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Fill missing values with 0
user_item_matrix_filled = user_item_matrix.fillna(0)

# Apply PCA
# Project every user (one row of the matrix) onto the first two principal components.
# random_state is fixed because PCA may pick a randomized SVD solver for large inputs;
# without a seed the projection could change between runs of the notebook.
pca = PCA(n_components=2, random_state=42)
user_item_pca = pca.fit_transform(user_item_matrix_filled)

# Plot PCA
# alpha=0.5 keeps overlapping points distinguishable.
plt.scatter(user_item_pca[:, 0], user_item_pca[:, 1], alpha=0.5)
plt.title('PCA of User-Item Matrix')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


In [None]:
# Find the FK_ContentId that appears in the most transactions, and how often it appears.
# The per-item tally is built once and queried for both the id and the count.
content_redemption_counts = new_user_tranaction['FK_ContentId'].value_counts()
most_redeemed_content = content_redemption_counts.idxmax()
most_redeemed_count = content_redemption_counts.max()

print(f"The most redeemed FK_ContentId is {most_redeemed_content} with {most_redeemed_count} redemptions.")


In [None]:
# Getting the top_n users with the most redemptions (top_n is set on the next line)
top_n = 30
redemptions_by_user = new_user_tranaction['FK_BusinessUserId'].value_counts()
top_redemptions_users = redemptions_by_user.iloc[:top_n]

print(f"The top {top_n} users with the most redemptions are:\n{top_redemptions_users}")


In [None]:

# List the users who redeemed one specific item more than 5 times.
content_id_to_check = 115099
is_checked_content = new_user_tranaction['FK_ContentId'].eq(content_id_to_check)
filtered_content_data = new_user_tranaction.loc[is_checked_content]
# Note: this rebinds `user_redemptions`, which the distribution-plot cell also assigns
# (there it covers all items; here only the item being checked).
user_redemptions = filtered_content_data.groupby('FK_BusinessUserId').size()
users_with_more_than_5_redemptions = user_redemptions.loc[user_redemptions.gt(5)]

print(users_with_more_than_5_redemptions)
