Import the necessary libraries

In [None]:
import ast
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

Data preprocessing part

In [None]:
# Read CSV file
data = pd.read_csv("C:/Users/Dell/Desktop/E-commerce03.csv")

# Fill missing values with median for numerical columns
data['Annual Income'] = data['Annual Income'].fillna(data['Annual Income'].median())
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Time on Site'] = data['Time on Site'].fillna(data['Time on Site'].median())

# Fill missing values with mode for categorical column
data['Location'] = data['Location'].fillna(data['Location'].mode()[0])

# Filter users within reasonable age range.
# .copy() detaches the filtered rows from the original frame, so the column
# assignments below write to an independent frame instead of raising
# pandas' SettingWithCopyWarning.
data = data[(data['Age'] >= 18) & (data['Age'] <= 100)].copy()

# Bin Annual Income into Income Tier.
# include_lowest=True puts an income of exactly 0 into 'Low'; without it the
# first interval is (0, 50000] and a 0 would be binned as NaN.
data['Income Tier'] = pd.cut(data['Annual Income'], bins=[0, 50000, 80000, 110000, np.inf],
                             labels=['Low', 'Medium', 'High', 'Very High'],
                             include_lowest=True)

# Define region mapping
# NOTE(review): the first row jumps from 'Region 4' to 'Region 6' and the last
# row from 'Region 5' to 'Region 8' - confirm that no city is missing here.
region_mapping = {
    'City P': 'Region 1', 'City N': 'Region 2', 'City V': 'Region 3', 'City W': 'Region 4',
    'City Q': 'Region 6', 'City F': 'Region 7', 'City B': 'Region 8',
    'City A': 'Region 1', 'City T': 'Region 2', 'City U': 'Region 3', 'City Z': 'Region 4',
    'City X': 'Region 5', 'City Y': 'Region 6', 'City L': 'Region 7', 'City M': 'Region 8',
    'City C': 'Region 1', 'City D': 'Region 2', 'City S': 'Region 3', 'City J': 'Region 4',
    'City O': 'Region 5', 'City E': 'Region 8'
}

# Map city locations to regions
data['Region'] = data['Location'].map(region_mapping)

# A city that is absent from region_mapping maps to NaN without any error;
# report such cities so the gap is visible instead of silently losing the region.
unmapped_locations = sorted(data.loc[data['Region'].isna(), 'Location'].astype(str).unique())
if unmapped_locations:
    print(f"Warning: no region mapping for locations: {unmapped_locations}")

# Bin Time on Site into Time on Site Tier (include_lowest keeps a value of 0 in 'Short')
data['Time on Site Tier'] = pd.cut(data['Time on Site'], bins=[0, 100, 200, 300, np.inf],
                                   labels=['Short', 'Medium', 'Long', 'Very Long'],
                                   include_lowest=True)

# Save processed data to new CSV file
try:
    data.to_csv("C:/Users/Dell/Desktop/E-commerce04.csv", index=False)
    print("Data successfully saved to file.")
except PermissionError:
    print("Permission denied to write file. Please ensure the file is not being used by another program and you have sufficient permissions.")
except Exception as e:
    print(f"An error occurred while saving the file: {e}")

KNN Recommendation part - Data preparation

In [None]:
# Read data
data = pd.read_csv("C:/Users/Dell/Desktop/E-commerce04.csv")

# Extract user IDs and purchase histories
user_ids = data['Customer ID'].tolist()
purchase_history = data['Purchase History'].tolist()

# Parse every history exactly once.
# SECURITY: the histories come from a CSV file, so they are parsed with
# ast.literal_eval (literals only) instead of eval, which would execute any
# expression stored in a cell.
parsed_histories = [ast.literal_eval(history) for history in purchase_history]

# Collect all product categories
all_product_categories = {
    purchase['Product Category']
    for purchases in parsed_histories
    for purchase in purchases
}

# Split into training set and validation set.
# .copy() makes validation_data an independent frame so the in-place edits
# below do not raise SettingWithCopyWarning.
train_data, validation_data = train_test_split(data, test_size=0.3, random_state=42)
validation_data = validation_data.copy()

# Randomly remove 50% of the purchase records for validation set users.
# A seeded generator keeps the hold-out reproducible, like the split above.
rng = np.random.default_rng(42)
# Row label -> purchases the model is allowed to see (full history for
# training rows, the kept half for validation rows).
visible_purchases = dict(zip(data.index, parsed_histories))
for index in validation_data.index:
    purchases = visible_purchases[index]
    if purchases:  # Ensure purchase records are not empty
        # Sample positions rather than the dicts themselves, then restore the
        # original order of the kept records.
        keep_positions = rng.choice(len(purchases), size=int(len(purchases) * 0.5), replace=False)
        purchases_to_keep = [purchases[position] for position in sorted(keep_positions)]
        visible_purchases[index] = purchases_to_keep
        validation_data.at[index, 'Purchase History'] = str(purchases_to_keep)

# Create product-user interaction matrix (0 = no visible purchase).
# It is built AFTER the hold-out so the removed validation purchases cannot
# leak into the neighbour search; sorted columns keep the layout identical
# between runs (iteration order of a set of strings is not stable).
interaction_matrix = pd.DataFrame(0.0, index=user_ids, columns=sorted(all_product_categories))

# Fill the matrix elements with ratings
for user_id, row_label in zip(user_ids, data.index):
    for purchase in visible_purchases[row_label]:
        category = purchase['Product Category']
        rating = purchase['Product Review']['Rating']
        interaction_matrix.loc[user_id, category] = rating

KNN Recommendation part - Define functions

In [None]:
# Use KNN algorithm to find similar users
def find_similar_users(user_id, interaction_matrix, n_neighbors=5):
    """Return the IDs of the users whose rating vectors are closest to `user_id`.

    Closeness is cosine distance between rows of `interaction_matrix`
    (missing ratings are treated as 0).

    Args:
        user_id: Index label of the query user in `interaction_matrix`.
        interaction_matrix: DataFrame with one row per user and one column
            per product category.
        n_neighbors: Number of similar users to return (fewer are returned
            only when the matrix does not contain enough other users).

    Returns:
        A list of index labels, nearest first, that never contains `user_id`.

    Raises:
        KeyError: If `user_id` is not in the matrix index.
    """
    # Fill once so the fitted data and the query vector are treated the same way.
    filled_matrix = interaction_matrix.fillna(0)

    # Selecting with a list always yields a 2-D frame; [:1] keeps a single
    # query row even if the label happens to occur more than once.
    user_ratings = filled_matrix.loc[[user_id]].to_numpy()[:1]

    # Ask for one extra neighbour because the query user is part of the fitted
    # data and normally comes back as its own nearest neighbour.
    neighbor_count = min(n_neighbors + 1, len(filled_matrix))
    knn = NearestNeighbors(n_neighbors=neighbor_count, metric='cosine')
    knn.fit(filled_matrix.values)

    _, indices = knn.kneighbors(user_ratings)

    # Map indices back to user IDs and drop the query user by label, not by
    # position: with tied distances the user is not guaranteed to be first.
    candidates = [filled_matrix.index[i] for i in indices[0]]
    similar_users = [candidate for candidate in candidates if candidate != user_id]

    return similar_users[:n_neighbors]

# Generate recommendation list for validation set users
def generate_recommendations(validation_data, interaction_matrix, top_n=3):
    """Recommend product categories to every validation user from their neighbours.

    For each customer the similar users are looked up with
    `find_similar_users`; every category a neighbour has actually rated
    (rating > 0) earns one vote, categories the customer already has in their
    (reduced) purchase history are dropped, and the `top_n` most voted
    categories are returned.

    Args:
        validation_data: DataFrame with 'Customer ID' and 'Purchase History'
            columns; the history is the string form of a list of purchase
            dicts that each carry a 'Product Category' key.
        interaction_matrix: User x category rating matrix; 0 or NaN means
            "not rated".
        top_n: Maximum number of categories recommended per user.

    Returns:
        dict mapping customer ID to a list of recommended categories, most
        voted first.
    """
    # Resolve each customer's history once (first row wins) instead of
    # filtering the whole frame for every customer.
    history_by_user = {}
    for customer_id, history in zip(validation_data['Customer ID'], validation_data['Purchase History']):
        history_by_user.setdefault(customer_id, history)

    recommendations = {}
    for user_id in validation_data['Customer ID']:
        similar_users = find_similar_users(user_id, interaction_matrix)

        # Count, per category, how many neighbours rated it.
        recommended_items = {}
        for similar_user in similar_users:
            if similar_user not in interaction_matrix.index:
                continue
            neighbour_ratings = interaction_matrix.loc[similar_user].dropna()
            # The matrix stores 0 for "never purchased"; a notna() check alone
            # would give every category a vote from every neighbour.
            for item in neighbour_ratings[neighbour_ratings > 0].index:
                recommended_items[item] = recommended_items.get(item, 0) + 1

        # Get items the user has already purchased (after removal of purchase records).
        # literal_eval parses the stored list without executing arbitrary code.
        purchases = ast.literal_eval(history_by_user[user_id])
        purchased_items = {purchase['Product Category'] for purchase in purchases}

        # Rank unseen categories by vote count; the sort is stable, so ties keep
        # the matrix column order.
        unseen_items = [item for item in recommended_items if item not in purchased_items]
        unseen_items.sort(key=recommended_items.get, reverse=True)

        recommendations[user_id] = unseen_items[:top_n]

    return recommendations

# Calculate precision, recall, and F1-score
def calculate_metrics(recommendations, validation_data, full_data=None):
    """Compute user-averaged precision and recall, and the F1 of those averages.

    Args:
        recommendations: dict mapping customer ID to a list of recommended
            product categories.
        validation_data: Accepted for call compatibility; not read here.
        full_data: DataFrame holding the complete purchase histories
            ('Customer ID' and 'Purchase History' columns). When omitted, the
            module-level `data` frame is used.

    Returns:
        Tuple (avg_precision, avg_recall, f1). All three are 0 when
        `recommendations` is empty.

    Raises:
        KeyError: If a recommended user has no row in the history frame.
    """
    total_users = len(recommendations)
    if total_users == 0:
        # Nothing to average; avoids dividing by zero below.
        return 0.0, 0.0, 0.0

    history_source = data if full_data is None else full_data

    # One lookup table (first row per customer) instead of a frame scan per user.
    first_rows = history_source.drop_duplicates(subset='Customer ID')
    history_by_user = dict(zip(first_rows['Customer ID'], first_rows['Purchase History']))

    precision_total = 0
    recall_total = 0

    for user_id, recommended in recommendations.items():
        recommended_items = set(recommended)

        # Get the user's true purchase records (complete purchase history).
        # literal_eval parses the stored list without executing arbitrary code.
        original_purchases = ast.literal_eval(history_by_user[user_id])
        actual_items = {purchase['Product Category'] for purchase in original_purchases}

        hits = len(recommended_items & actual_items)

        # A user with no recommendations (or no purchases) contributes 0.
        precision_total += hits / len(recommended_items) if recommended_items else 0
        recall_total += hits / len(actual_items) if actual_items else 0

    # Calculate average precision and recall
    avg_precision = precision_total / total_users
    avg_recall = recall_total / total_users

    # Calculate F1-score from the averaged values
    f1 = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0

    return avg_precision, avg_recall, f1

# Generate user similarity matrix
def generate_user_similarity_matrix(interaction_matrix):
    """Return a user x user DataFrame of cosine similarities.

    Missing ratings in `interaction_matrix` are replaced by 0 before the
    similarities are computed; the result is labelled with the matrix index
    on both axes.
    """
    user_labels = interaction_matrix.index
    pairwise_scores = cosine_similarity(interaction_matrix.fillna(0))
    return pd.DataFrame(pairwise_scores, index=user_labels, columns=user_labels)

KNN Recommendation part - Generate recommendation list and calculate metrics

In [None]:
# Build the recommendation list for every validation customer
recommendations = generate_recommendations(validation_data, interaction_matrix)

# Show the first few entries so the output can be checked by eye
print("Example of recommendation list:")
for user_id in list(recommendations)[:5]:
    items = recommendations[user_id]
    print(f"Recommendation list for user {user_id}: {items}")

# Score the recommendations
precision, recall, f1 = calculate_metrics(recommendations, validation_data)

# Report each score with four decimal places
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

KNN recommendation part - Visualization

In [None]:
# Visualize user-product category interaction matrix
def visualize_interaction_matrix(interaction_matrix):
    """Scatter-plot the users after projecting their rating rows onto two PCA components.

    Missing ratings are replaced by 0 before the projection. The figure is
    shown immediately; nothing is returned.
    """
    # Project the zero-filled rating rows onto the first two principal components
    dense_ratings = interaction_matrix.fillna(0)
    projection = PCA(n_components=2).fit_transform(dense_ratings)
    first_component = projection[:, 0]
    second_component = projection[:, 1]

    # One point per user
    plt.figure(figsize=(10, 8))
    plt.scatter(first_component, second_component, alpha=0.5)
    plt.title('User Embeddings using PCA')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

# Visualize recommendation accuracy
def visualize_recommendation_accuracy(recommendations, validation_data, interaction_matrix, data):
    """Plot how often each category was actually purchased versus recommended.

    Two stacked bar charts are drawn over the same, alphabetically sorted
    category axis so they can be compared bar by bar; a category missing
    from one side is drawn with a count of 0.

    Args:
        recommendations: dict mapping customer ID to a list of recommended
            product categories.
        validation_data: Accepted for call compatibility; not read here.
        interaction_matrix: Accepted for call compatibility; not read here.
        data: DataFrame with 'Customer ID' and 'Purchase History' columns
            holding the complete purchase histories.

    Returns:
        None. The figure is shown immediately; when there is nothing to plot
        a message is printed instead.
    """
    # One lookup table (first row per customer) instead of a frame scan per user.
    first_rows = data.drop_duplicates(subset='Customer ID')
    history_by_user = dict(zip(first_rows['Customer ID'], first_rows['Purchase History']))

    # Collect actual purchased categories and recommended categories
    actual_categories = []
    recommended_categories = []
    for user_id, recommended in recommendations.items():
        # literal_eval parses the stored list without executing arbitrary code.
        actual_purchases = ast.literal_eval(history_by_user[user_id])
        actual_categories.extend(purchase['Product Category'] for purchase in actual_purchases)
        recommended_categories.extend(recommended)

    categories = sorted(set(actual_categories) | set(recommended_categories))
    if not categories:
        # A bar plot of an empty Series raises, so stop early.
        print("No purchase or recommendation data to visualize.")
        return

    # Calculate frequency on the shared category axis
    actual_counts = (
        pd.Series(actual_categories, dtype=object).value_counts().reindex(categories, fill_value=0)
    )
    recommended_counts = (
        pd.Series(recommended_categories, dtype=object).value_counts().reindex(categories, fill_value=0)
    )

    # Plot comparison chart
    _, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    # Actual purchase category distribution
    actual_counts.plot(kind='bar', color='skyblue', ax=ax1)
    ax1.set_title('Actual Purchase Distribution')
    ax1.set_ylabel('Frequency')

    # Recommended category distribution
    recommended_counts.plot(kind='bar', color='orange', ax=ax2)
    ax2.set_title('Recommended Distribution')
    ax2.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Perform visualization
# Both calls read `interaction_matrix`, `recommendations`, `validation_data` and
# `data` from module level, so the cells that define them must have run first.
visualize_interaction_matrix(interaction_matrix)
visualize_recommendation_accuracy(recommendations, validation_data, interaction_matrix, data)

Association Analysis part - Data preparation

In [None]:
# Read the dataset
# Loads the CSV into `df`; the raw string keeps the Windows backslashes from
# being interpreted as escape sequences.
df = pd.read_csv(r"C:\Users\Dell\Desktop\E-commerce04.csv")

# Define age grouping function
def get_age_group(age):
    """Map an age to one of three bucket labels.

    Ages below 30 get '<30岁', ages from 30 up to and including 50 get
    '30-50岁', and everything else gets '>50岁'.
    """
    # Guard clauses, checked from the youngest bucket upwards
    if age < 30:
        return '<30岁'
    if age <= 50:
        return '30-50岁'
    return '>50岁'

# Build the transaction dataset: one item list per customer, made of the
# customer's attribute values followed by the distinct purchased categories.
# NOTE(review): the items are bare values, so attributes whose tiers share a
# label (e.g. a 'Medium' income tier and a 'Medium' time-on-site tier) become
# the same item - consider prefixing each item with its attribute name; the
# rule filters that match on these strings would need the same change.
transactions = []
for idx, row in df.iterrows():
    # Extract user attributes
    gender = row['Gender']
    income_tier = row['Income Tier']
    region = row['Region']
    time_on_site_tier = row['Time on Site Tier']
    age_group = get_age_group(row['Age'])

    # Parse purchase history and remove duplicates
    purchase_cats = list({item['Product Category'] for item in json.loads(row['Purchase History'])})

    # Combine attribute items and purchase category items
    transaction = [
        gender, income_tier, region, time_on_site_tier, age_group,
        *purchase_cats  # Unfold purchase category items
    ]
    # Drop missing attributes and make sure every remaining item is a string
    transaction = [str(item) for item in transaction if pd.notna(item)]
    transactions.append(transaction)

# Generate a list of user IDs (assuming user IDs are unique)
user_ids = df['Customer ID'].tolist()

# Split the dataset (80% training, 20% validation)
train_users, val_users = train_test_split(user_ids, test_size=0.2, random_state=42)

# Filter transactions for training and validation sets.
# Membership is tested against sets: `u in some_list` scans the whole list for
# every customer, which makes the filtering quadratic.
train_user_set = set(train_users)
val_user_set = set(val_users)
train_transactions = [t for u, t in zip(user_ids, transactions) if u in train_user_set]
val_transactions = [t for u, t in zip(user_ids, transactions) if u in val_user_set]

Association Analysis part - Generating frequent itemsets and rules

In [None]:
# Initialize transaction encoder and one-hot encode the training transactions
te = TransactionEncoder()
te_ary = te.fit_transform(train_transactions)
train_df = pd.DataFrame(te_ary, columns=te.columns_)

# Generate frequent itemsets (minimum support set to 5%)
frequent_itemsets = apriori(
    train_df,
    min_support=0.05,
    use_colnames=True,
    max_len=5  # Limit itemset length (total items in antecedent and consequent)
)

# Calculate rule metrics (support, confidence, lift)
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5  # Minimum confidence 50%
)

# Items that describe the customer rather than a purchased category.
# 'Medium' is listed once although both tier attributes use that label.
attribute_items = {'Male', 'Female', '<30岁', '30-50岁', '>50岁',
                   'Low', 'Medium', 'High', 'Very High',
                   'Region 1', 'Region 2', 'Region 3', 'Region 4',
                   'Region 5', 'Region 6', 'Region 7', 'Region 8',
                   'Short', 'Long', 'Very Long'}

# Filter rules: consequent is a single purchase category, antecedent contains at least one attribute item.
# The consequent must also be disjoint from the attribute items; checking only
# its length would keep rules that "recommend" an attribute such as 'Female'.
valid_rules = rules[
    (rules['consequents'].apply(lambda x: len(x) == 1 and x.isdisjoint(attribute_items))) &
    (rules['antecedents'].apply(lambda x: not x.isdisjoint(attribute_items)))
    ].copy()

# Lift = confidence / support of the consequent, stored under the capitalised
# column name that the reporting and plotting code reads.
valid_rules['Lift'] = valid_rules['confidence'] / valid_rules['consequent support']

# Filter valid rules (lift > 1), most confident first
final_rules = valid_rules[valid_rules['Lift'] > 1].sort_values(by='confidence', ascending=False)

print(f"Total transactions (training set): {len(train_transactions)}")
print(f"Number of generated rules: {len(final_rules)}")
print("\nTop 5 effective rules:")
print(final_rules[['antecedents', 'consequents', 'Lift']].head())

# Group by core attributes (e.g., income tier) to reduce search space
# NOTE(review): this loop only builds the per-tier transaction list; no Apriori
# run is performed on it yet, so it currently has no effect on the results.
for tier in df['Income Tier'].unique():
    tier_transactions = [t for t in train_transactions if tier in t]
    # Apply Apriori algorithm separately for each group

Association Analysis part - Validation set operation

In [None]:
# Encode the validation set
# transform() is called on the existing encoder `te` (no re-fit), and the frame
# is labelled with te.columns_, i.e. the encoder's own item columns.
val_te_ary = te.transform(val_transactions)  # Use the encoder from the training set
val_df = pd.DataFrame(val_te_ary, columns=te.columns_)

# Calculate rule coverage on the validation set
def rule_coverage(rule, val_df):
    """Return the share of antecedent-matching rows that also contain a consequent item.

    `rule` must provide 'antecedents' and 'consequents' (iterables of column
    names of `val_df`); `val_df` is a boolean one-hot frame. Rows are kept
    when all antecedent columns are true, and the result is the mean over
    those rows of "any consequent column is true" (NaN when no row matches).
    """
    antecedent_columns = list(set(rule['antecedents']))
    consequent_columns = list(set(rule['consequents']))

    # Rows in which every antecedent item is present
    has_antecedent = val_df[antecedent_columns].all(axis=1)
    matching_rows = val_df.loc[has_antecedent, consequent_columns]

    return matching_rows.any(axis=1).mean()

# Evaluate rule_coverage for every row of final_rules against val_df and store
# the result in a new 'validation_coverage' column (axis=1 passes one rule row per call).
final_rules['validation_coverage'] = final_rules.apply(
    lambda x: rule_coverage(x, val_df), axis=1
)

# Calculate precision, recall, F1 score
def calculate_precision_recall_f1(test_df, rules):
    """Score rule-based recommendations against the purchases in `test_df`.

    For every user a rule fires when its whole antecedent is contained in the
    user's attribute items; the fired rules' consequents form the
    recommendation set R(u). The true set T(u) holds the categories in the
    user's purchase history. Counts are pooled over all users:
    precision = sum|R(u) & T(u)| / sum|R(u)| and
    recall = sum|R(u) & T(u)| / sum|T(u)|.

    Args:
        test_df: DataFrame with 'Gender', 'Income Tier', 'Region',
            'Time on Site Tier', 'Age' and a JSON 'Purchase History' column.
        rules: DataFrame with 'antecedents' and 'consequents' columns holding
            item collections; one item is taken from each consequent.

    Returns:
        dict with the keys 'Precision', 'Recall' and 'F1' (each 0 when its
        denominator is 0).
    """
    # Unpack the rules once; they do not change from user to user.
    rule_pairs = [
        (frozenset(antecedent), next(iter(consequent)))
        for antecedent, consequent in zip(rules['antecedents'], rules['consequents'])
    ]

    precision_numerator = 0  # sum of |R(u) & T(u)|
    precision_denominator = 0  # sum of |R(u)|
    recall_denominator = 0  # sum of |T(u)|

    for _, row in test_df.iterrows():
        # Extract user attributes (antecedent candidates). Missing values are
        # dropped and the rest converted to str, the same normalisation the
        # transactions get before the rules are mined.
        raw_attributes = (
            row['Gender'],
            row['Income Tier'],
            row['Region'],
            row['Time on Site Tier'],
            get_age_group(row['Age']),
        )
        user_attributes = {str(value) for value in raw_attributes if pd.notna(value)}

        # Parse true purchase categories T(u)
        T_u = {item['Product Category'] for item in json.loads(row['Purchase History'])}
        recall_denominator += len(T_u)

        # Generate recommendation list R(u): every rule whose antecedent is a subset of the user attributes
        R_u = {consequent for antecedent, consequent in rule_pairs if antecedent <= user_attributes}

        precision_denominator += len(R_u)
        precision_numerator += len(R_u & T_u)

    precision = precision_numerator / precision_denominator if precision_denominator != 0 else 0
    recall = precision_numerator / recall_denominator if recall_denominator != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return {
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

# Load the evaluation users from a separate CSV and score final_rules against them.
# NOTE(review): E-commerce06.csv is not written anywhere in the visible code -
# confirm it carries the columns calculate_precision_recall_f1 reads.
test_df = pd.read_csv(r"C:\Users\Dell\Desktop\E-commerce06.csv")
metrics = calculate_precision_recall_f1(test_df, final_rules)

# Report each metric with four decimal places
print("\nRecommendation Performance Metrics:")
print(f"Precision: {metrics['Precision']:.4f}")
print(f"Recall: {metrics['Recall']:.4f}")
print(f"F1 Score: {metrics['F1']:.4f}")

Association Analysis Part - Visualization

In [None]:
# 1. Association Rule Visualization (Support vs Confidence with Lift as Bubble Size)
# One point per rule in final_rules; the 'Lift' column drives the marker size.
# NOTE(review): `palette` is given without a `hue` variable, so it presumably
# has no effect on the colours - confirm against the installed seaborn version.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=final_rules,
    x='support', y='confidence',
    size='Lift', alpha=0.7,
    palette='viridis'
)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Association Rule Visualization (Support vs Confidence, Bubble Size Indicates Lift)')
plt.legend(title='Lift', bbox_to_anchor=(1, 1))
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()

# 2. Rule Consequent Distribution (Recommended Product Category Frequency)
# next(iter(x)) takes one item out of each consequent set; value_counts then
# gives the number of rules per item.
# NOTE(review): `palette` is again passed without `hue`; newer seaborn releases
# may emit a deprecation warning for this - confirm.
consequent_counts = final_rules['consequents'].apply(lambda x: next(iter(x))).value_counts()
plt.figure(figsize=(10, 6))
sns.barplot(x=consequent_counts.values, y=consequent_counts.index, palette='plasma')
plt.xlabel('Number of Rules')
plt.ylabel('Recommended Product Categories')
plt.title('Distribution of Recommended Product Categories')
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Display all charts (both figures created above are shown by this single call)
plt.show()