In [13]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [14]:
# Directory that holds the three input CSVs.
# Override it by setting the LOOKALIKE_DATA_DIR environment variable before
# starting the kernel; when unset, the original author's folder is used so the
# notebook keeps working unchanged on that machine.
DATA_DIR = Path(os.environ.get("LOOKALIKE_DATA_DIR", r"C:\Users\bhumu\Downloads"))

# Raw inputs, read as-is (no dtype or date parsing at load time).
customers_df = pd.read_csv(DATA_DIR / "Customers.csv")
products_df = pd.read_csv(DATA_DIR / "Products.csv")
transactions_df = pd.read_csv(DATA_DIR / "Transactions.csv")

In [15]:
# Enrich every transaction row with the attributes of the product it refers to.
# A left join is used, so all rows of transactions_df are retained.
transactions_products_df = pd.merge(
    transactions_df, products_df, how="left", on="ProductID"
)

# Add the purchasing customer's attributes the same way, then parse the two
# date columns into pandas datetimes so date arithmetic works downstream.
merged_df = pd.merge(
    transactions_products_df, customers_df, how="left", on="CustomerID"
).assign(
    TransactionDate=lambda frame: pd.to_datetime(frame["TransactionDate"]),
    SignupDate=lambda frame: pd.to_datetime(frame["SignupDate"]),
)

# Build one feature row per CustomerID from the merged transaction table in a
# single chain: aggregate -> add recency -> one-hot the region -> drop raw dates.
customer_features = (
    merged_df.groupby("CustomerID")
    .agg(
        total_spending=("TotalValue", "sum"),
        avg_spending=("TotalValue", "mean"),
        transaction_count=("TransactionID", "count"),
        unique_products=("ProductID", "nunique"),
        signup_region=("Region", "first"),
        first_transaction=("TransactionDate", "min"),
        last_transaction=("TransactionDate", "max"),
    )
    .reset_index()
    # Recency: whole days between the newest transaction in the entire data set
    # and this customer's own most recent transaction.
    .assign(
        recency_days=lambda per_customer: (
            merged_df["TransactionDate"].max() - per_customer["last_transaction"]
        ).dt.days
    )
    # Replace the single region column with one indicator column per region,
    # named region_<value>.
    .pipe(pd.get_dummies, columns=["signup_region"], prefix="region")
    # The raw timestamps were only needed to derive recency_days.
    .drop(columns=["first_transaction", "last_transaction"])
)

# Standardise the continuous features (zero mean, unit variance) so that no
# single column dominates the similarity purely because of its scale.
# Note: this overwrites the columns of customer_features in place.
numerical_cols = ["total_spending", "avg_spending", "transaction_count", "unique_products", "recency_days"]
scaler = StandardScaler()
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Extract feature matrix and CustomerID.
# The matrix is requested explicitly as float: the region_* indicator columns
# may be bool (the get_dummies default in recent pandas), and a frame mixing
# float and bool columns otherwise converts to an object-dtype array.
feature_matrix = customer_features.drop(columns=["CustomerID"]).to_numpy(dtype=float)
customer_ids = customer_features["CustomerID"].values

# Pairwise cosine similarity between customers: entry [i, j] compares the
# feature rows of customer_ids[i] and customer_ids[j].
similarity_matrix = cosine_similarity(feature_matrix)

# Find top 3 similar customers for CustomerIDs C0001–C0020.
# Targets are selected by ID rather than by row position: taking "the first 20
# rows" silently picks different customers whenever one of C0001–C0020 is not
# present in customer_ids, and raises IndexError when fewer than 20 rows exist.
target_ids = [f"C{n:04d}" for n in range(1, 21)]
row_of = {cust_id: row for row, cust_id in enumerate(customer_ids)}

lookalike_map = {}
for customer_id in target_ids:
    i = row_of.get(customer_id)
    if i is None:
        # Report the gap instead of substituting some other customer.
        print(f"Skipping {customer_id}: not present in the feature matrix")
        continue

    scores = np.asarray(similarity_matrix[i], dtype=float)
    # Descending by score; the stable sort keeps ties in original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude self, then keep at most the three best matches.
    top_rows = ranked[ranked != i][:3]

    # float(...) stores plain Python floats, so the saved text reads 0.9876
    # rather than the np.float64(0.9876) repr that NumPy 2 uses for scalars.
    lookalike_map[customer_id] = [
        (customer_ids[j], round(float(scores[j]), 4)) for j in top_rows
    ]

# Turn the {customer id -> lookalike list} mapping into a two-column table,
# one row per mapped customer, in the mapping's insertion order.
lookalike_df = pd.DataFrame(
    [dict(zip(("CustomerID", "Lookalikes"), entry)) for entry in lookalike_map.items()]
)

# Write the table to Lookalike.csv (relative to the working directory),
# without the positional index column.
lookalike_csv_path = "Lookalike.csv"
lookalike_df.to_csv(path_or_buf=lookalike_csv_path, index=False)

print(f"Lookalike.csv saved to {lookalike_csv_path}")


Lookalike.csv saved to Lookalike.csv
