In [1]:
#libraries
import pandas as pd
import numpy as np

In [5]:
# Load datasets
df_customers = pd.read_csv("/kaggle/input/ecommerce-transactions-dataset1/Customers.csv")
df_products = pd.read_csv("/kaggle/input/ecommerce-transactions-dataset1/Products.csv")
df_transactions = pd.read_csv("/kaggle/input/ecommerce-transactions-dataset1/Transactions.csv")

# Merging Data from all 3 files

In [7]:
df_merged = df_transactions.merge(df_customers, on="CustomerID", how="inner").merge(df_products, on="ProductID", how="inner")

# Create aggregated customer profiles
customer_profiles = df_merged.groupby("CustomerID").agg({
    "ProductName": lambda x: " ".join(x),
    "Category": lambda x: " ".join(x),    
    "TotalValue": "sum"
}).reset_index()

# Normalize total transaction values for cosine similarity
customer_profiles["NormalizedValue"] = customer_profiles["TotalValue"] / customer_profiles["TotalValue"].max()

# Using cosine similarity to find similar data per each customer

In [8]:
# Create a matrix of normalized values for cosine similarity (via dot product)
customer_matrix = customer_profiles[["NormalizedValue"]].values
cosine_sim_matrix = customer_matrix @ customer_matrix.T 

# Get customer IDs
customer_ids = customer_profiles["CustomerID"].tolist()

# Find top 3 similar customers for each customer
lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    similarity_scores = [(customer_ids[j], score) for j, score in enumerate(cosine_sim_matrix[i])]
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    lookalike_map[customer_id] = sorted_scores[1:4]  # Excluding self

In [10]:
lookalike_df = pd.DataFrame({
    "CustomerID": lookalike_map.keys(),
    "Lookalikes": [str(value) for value in lookalike_map.values()]
})

lookalike_df.to_csv("Lookalike.csv", index=False)

In [18]:
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0054', 0.23673566027727252), ('C0065', 0.2..."
1,C0002,"[('C0054', 0.13145755095360484), ('C0065', 0.1..."
2,C0003,"[('C0054', 0.192335903141574), ('C0065', 0.183..."
3,C0004,"[('C0054', 0.3779053493511921), ('C0065', 0.36..."
4,C0005,"[('C0054', 0.14356067322968377), ('C0065', 0.1..."


In [21]:
lookalike_df.to_csv('/kaggle/working/Debug_Lookalike.csv', index=False)

In [23]:
#chech the output files for the csv of the resultant lookalike file