In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Folder containing the three input CSVs. Set the ZEOTAP_DATA_DIR environment
# variable to read from another location; when it is unset, the original
# hard-coded local folder is used, so existing behaviour is unchanged.
import os

DATA_DIR = os.environ.get("ZEOTAP_DATA_DIR", r"C:\Users\Aditi\Downloads\zeotap")

customers_path = os.path.join(DATA_DIR, "Customers.csv")
products_path = os.path.join(DATA_DIR, "Products.csv")
transactions_path = os.path.join(DATA_DIR, "Transactions.csv")

# Load each CSV into its own DataFrame.
customers = pd.read_csv(customers_path)
products = pd.read_csv(products_path)
transactions = pd.read_csv(transactions_path)

In [3]:
# Enrich every transaction with its customer's columns, then with its
# product's columns. Both joins rely on pandas' default merge behaviour.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID")
merged = pd.merge(transactions_with_customers, products, on="ProductID")

In [4]:
# Collapse the transaction-level table to one row per customer:
#   TotalValue -> summed spend
#   Quantity   -> summed units purchased
#   Category   -> list of the category of every purchased item (repeats kept)
per_customer = merged.groupby("CustomerID")
customer_features = per_customer.agg(
    TotalValue=("TotalValue", "sum"),
    Quantity=("Quantity", "sum"),
    Category=("Category", list),
).reset_index()

In [5]:
# Standardise the two numeric features with StandardScaler and write the
# scaled values back over the original columns.
numeric_columns = ["TotalValue", "Quantity"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_columns])
customer_features[numeric_columns] = scaled_values

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each customer's list of purchased categories into indicator columns:
# one 0/1 column per distinct category found across all customers.
mlb = MultiLabelBinarizer()
category_encoded = mlb.fit_transform(customer_features["Category"])

# Wrap the encoded array in a DataFrame that reuses customer_features' index.
# The frames are later combined column-wise, which aligns rows by index label;
# sharing the index keeps every encoded row attached to the right customer even
# if customer_features does not carry a default 0..n-1 index.
category_encoded_df = pd.DataFrame(
    category_encoded,
    columns=mlb.classes_,
    index=customer_features.index,
)

# category_encoded_df is combined with customer_features in the following cell.

In [8]:
# Replace the raw "Category" list column with its encoded indicator columns.
features_without_category = customer_features.drop(columns=["Category"])
customer_features = pd.concat(
    [features_without_category, category_encoded_df],
    axis=1,
)

In [9]:
# Pairwise cosine similarity between customers, computed over every column
# except the CustomerID identifier. Row/column i corresponds to row i of
# customer_features.
feature_matrix = customer_features.drop(columns=["CustomerID"])
similarity_matrix = cosine_similarity(feature_matrix)

In [10]:
# Build {customer_id: [(lookalike_id, similarity), ...]} holding each customer's
# TOP_N most similar *other* customers, best match first.
TOP_N = 3  # number of lookalikes kept per customer

customer_ids = customer_features["CustomerID"].to_numpy()
lookalikes = {}
for i, customer_id in enumerate(customer_ids):
    # Similarity of the current customer to every customer (including itself).
    similarities = similarity_matrix[i]
    # Rank all customers from most to least similar.
    ranked = similarities.argsort()[::-1]
    # Remove the customer's own row by position instead of assuming it sorts
    # last: with tied scores or floating-point rounding another customer can
    # score as high as (or marginally above) the self-similarity, in which case
    # slicing off only the final element would keep the customer itself as a
    # lookalike and discard a genuine match.
    top_indices = ranked[ranked != i][:TOP_N]
    top_customers = customer_ids[top_indices]
    top_scores = similarities[top_indices]
    lookalikes[customer_id] = list(zip(top_customers, top_scores))

In [11]:
# Write one row per customer to Lookalike.csv: the customer ID and its list of
# (lookalike ID, similarity score) pairs.
# Scores are converted to built-in floats first. The list cell is serialised
# through its repr, and under NumPy >= 2 a NumPy scalar's repr is
# "np.float64(...)", which would otherwise leak into the CSV text.
lookalike_df = pd.DataFrame([
    {
        "cust_id": cust_id,
        "lookalikes": [(other_id, float(score)) for other_id, score in lookalike],
    }
    for cust_id, lookalike in lookalikes.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

In [12]:
# Show the lookalikes for whichever of the first 20 customers listed in the
# customers table have an entry in `lookalikes`.
first_20_ids = set(customers["CustomerID"].iloc[:20])
first_20 = {
    cust_id: matches
    for cust_id, matches in lookalikes.items()
    if cust_id in first_20_ids
}
print(first_20)

{'C0001': [('C0127', 0.9950160325267876), ('C0174', 0.9896788656787558), ('C0047', 0.9843295829543113)], 'C0002': [('C0062', 0.9860668430423023), ('C0144', 0.9821131770618999), ('C0159', 0.9811638438293809)], 'C0003': [('C0106', 0.9937856712043361), ('C0166', 0.9597809118655212), ('C0026', 0.9507447219675769)], 'C0004': [('C0012', 0.9762028276156669), ('C0018', 0.9581532942889139), ('C0148', 0.956398703536766)], 'C0005': [('C0140', 0.9914576584599086), ('C0007', 0.9870715284016797), ('C0199', 0.9859994968064968)], 'C0006': [('C0079', 0.9999745422306183), ('C0187', 0.9902701822379437), ('C0139', 0.9828509901211641)], 'C0007': [('C0005', 0.9870715284016797), ('C0069', 0.9793519335053376), ('C0199', 0.977666297563826)], 'C0008': [('C0169', 0.99827404983291), ('C0017', 0.993706181927608), ('C0162', 0.9931181083625585)], 'C0009': [('C0198', 0.9999798466237244), ('C0020', 0.9198842076966726), ('C0130', 0.9192111798461216)], 'C0010': [('C0176', 0.9837306689747543), ('C0142', 0.982860060068783