In [1]:
#Import necessary libraries


import warnings
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Load the three source tables.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# it defaults to the folder the notebook was originally run against.
DATA_DIR = Path(r'C:\Users\bivin\OneDrive\Desktop\Zeotap')

customer = pd.read_csv(DATA_DIR / 'Customers.csv')
product = pd.read_csv(DATA_DIR / 'Products.csv')
transaction = pd.read_csv(DATA_DIR / 'Transactions.csv')

In [3]:
# Merge datasets for analysis: every transaction row is enriched with the
# matching product and customer attributes (left joins keep all transactions).
# Both `transaction` and `product` carry a Price column, so the result exposes
# them as Price_x (transaction side) and Price_y (product side).
# validate="many_to_one" makes pandas raise a MergeError if ProductID or
# CustomerID is duplicated in its lookup table, instead of silently
# duplicating transaction rows and inflating every per-customer total.
data = transaction.merge(product, on="ProductID", how="left", validate="many_to_one")
data = data.merge(customer, on="CustomerID", how="left", validate="many_to_one")
data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,459.86,Jacob Holt,South America,2022-01-22
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,459.86,Mrs. Kimberly Wright,North America,2024-04-07
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,459.86,Tyler Haynes,North America,2024-09-21
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,459.86,Joshua Hamilton,Asia,2024-11-11


Create customer-level transaction features

In [4]:
# One output column per entry: name -> (source column, aggregation).
customer_aggregations = {
    'total_spent': ('TotalValue', 'sum'),
    'total_transactions': ('TransactionID', 'nunique'),
    'total_products': ('ProductID', 'nunique'),
    'avg_transaction': ('TotalValue', 'mean'),
}

# Collapse the merged transaction table to one row per customer and turn
# CustomerID back into an ordinary column.
per_customer = data.groupby('CustomerID')
customer_features = per_customer.agg(**customer_aggregations).reset_index()

customer_features.head()

Unnamed: 0,CustomerID,total_spent,total_transactions,total_products,avg_transaction
0,C0001,3354.52,5,5,670.904
1,C0002,1862.74,4,4,465.685
2,C0003,2725.38,4,4,681.345
3,C0004,5354.88,8,8,669.36
4,C0005,2034.24,3,3,678.08


Add customer profile features (Region, SignupDate)

In [5]:
# Take an explicit copy so that columns assigned to customer_profile afterwards
# land on an independent frame rather than on a selection of `customer`
# (the situation pandas' SettingWithCopyWarning exists to flag).
customer_profile = customer[['CustomerID', 'Region', 'SignupDate']].copy()

In [6]:
# Parse SignupDate into datetimes, then derive signupDays: the number of whole
# days between the signup date and the moment this cell is executed.
customer_profile = customer_profile.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']),
    signupDays=lambda frame: (pd.to_datetime('today') - frame['SignupDate']).dt.days,
)
customer_profile

Unnamed: 0,CustomerID,Region,SignupDate,signupDays
0,C0001,South America,2022-07-10,935
1,C0002,Asia,2022-02-13,1082
2,C0003,South America,2024-03-07,329
3,C0004,South America,2022-10-09,844
4,C0005,Asia,2022-08-15,899
...,...,...,...,...
195,C0196,Europe,2022-06-07,968
196,C0197,Europe,2023-03-21,681
197,C0198,Europe,2022-02-27,1068
198,C0199,Europe,2022-12-03,789


In [7]:
# Attach the profile columns to the per-customer transaction features.
# A left join keeps exactly the customers present in customer_features.
customer_data = customer_features.merge(customer_profile, how='left', on='CustomerID')
customer_data

Unnamed: 0,CustomerID,total_spent,total_transactions,total_products,avg_transaction,Region,SignupDate,signupDays
0,C0001,3354.52,5,5,670.904000,South America,2022-07-10,935
1,C0002,1862.74,4,4,465.685000,Asia,2022-02-13,1082
2,C0003,2725.38,4,4,681.345000,South America,2024-03-07,329
3,C0004,5354.88,8,8,669.360000,South America,2022-10-09,844
4,C0005,2034.24,3,3,678.080000,Asia,2022-08-15,899
...,...,...,...,...,...,...,...,...
194,C0196,4982.88,4,3,1245.720000,Europe,2022-06-07,968
195,C0197,1928.65,3,3,642.883333,Europe,2023-03-21,681
196,C0198,931.83,2,2,465.915000,Europe,2022-02-27,1068
197,C0199,1979.28,4,4,494.820000,Europe,2022-12-03,789


Normalize numerical features

In [8]:
# Columns that feed the similarity computation.
numeric_features = [
    'total_spent',
    'total_transactions',
    'total_products',
    'avg_transaction',
    'signupDays',
]

# Standardize each numeric column and write the scaled values back into
# customer_data, overwriting the raw values.
sc = StandardScaler()
scaled_values = sc.fit_transform(customer_data[numeric_features])
customer_data[numeric_features] = scaled_values
customer_data

Unnamed: 0,CustomerID,total_spent,total_transactions,total_products,avg_transaction,Region,SignupDate,signupDays
0,C0001,-0.061701,-0.011458,0.050047,-0.070263,South America,2022-07-10,1.148752
1,C0002,-0.877744,-0.467494,-0.424204,-0.934933,Asia,2022-02-13,1.600431
2,C0003,-0.405857,-0.467494,-0.424204,-0.026271,South America,2024-03-07,-0.713270
3,C0004,1.032547,1.356650,1.472798,-0.076769,South America,2022-10-09,0.869141
4,C0005,-0.783929,-0.923530,-0.898455,-0.040028,Asia,2022-08-15,1.038137
...,...,...,...,...,...,...,...,...
194,C0196,0.829053,-0.467494,-0.898455,2.351666,Europe,2022-06-07,1.250149
195,C0197,-0.841689,-0.923530,-0.898455,-0.188326,Europe,2023-03-21,0.368300
196,C0198,-1.386975,-1.379566,-1.372705,-0.933964,Europe,2022-02-27,1.557414
197,C0199,-0.813993,-0.467494,-0.424204,-0.812176,Europe,2022-12-03,0.700146


Prepare the data for similarity calculation (index the feature table by CustomerID)

In [9]:
# Make CustomerID the row index of the feature table.
# Guarded so the cell can be re-run safely: once CustomerID has been moved to
# the index it is no longer a column, and an unconditional set_index would
# raise KeyError on the second execution.
if 'CustomerID' in customer_data.columns:
    customer_data = customer_data.set_index('CustomerID')

Calculate pairwise similarity between customers using cosine similarity on the standardized numeric features

In [10]:
# Pairwise cosine similarity between customers: row i / column j compares the
# numeric feature vectors of the i-th and j-th rows of customer_data.
feature_frame = customer_data[numeric_features]
similarity_matrix = cosine_similarity(feature_frame)

In [11]:
def get_top_similar(customer_index, similarity_matrix, top_n=3):
    """Return the ``top_n`` most similar other rows for one customer.

    Args:
        customer_index: Row position of the customer in ``similarity_matrix``.
        similarity_matrix: Square, row-indexable collection where
            ``similarity_matrix[i][j]`` is the similarity between rows i and j.
        top_n: Maximum number of neighbours to return (default 3).

    Returns:
        A list of ``(row_position, score)`` tuples sorted by score, highest
        first. The customer's own row is never included. Fewer than ``top_n``
        entries are returned when there are not enough other rows, and an
        empty list when ``top_n`` is not positive.
    """
    if top_n <= 0:
        return []

    # Drop the customer's own entry by position instead of assuming it sorts
    # first: a tie at the top score (or float rounding in the self-similarity)
    # can put another row ahead of it, which would otherwise return the
    # customer as their own lookalike and discard a genuine neighbour.
    candidates = [
        (other_index, score)
        for other_index, score in enumerate(similarity_matrix[customer_index])
        if other_index != customer_index
    ]

    # Stable sort: rows with equal scores keep their original relative order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:top_n]

In [12]:
# Set-up for the lookalike search over the first 20 customers:
# customer_ids translates a row position into its CustomerID, and
# lookalike_results will hold CustomerID -> list of top-3 matches.
lookalike_results = dict()
customer_ids = customer_features.loc[:, "CustomerID"]


In [13]:
# Fill lookalike_results for the first 20 rows of the feature table.
for row_pos in range(20):
    neighbours = get_top_similar(row_pos, similarity_matrix)

    # Translate each (row position, score) pair into an ID/score record,
    # rounding the score to four decimals.
    entries = []
    for other_pos, similarity in neighbours:
        entries.append({"cust_id": customer_ids[other_pos], "score": round(similarity, 4)})

    lookalike_results[customer_ids[row_pos]] = entries

In [14]:
# Display the lookalike results: each customer ID maps to a list of
# {"cust_id", "score"} entries describing its most similar customers.
lookalike_results

{'C0001': [{'cust_id': 'C0152', 'score': 0.9993},
  {'cust_id': 'C0160', 'score': 0.9647},
  {'cust_id': 'C0134', 'score': 0.9312}],
 'C0002': [{'cust_id': 'C0029', 'score': 0.9957},
  {'cust_id': 'C0192', 'score': 0.9803},
  {'cust_id': 'C0025', 'score': 0.9677}],
 'C0003': [{'cust_id': 'C0036', 'score': 0.9823},
  {'cust_id': 'C0177', 'score': 0.9759},
  {'cust_id': 'C0144', 'score': 0.9737}],
 'C0004': [{'cust_id': 'C0175', 'score': 0.998},
  {'cust_id': 'C0173', 'score': 0.9916},
  {'cust_id': 'C0108', 'score': 0.9855}],
 'C0005': [{'cust_id': 'C0073', 'score': 0.9998},
  {'cust_id': 'C0159', 'score': 0.9994},
  {'cust_id': 'C0112', 'score': 0.9974}],
 'C0006': [{'cust_id': 'C0066', 'score': 0.9631},
  {'cust_id': 'C0185', 'score': 0.9342},
  {'cust_id': 'C0044', 'score': 0.9113}],
 'C0007': [{'cust_id': 'C0193', 'score': 0.9947},
  {'cust_id': 'C0125', 'score': 0.9898},
  {'cust_id': 'C0176', 'score': 0.9762}],
 'C0008': [{'cust_id': 'C0090', 'score': 0.9907},
  {'cust_id': 'C0017

In [15]:
# Reshape the results into two parallel columns: the customer ID and the
# list of lookalike records belonging to it (dict order is preserved).
lookalike_map = {
    "cust_id": list(lookalike_results.keys()),
    "lookalikes": list(lookalike_results.values()),
}

# One row per customer; the "lookalikes" cell holds the whole list.
df_lookalike = pd.DataFrame(lookalike_map)

df_lookalike

Unnamed: 0,cust_id,lookalikes
0,C0001,"[{'cust_id': 'C0152', 'score': 0.9993}, {'cust..."
1,C0002,"[{'cust_id': 'C0029', 'score': 0.9957}, {'cust..."
2,C0003,"[{'cust_id': 'C0036', 'score': 0.9823}, {'cust..."
3,C0004,"[{'cust_id': 'C0175', 'score': 0.998}, {'cust_..."
4,C0005,"[{'cust_id': 'C0073', 'score': 0.9998}, {'cust..."
5,C0006,"[{'cust_id': 'C0066', 'score': 0.9631}, {'cust..."
6,C0007,"[{'cust_id': 'C0193', 'score': 0.9947}, {'cust..."
7,C0008,"[{'cust_id': 'C0090', 'score': 0.9907}, {'cust..."
8,C0009,"[{'cust_id': 'C0077', 'score': 0.9969}, {'cust..."
9,C0010,"[{'cust_id': 'C0083', 'score': 0.9944}, {'cust..."


In [16]:
# Write the lookalike table to a CSV in the working directory, leaving out
# the DataFrame's row index.
output_csv = 'Bivin_Bhaskar_Lookalike.csv'
df_lookalike.to_csv(output_csv, index=False)