# Lookalike Model

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")


In [12]:
customers

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15
...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07
196,C0197,Christina Harvey,Europe,2023-03-21
197,C0198,Rebecca Ray,Europe,2022-02-27
198,C0199,Andrea Jenkins,Europe,2022-12-03


In [5]:
transactions

Unnamed: 0_level_0,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68
...,...,...,...,...,...,...
T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86
T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86
T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86
T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86


In [6]:
products

Unnamed: 0_level_0,ProductName,Category,Price
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P001,ActiveWear Biography,Books,169.30
P002,ActiveWear Smartwatch,Electronics,346.30
P003,ComfortLiving Biography,Books,44.12
P004,BookWorld Rug,Home Decor,95.69
P005,TechPro T-Shirt,Clothing,429.31
...,...,...,...
P096,SoundWave Headphones,Electronics,307.47
P097,BookWorld Cookbook,Books,319.34
P098,SoundWave Laptop,Electronics,299.93
P099,SoundWave Mystery Book,Books,354.29


In [13]:
merged = transactions.merge(products, on="ProductID")
merged

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,459.86


### Aggregate transaction data per customer

In [20]:
customer_txn = merged.groupby("CustomerID").agg(
    total_spend=("TotalValue", "sum"),
    total_orders=("TransactionID", "count"),
    unique_products=("ProductID", "nunique")
).reset_index()

In [21]:
customer_txn

Unnamed: 0,CustomerID,total_spend,total_orders,unique_products
0,C0001,3354.52,5,5
1,C0002,1862.74,4,4
2,C0003,2725.38,4,4
3,C0004,5354.88,8,8
4,C0005,2034.24,3,3
...,...,...,...,...
194,C0196,4982.88,4,3
195,C0197,1928.65,3,3
196,C0198,931.83,2,2
197,C0199,1979.28,4,4


### Merge with customer profile data

In [34]:
customer_data = customers.merge(customer_txn, on="CustomerID", how="left").fillna(0)
customer_data

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,total_spend,total_orders,unique_products
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,5.0,5.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,4.0,4.0
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,4.0,4.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,8.0,8.0
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,3.0,3.0
...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,4982.88,4.0,3.0
196,C0197,Christina Harvey,Europe,2023-03-21,1928.65,3.0,3.0
197,C0198,Rebecca Ray,Europe,2022-02-27,931.83,2.0,2.0
198,C0199,Andrea Jenkins,Europe,2022-12-03,1979.28,4.0,4.0


In [44]:
encoder = OneHotEncoder(sparse_output=False, drop="first")
region_encoded = encoder.fit_transform(customer_data[["Region"]])
region_encoded

array([[0., 0., 1.],
       [0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0

In [46]:
scaler = StandardScaler()
numeric_features = scaler.fit_transform(customer_data[["total_spend", "total_orders", "unique_products"]])

feature_matrix = pd.DataFrame(
    data = np.hstack((region_encoded, numeric_features)),
    index = customer_data["CustomerID"]
)
feature_matrix

Unnamed: 0_level_0,0,1,2,3,4,5
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C0001,0.0,0.0,1.0,-0.051884,0.000000,0.060991
C0002,0.0,0.0,0.0,-0.862714,-0.451294,-0.408172
C0003,0.0,0.0,1.0,-0.393842,-0.451294,-0.408172
C0004,0.0,0.0,1.0,1.035375,1.353881,1.468481
C0005,0.0,0.0,0.0,-0.769499,-0.902587,-0.877335
...,...,...,...,...,...,...
C0196,1.0,0.0,0.0,0.833181,-0.451294,-0.877335
C0197,1.0,0.0,0.0,-0.826890,-0.902587,-0.877335
C0198,1.0,0.0,0.0,-1.368694,-1.353881,-1.346498
C0199,1.0,0.0,0.0,-0.799371,-0.451294,-0.408172


In [49]:
similarity_matrix = cosine_similarity(feature_matrix)

In [70]:
lookalike_results = {}

In [71]:
for idx, cust_id in enumerate(customer_data["CustomerID"][:20]):
    similarity_scores = similarity_matrix[idx]
    similar_indices = similarity_scores.argsort()[::-1] 
    similar_indices = [i for i in similar_indices if i != idx][:3] 
    similar_customers = [(customer_data.iloc[i]["CustomerID"], similarity_scores[i]) for i in similar_indices]
    lookalike_results[cust_id] = similar_customers

In [72]:
lookalike_results

{'C0001': [('C0200', 1.0000000000000002),
  ('C0199', 0.9999294396666163),
  ('C0198', 0.9998560046975373)],
 'C0002': [('C0142', 0.9913138596134357),
  ('C0043', 0.9852383954803484),
  ('C0177', 0.9669111144073871)],
 'C0003': [('C0133', 0.9977086577126446),
  ('C0192', 0.9702819492424226),
  ('C0031', 0.9647561768614069)],
 'C0004': [('C0108', 0.9873711250794179),
  ('C0113', 0.9834220576647593),
  ('C0155', 0.9805518646086689)],
 'C0005': [('C0159', 0.999926961817998),
  ('C0123', 0.9997414927992679),
  ('C0180', 0.9996425484594529)],
 'C0006': [('C0158', 0.97516012423523),
  ('C0168', 0.9572035174790451),
  ('C0187', 0.8426412295126363)],
 'C0007': [('C0193', 0.9993159097916283),
  ('C0140', 0.9982090071917907),
  ('C0080', 0.9928159382591879)],
 'C0008': [('C0109', 0.9836626828753862),
  ('C0139', 0.9834677483898591),
  ('C0098', 0.9658486043752895)],
 'C0009': [('C0198', 0.9778991954168432),
  ('C0197', 0.972020252615546),
  ('C0060', 0.964215498296618)],
 'C0010': [('C0199', 0.9

In [67]:
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient="index")

In [69]:
lookalike_df.head(20)

Unnamed: 0,0,1,2
C0001,"(C0200, 1.0000000000000002)","(C0199, 0.9999294396666163)","(C0198, 0.9998560046975373)"
C0002,"(C0142, 0.9913138596134357)","(C0043, 0.9852383954803484)","(C0177, 0.9669111144073871)"
C0003,"(C0133, 0.9977086577126446)","(C0192, 0.9702819492424226)","(C0031, 0.9647561768614069)"
C0004,"(C0108, 0.9873711250794179)","(C0113, 0.9834220576647593)","(C0155, 0.9805518646086689)"
C0005,"(C0159, 0.999926961817998)","(C0123, 0.9997414927992679)","(C0180, 0.9996425484594529)"
C0006,"(C0158, 0.97516012423523)","(C0168, 0.9572035174790451)","(C0187, 0.8426412295126363)"
C0007,"(C0193, 0.9993159097916283)","(C0140, 0.9982090071917907)","(C0080, 0.9928159382591879)"
C0008,"(C0109, 0.9836626828753862)","(C0139, 0.9834677483898591)","(C0098, 0.9658486043752895)"
C0009,"(C0198, 0.9778991954168432)","(C0197, 0.972020252615546)","(C0060, 0.964215498296618)"
C0010,"(C0199, 0.9969377387108523)","(C0121, 0.9843097775563725)","(C0166, 0.9746130222683238)"
