In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [103]:
customers = pd.read_csv("/mnt/c/users/ankku/downloads/Customers.csv")
products = pd.read_csv("/mnt/c/users/ankku/downloads/products.csv")
transactions = pd.read_csv("/mnt/c/users/ankku/downloads/transactions.csv")

In [104]:
merged_df = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")
merged_df = merged_df.drop(columns=['Price_y']).rename(columns={'Price_x': 'Price'})
merged_df

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics
...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics


For seeking similarity between customers, I am using following factors:
1. <b>TotalValue</b> or <b>Revenue</b>: It can give similarity in terms of purchasing power of two different customers
2. <b>Quantity</b>: It is important to get insights about customers who have bought more items but of lesser prices.
3. <b>TransactionID</b>: It shows the characteristic of customers whether they tend to buy more items in a single day or on multiple display_svg
4. <b>Region</b>: Using Demographic Data
5. <b>Category</b>: What type of product, customers are buying

In [105]:
customer_profile = merged_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0]  # Most frequent product category
}).reset_index()
customer_profile

Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionID,Category
0,C0001,3354.52,12,5,Electronics
1,C0002,1862.74,10,4,Clothing
2,C0003,2725.38,14,4,Home Decor
3,C0004,5354.88,23,8,Books
4,C0005,2034.24,7,3,Electronics
...,...,...,...,...,...
194,C0196,4982.88,12,4,Home Decor
195,C0197,1928.65,9,3,Electronics
196,C0198,931.83,3,2,Clothing
197,C0199,1979.28,9,4,Electronics


In [106]:
encoded_region_df = pd.get_dummies(customers['Region'])
encoded_category_df = pd.get_dummies(customer_profile['Category'])
encoded_region_df = encoded_region_df.astype(int)
encoded_category_df = encoded_category_df.astype(int)
# encoded_category_df = encoded_category_df.replace({0.0: 0, 1.0: 1})
customer_profile = pd.concat([customer_profile, encoded_region_df, encoded_category_df], axis=1)
customer_profile = customer_profile.drop(['Category'], axis=1).dropna()
customer_profile

Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionID,Asia,Europe,North America,South America,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,12.0,5.0,0,0,0,1,0.0,0.0,1.0,0.0
1,C0002,1862.74,10.0,4.0,1,0,0,0,0.0,1.0,0.0,0.0
2,C0003,2725.38,14.0,4.0,0,0,0,1,0.0,0.0,0.0,1.0
3,C0004,5354.88,23.0,8.0,0,0,0,1,1.0,0.0,0.0,0.0
4,C0005,2034.24,7.0,3.0,1,0,0,0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,12.0,4.0,0,0,0,1,0.0,0.0,0.0,1.0
195,C0197,1928.65,9.0,3.0,0,1,0,0,0.0,0.0,1.0,0.0
196,C0198,931.83,3.0,2.0,0,1,0,0,0.0,1.0,0.0,0.0
197,C0199,1979.28,9.0,4.0,0,1,0,0,0.0,0.0,1.0,0.0


I have used Cosine Similarity to judge the Similarity between different Customers, which is helpful in easy interpretation

In [111]:
similarities = cosine_similarity(customer_profile.iloc[:20, 1:], customer_profile.iloc[:, 1:])
lookalike_dict = {}
for i, cust_id in enumerate(customers['CustomerID'][:20]):
    top_3_similar = np.argsort(similarities[i])[::-1][1:4]
    similar_customers = [(customers['CustomerID'][j], similarities[i][j]) for j in top_3_similar]
    lookalike_dict[cust_id] = tuple(similar_customers)
# lookalike_dict
lookalike_df = pd.DataFrame.from_dict(lookalike_dict, orient='index', columns=['Similar_Customer1_id_Score', 'Similar_Customer2_id_Score', 'Similar_Customer3_id_Score'])
lookalike_df

Unnamed: 0,Similar_Customer1_id_Score,Similar_Customer2_id_Score,Similar_Customer3_id_Score
C0001,"(C0102, 0.9999999527987777)","(C0120, 0.9999999483213668)","(C0012, 0.9999999184641456)"
C0002,"(C0134, 0.9999997517115803)","(C0034, 0.999999748270765)","(C0088, 0.9999997415653932)"
C0003,"(C0136, 0.9999998622632613)","(C0031, 0.9999998325155759)","(C0113, 0.9999997960570688)"
C0004,"(C0169, 0.9999999775304766)","(C0165, 0.9999999713088011)","(C0175, 0.999999956609213)"
C0005,"(C0146, 0.9999999319874456)","(C0007, 0.999999882727417)","(C0028, 0.9999998324578916)"
C0006,"(C0171, 0.9999999949479038)","(C0184, 0.9999999723630849)","(C0082, 0.9999999691163179)"
C0007,"(C0146, 0.9999999226903571)","(C0054, 0.9999999098700725)","(C0005, 0.999999882727417)"
C0008,"(C0047, 0.9999998888852705)","(C0098, 0.9999998696096729)","(C0038, 0.9999998531495534)"
C0009,"(C0197, 0.9999992701461256)","(C0103, 0.9999985041199906)","(C0172, 0.9999981485032108)"
C0010,"(C0111, 0.9999993430995275)","(C0091, 0.9999992851015111)","(C0134, 0.9999991169766563)"


In [112]:
lookalike_df["Customers with Similarity Score"] = lookalike_df[['Similar_Customer1_id_Score', 'Similar_Customer2_id_Score', 'Similar_Customer3_id_Score']].values.tolist()
lookalike_df = lookalike_df.drop(columns=['Similar_Customer1_id_Score', 'Similar_Customer2_id_Score', 'Similar_Customer3_id_Score'])
lookalike_df = lookalike_df.reset_index()
lookalike_df.columns=["Customer ID", "Customers with Similarity Score"]
lookalike_df

Unnamed: 0,Customer ID,Customers with Similarity Score
0,C0001,"[(C0102, 0.9999999527987777), (C0120, 0.999999..."
1,C0002,"[(C0134, 0.9999997517115803), (C0034, 0.999999..."
2,C0003,"[(C0136, 0.9999998622632613), (C0031, 0.999999..."
3,C0004,"[(C0169, 0.9999999775304766), (C0165, 0.999999..."
4,C0005,"[(C0146, 0.9999999319874456), (C0007, 0.999999..."
5,C0006,"[(C0171, 0.9999999949479038), (C0184, 0.999999..."
6,C0007,"[(C0146, 0.9999999226903571), (C0054, 0.999999..."
7,C0008,"[(C0047, 0.9999998888852705), (C0098, 0.999999..."
8,C0009,"[(C0197, 0.9999992701461256), (C0103, 0.999998..."
9,C0010,"[(C0111, 0.9999993430995275), (C0091, 0.999999..."


In [113]:
lookalike_df.to_csv("AnkKumar_Gupta_LookAlike.csv")