# **Data Science Assignment: eCommerce Transactions Dataset**

# **Task 2: Lookalike Model**

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt


In [None]:
# Read the three source tables (paths are relative to the working directory).
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)


In [None]:
# Build the flat table for Task 2 (Lookalike Model): each transaction row is
# enriched with the buyer's customer columns and the product's columns.
# Both joins use pandas' default inner join. `Price` is present in both the
# transactions and products tables, so the second join emits Price_x / Price_y.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')  # attach customer attributes
    .merge(products, on='ProductID')    # attach product attributes
)

# Peek at the first rows of the merged table
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [None]:
# Step 2: Preprocess customer profiles
# One row per customer: total spend, total units bought, and the list of
# product IDs appearing in that customer's transactions.
customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        ProductID=('ProductID', list),  # products purchased
    )
    .reset_index()
)

# Preview the per-customer profiles
customer_profiles.head()


Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID
0,C0001,3354.52,12,"[P054, P022, P096, P083, P029]"
1,C0002,1862.74,10,"[P095, P004, P019, P071]"
2,C0003,2725.38,14,"[P025, P006, P035, P002]"
3,C0004,5354.88,23,"[P049, P053, P038, P025, P097, P024, P008, P077]"
4,C0005,2034.24,7,"[P025, P039, P012]"


In [None]:
# Step 3: One-hot encoding for ProductID
# Each transaction row is expanded into indicator columns named ProductID_<id>;
# summing them per customer gives a customer x product matrix whose cells hold
# the number of transaction rows in which that customer bought that product.
product_dummies = pd.get_dummies(
    merged_data.loc[:, ['CustomerID', 'ProductID']],
    columns=['ProductID'],
)
customer_encoded = product_dummies.groupby(by='CustomerID').sum()

# Preview the encoded customer matrix
customer_encoded.head()


Unnamed: 0_level_0,ProductID_P001,ProductID_P002,ProductID_P003,ProductID_P004,ProductID_P005,ProductID_P006,ProductID_P007,ProductID_P008,ProductID_P009,ProductID_P010,...,ProductID_P091,ProductID_P092,ProductID_P093,ProductID_P094,ProductID_P095,ProductID_P096,ProductID_P097,ProductID_P098,ProductID_P099,ProductID_P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
C0002,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
C0003,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0004,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
C0005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Step 4: Calculate cosine similarity
# `cosine_similarity` comes from the notebook's import cell at the top; it is
# not re-imported here so that all dependencies stay declared in one place.
# Rows of `customer_encoded` are customers, so the result is a square
# customer x customer matrix.
similarity_matrix = cosine_similarity(customer_encoded)

# Label both axes with CustomerID so similarities can be looked up by ID
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_encoded.index,
    columns=customer_encoded.index,
)

# Display the similarity matrix
similarity_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.0,0.0,0.0,0.0,0.0,0.258199,0.0,0.0,0.0,...,0.2,0.0,0.0,0.338062,0.0,0.0,0.0,0.0,0.223607,0.0
C0002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0003,0.0,0.0,1.0,0.176777,0.288675,0.0,0.0,0.158114,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0004,0.0,0.0,0.176777,1.0,0.204124,0.0,0.0,0.111803,0.0,0.176777,...,0.0,0.0,0.0,0.133631,0.0,0.0,0.0,0.0,0.176777,0.0
C0005,0.0,0.0,0.288675,0.204124,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.288675,0.0,0.218218,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Step 5: Generate lookalikes
# For each of the first 20 customers, store a pair
# ([top-3 lookalike CustomerIDs], [their cosine-similarity scores]).
lookalikes = {}
for cust_id in customer_profiles['CustomerID'][:20]:
    # Remove the customer's own entry by label rather than by position: after
    # sorting, "self" is not guaranteed to be first when another customer has
    # a similarity of 1.0 (or ties with it after floating-point rounding), so
    # slicing [1:4] could drop a real match and keep the customer itself.
    # nlargest returns the scores in descending order and keeps the first
    # occurrence on ties, which makes the selection deterministic.
    similar_customers = similarity_df[cust_id].drop(labels=cust_id).nlargest(3)
    lookalikes[cust_id] = list(similar_customers.index), list(similar_customers.values)

# Display the lookalikes for each customer
for cust_id, (similar_ids, scores) in lookalikes.items():
    print(f"Customer {cust_id} - Lookalikes: {similar_ids}, Scores: {scores}")


Customer C0001 - Lookalikes: ['C0097', 'C0020', 'C0190'], Scores: [0.4472135954999579, 0.4472135954999579, 0.39999999999999997]
Customer C0002 - Lookalikes: ['C0109', 'C0008', 'C0071'], Scores: [0.41602514716892186, 0.31622776601683794, 0.2886751345948129]
Customer C0003 - Lookalikes: ['C0134', 'C0181', 'C0144'], Scores: [0.4472135954999579, 0.4082482904638631, 0.35355339059327373]
Customer C0004 - Lookalikes: ['C0070', 'C0065', 'C0182'], Scores: [0.35355339059327373, 0.33541019662496846, 0.3162277660168379]
Customer C0005 - Lookalikes: ['C0096', 'C0003', 'C0168'], Scores: [0.5163977794943223, 0.2886751345948129, 0.2886751345948129]
Customer C0006 - Lookalikes: ['C0058', 'C0171', 'C0040'], Scores: [0.5, 0.4472135954999579, 0.4472135954999579]
Customer C0007 - Lookalikes: ['C0020', 'C0140', 'C0112'], Scores: [0.5773502691896258, 0.408248290463863, 0.3333333333333334]
Customer C0008 - Lookalikes: ['C0091', 'C0143', 'C0002'], Scores: [0.33541019662496846, 0.31622776601683794, 0.3162277660

In [None]:
# Step 6: Prepare and Save Lookalike.csv
# One row per customer; `Lookalikes` and `Scores` each hold a list, taken from
# the (ids, scores) pair stored for that customer in `lookalikes`.
lookalikes_df = pd.DataFrame(
    [
        {"CustomerID": customer_id, "Lookalikes": match_ids, "Scores": match_scores}
        for customer_id, (match_ids, match_scores) in lookalikes.items()
    ],
    columns=["CustomerID", "Lookalikes", "Scores"],
)

# Persist the table without the positional index column
lookalikes_df.to_csv("Lookalike.csv", index=False)

# Preview the saved table
lookalikes_df.head()


Unnamed: 0,CustomerID,Lookalikes,Scores
0,C0001,"[C0097, C0020, C0190]","[0.4472135954999579, 0.4472135954999579, 0.399..."
1,C0002,"[C0109, C0008, C0071]","[0.41602514716892186, 0.31622776601683794, 0.2..."
2,C0003,"[C0134, C0181, C0144]","[0.4472135954999579, 0.4082482904638631, 0.353..."
3,C0004,"[C0070, C0065, C0182]","[0.35355339059327373, 0.33541019662496846, 0.3..."
4,C0005,"[C0096, C0003, C0168]","[0.5163977794943223, 0.2886751345948129, 0.288..."
