# Task 2 : Lookalike Model 🙂

# Import necessary libraries 📲

In [None]:
!pip install scikit-learn

In [42]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


# Load the datasets 🖨

In [43]:
# Load the datasets
#
# The data folder is named once so the notebook can be pointed at another
# location by editing a single line instead of three separate paths.
DATA_DIR = "C:/Users/HP/Desktop/ZeoTap/Data"

customers    =  pd.read_csv(f"{DATA_DIR}/Customers.csv")
products     =  pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions =  pd.read_csv(f"{DATA_DIR}/Transactions.csv")


# Understanding the Data 👨🏽‍💻

In [44]:
# Preview the first rows of the customers table (head() shows 5 rows by default).
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [45]:
# Preview the last rows of the customers table (tail() shows 5 rows by default).
customers.tail()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
195,C0196,Laura Watts,Europe,2022-06-07
196,C0197,Christina Harvey,Europe,2023-03-21
197,C0198,Rebecca Ray,Europe,2022-02-27
198,C0199,Andrea Jenkins,Europe,2022-12-03
199,C0200,Kelly Cross,Asia,2023-06-11


In [46]:
# (number of rows, number of columns) of the customers table.
customers.shape

(200, 4)

In [47]:
# Preview the first rows of the products table (head() shows 5 rows by default).
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [48]:
# Preview the last rows of the products table (tail() shows 5 rows by default).
products.tail()

Unnamed: 0,ProductID,ProductName,Category,Price
95,P096,SoundWave Headphones,Electronics,307.47
96,P097,BookWorld Cookbook,Books,319.34
97,P098,SoundWave Laptop,Electronics,299.93
98,P099,SoundWave Mystery Book,Books,354.29
99,P100,HomeSense Sweater,Clothing,126.34


In [49]:
# (number of rows, number of columns) of the products table.
products.shape

(100, 4)

In [50]:
# Preview the first rows of the transactions table (head() shows 5 rows by default).
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [51]:
# Preview the last rows of the transactions table (tail() shows 5 rows by default).
transactions.tail()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86
999,T00992,C0024,P037,2024-04-21 10:52:24,1,459.86,459.86


In [52]:
# (number of rows, number of columns) of the transactions table.
transactions.shape

(1000, 7)

# Steps for Lookalike Model 🤝

### Step 1: Merge datasets to enrich the transaction data

In [53]:

# Enrich every transaction with the buyer's profile and the product's details.
# Left joins keep every transaction row, matched or not.
#
# validate="many_to_one" makes pandas raise a MergeError if CustomerID or
# ProductID is duplicated in the lookup table on the right. Without the check a
# duplicated key would silently duplicate transaction rows and inflate the
# per-customer sums computed from merged_data.
#
# Both transactions and products carry a "Price" column, so the result holds
# Price_x (from transactions) and Price_y (from products).
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="left", validate="many_to_one")
    .merge(products, on="ProductID", how="left", validate="many_to_one")
)


In [54]:
# Preview the first rows of the merged table (head() shows 5 rows by default).
merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
merged_data.describe()

Unnamed: 0,Quantity,TotalValue,Price_x,Price_y
count,1000.0,1000.0,1000.0,1000.0
mean,2.537,689.99556,272.55407,272.55407
std,1.117981,493.144478,140.73639,140.73639
min,1.0,16.08,16.08,16.08
25%,2.0,295.295,147.95,147.95
50%,3.0,588.88,299.93,299.93
75%,4.0,1011.66,404.4,404.4
max,4.0,1991.04,497.76,497.76


### Step 2: Aggregate customer-product data

In [56]:
# Per (customer, category) pair: money spent and units bought.
# Note: "PurchaseFrequency" is the sum of Quantity (units purchased), not a
# count of transactions.
customer_features = (
    merged_data
    .groupby(["CustomerID", "Category"])
    .agg(
        TotalSpending=("TotalValue", "sum"),
        PurchaseFrequency=("Quantity", "sum"),
    )
    .reset_index()
)

# Reshape to one row per customer and one column per (statistic, category);
# combinations a customer never bought are filled with 0.
wide_features = customer_features.pivot_table(
    index="CustomerID",
    columns="Category",
    values=["TotalSpending", "PurchaseFrequency"],
    fill_value=0,
)

# Collapse the two-level column index into flat "<statistic>_<category>" labels.
wide_features.columns = [
    f"{statistic}_{category_name}"
    for statistic, category_name in wide_features.columns
]

# Move CustomerID out of the index so it becomes an ordinary column.
customer_feature_matrix = wide_features.reset_index()


### Step 3: Normalize the feature matrix

In [57]:
# Separate the identifier from the numeric features; both keep the row order
# of customer_feature_matrix, so row i of the features belongs to customer i.
customer_ids = customer_feature_matrix["CustomerID"]
feature_data = customer_feature_matrix.drop(columns=["CustomerID"])

# Standardise every feature column (zero mean, unit variance) so that the
# spending columns do not dominate the quantity columns by sheer magnitude.
scaler = StandardScaler()
scaler.fit(feature_data)
scaled_features = scaler.transform(feature_data)


### Step 4: Compute similarity scores using cosine similarity

In [58]:
# Pairwise cosine similarity between all rows of scaled_features.
# Entry [i, j] is the similarity between the customers at positions i and j of
# customer_ids (the rows of scaled_features follow that same order).
similarity_matrix = cosine_similarity(scaled_features)

### Step 5: Generate top 3 similar customers for the first 20 customers (C0001 - C0020)

In [59]:
TOP_N = 3

# The targets are customers C0001 .. C0020, identified by ID. Selecting them by
# ID (instead of taking the first 20 rows) stays correct even if one of them has
# no transactions and is therefore absent from the feature matrix.
TARGET_CUSTOMER_IDS = [f"C{number:04d}" for number in range(1, 21)]

# Row i of similarity_matrix belongs to id_array[i]. A plain array gives purely
# positional lookups that do not depend on the Series index labels.
id_array = customer_ids.to_numpy()

target_mask = customer_ids.isin(TARGET_CUSTOMER_IDS).to_numpy()
first_20_customers = customer_ids[target_mask]

# merged_data is built from transactions, so a customer without any transaction
# has no feature row and cannot be scored; report that instead of silently
# substituting another customer.
missing_targets = sorted(set(TARGET_CUSTOMER_IDS) - set(first_20_customers))
if missing_targets:
    print(f"No feature row for {missing_targets}; no lookalikes generated for them.")

lookalike_map = {}

for idx in np.flatnonzero(target_mask):
    customer = id_array[idx]

    # Similarity of this customer to every customer (including itself)
    customer_similarities = similarity_matrix[idx]

    # Rank positions from most to least similar. The stable sort keeps the
    # original row order among exactly equal scores.
    ranked_positions = np.argsort(-customer_similarities, kind="stable")

    # Drop the customer's own row, then keep the TOP_N best matches
    top_positions = ranked_positions[ranked_positions != idx][:TOP_N]

    # Store (similar customer ID, similarity score) pairs, best match first
    lookalike_map[customer] = [
        (id_array[j], customer_similarities[j]) for j in top_positions
    ]


### Step 6: Save the lookalike map to a CSV file

In [60]:
# Flatten the map into one record per (customer, lookalike) pair, keeping the
# best-first order stored in lookalike_map.
lookalike_data = [
    {
        "CustomerID": source_id,
        "SimilarCustomerID": match_id,
        "SimilarityScore": match_score,
    }
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

# Write the records to CSV without the DataFrame's positional index.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Abhishek_Verma_Lookalike.csv", index=False)

# Final output
print("Lookalike Model complete. Results saved to 'Abhishek_Verma_Lookalike.csv'.")

Lookalike Model complete. Results saved to 'Abhishek_Verma_Lookalike.csv'.


## Examples of Output Validation Based on Output CSV File.

### For CustomerID = C0001, the top 3 similar customers are:

### Conclusion - According to the similarity scores in the output CSV, C0069 is the most similar customer to C0001, followed by C0120 and C0026.

### For CustomerID = C0020, the top 3 similar customers are:

### Conclusion - According to the similarity scores in the output CSV, C0130 is the most similar customer to C0020, followed by C0140 and C0009.

# Thank You !