# Task 2: Lookalike Model
Build a Lookalike Model that takes a user's information as input and recommends 3 similar customers based on their profile and transaction history. The model should:

● Use both customer and product information.

● Assign a similarity score to each recommended customer.

# Deliverables:

● Give the top 3 lookalikes with their similarity scores for the first 20 customers (CustomerID: C0001 - C0020) in Customers.csv. Form a “Lookalike.csv” which has just one map: Map<cust_id, List<cust_id, score>>

# Evaluation Criteria:

● Model accuracy and logic.

● Quality of recommendations and similarity scores.

In [11]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to any figures drawn in this notebook
sns.set(style="whitegrid")

# Read the three source tables (customers, products, transactions), in that order
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'/content/Customers.csv',
        r'/content/Products.csv',
        r'/content/Transactions.csv',
    )
)


In [12]:
# Enrich every transaction row with its customer's attributes first ...
transactions_with_customers = transactions.merge(customers, on='CustomerID', how='left')
# ... then with the purchased product's attributes. Both joins are left joins,
# so every transaction row is kept even when a lookup key has no match.
data = transactions_with_customers.merge(products, on='ProductID', how='left')


In [28]:
# Preview the first five rows of the merged transaction/customer/product frame.
# NOTE(review): the recorded output shows a single 'Price' column, which only
# exists once the price-cleanup cell has run; on a fresh top-to-bottom run this
# preview would show 'Price_x' / 'Price_y' instead.
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,Price
0,T00001,C0199,P067,25-08-2024 12:38,1,300.68,Andrea Jenkins,Europe,03-12-2022,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,27-05-2024 22:23,1,300.68,Brittany Harvey,Asia,04-09-2024,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,25-04-2024 07:38,1,300.68,Kathryn Stevens,Europe,04-04-2024,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,26-03-2024 22:55,2,601.36,Travis Campbell,South America,11-04-2024,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,21-03-2024 15:10,3,902.04,Timothy Perez,Europe,15-03-2022,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [13]:
# The merge leaves two price columns: 'Price_x' (left / transaction side) and
# 'Price_y' (right / product side). Keep the transaction-side value under the
# plain name 'Price' and discard both suffixed columns.
#
# The guard and errors='ignore' make this cell safe to re-run: once the
# suffixed columns are gone, a second execution is a no-op instead of a KeyError.
if 'Price_x' in data.columns:
    data['Price'] = data['Price_x']  # Retain Price_x
data = data.drop(columns=['Price_x', 'Price_y'], errors='ignore')  # Drop duplicate columns

# Display the updated columns
print("Updated Columns:", data.columns)

# Display the first 5 rows of the cleaned dataset
print(data.head())

Updated Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Price'],
      dtype='object')
  TransactionID CustomerID ProductID   TransactionDate  Quantity  TotalValue  \
0        T00001      C0199      P067  25-08-2024 12:38         1      300.68   
1        T00112      C0146      P067  27-05-2024 22:23         1      300.68   
2        T00166      C0127      P067  25-04-2024 07:38         1      300.68   
3        T00272      C0087      P067  26-03-2024 22:55         2      601.36   
4        T00363      C0070      P067  21-03-2024 15:10         3      902.04   

      CustomerName         Region  SignupDate  \
0   Andrea Jenkins         Europe  03-12-2022   
1  Brittany Harvey           Asia  04-09-2024   
2  Kathryn Stevens         Europe  04-04-2024   
3  Travis Campbell  South America  11-04-2024   
4    Timothy Perez         Europe  15-03-2022   

In [14]:
# List the columns of `data` to confirm the schema used for feature engineering.
print(data.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Price'],
      dtype='object')


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Collapse the transaction-level frame to one row per customer:
#   TotalValue - total amount spent
#   Quantity   - total units bought
#   Category   - comma-joined string of the distinct categories purchased
#   Region     - the customer's region (taken from the first row of the group)
customer_profile = (
    data.groupby('CustomerID')
        .agg(
            TotalValue=('TotalValue', 'sum'),
            Quantity=('Quantity', 'sum'),
            Category=('Category', lambda cats: ','.join(cats.unique())),
            Region=('Region', 'first'),
        )
        .reset_index()
)


In [21]:
# Display the per-customer aggregate table (one row per CustomerID).
customer_profile

Unnamed: 0,CustomerID,TotalValue,Quantity,Category,Region
0,C0001,3354.52,12,"Books,Home Decor,Electronics",South America
1,C0002,1862.74,10,"Home Decor,Clothing",Asia
2,C0003,2725.38,14,"Home Decor,Clothing,Electronics",South America
3,C0004,5354.88,23,"Books,Home Decor,Electronics",South America
4,C0005,2034.24,7,"Home Decor,Electronics",Asia
...,...,...,...,...,...
194,C0196,4982.88,12,"Books,Clothing,Home Decor",Europe
195,C0197,1928.65,9,"Home Decor,Electronics",Europe
196,C0198,931.83,3,"Electronics,Clothing",Europe
197,C0199,1979.28,9,"Electronics,Home Decor",Europe


In [22]:
# --- Categorical encoding -------------------------------------------------
# One-hot encode Region directly on the profile table.
region_encoded = pd.get_dummies(customer_profile, columns=['Region'], prefix='Region')

# Category holds a comma-joined string per customer: explode it to one value
# per row (indexed by the original row), one-hot encode those values, then
# fold them back to one indicator row per customer.
category_long = region_encoded['Category'].str.split(',').apply(pd.Series).stack()
category_flags = pd.get_dummies(category_long).groupby(level=0).sum()

encoded_data = pd.concat([region_encoded, category_flags], axis=1)
encoded_data.drop('Category', axis=1, inplace=True)

# --- Numeric scaling ------------------------------------------------------
# Standardise the spend/volume columns so their raw magnitudes do not swamp
# the 0/1 indicator columns in the similarity computation.
numerical_features = ['TotalValue', 'Quantity']
scaler = StandardScaler()
encoded_data[numerical_features] = scaler.fit_transform(encoded_data[numerical_features])

# --- Similarity -----------------------------------------------------------
# Pairwise cosine similarity over every feature column (the ID is not a feature).
feature_matrix = encoded_data.drop('CustomerID', axis=1)
similarity_matrix = cosine_similarity(feature_matrix)


In [23]:
# Show the pairwise cosine-similarity matrix: entry [i, j] compares rows i and j
# of encoded_data, so the diagonal is each customer's similarity to itself.
similarity_matrix

array([[1.        , 0.27756473, 0.72971015, ..., 0.23386306, 0.52759751,
        0.60001707],
       [0.27756473, 1.        , 0.55416148, ..., 0.53860278, 0.49662974,
        0.44722717],
       [0.72971015, 0.55416148, 1.        , ..., 0.39999446, 0.53535157,
        0.5722116 ],
       ...,
       [0.23386306, 0.53860278, 0.39999446, ..., 1.        , 0.74776112,
        0.02718021],
       [0.52759751, 0.49662974, 0.53535157, ..., 0.74776112, 1.        ,
        0.22801657],
       [0.60001707, 0.44722717, 0.5722116 , ..., 0.02718021, 0.22801657,
        1.        ]])

In [15]:
# Build the top-N lookalike list for the first 20 customers and export it.
# NOTE: "first 20" means the first 20 rows of encoded_data. That table is derived
# from transactions, so a customer with no transactions is absent from it; the
# rows equal C0001-C0020 only when each of those customers has a transaction.
top_n = 3
n_targets = 20
recommendations = {}
customer_ids = encoded_data['CustomerID']

for idx, customer_id in enumerate(customer_ids[:n_targets]):
    # Drop the customer's own row by position instead of assuming it sorts first:
    # a duplicate feature vector or floating-point rounding can place another
    # customer ahead of it, which would leak the customer into their own list
    # and push out a genuine lookalike.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # float() turns NumPy scalars into plain Python floats so the text written
    # to the CSV is a bare number rather than a NumPy scalar repr.
    recommendations[customer_id] = [
        (customer_ids.iloc[other_idx], float(score))
        for other_idx, score in candidates[:top_n]
    ]

# Save recommendations to Lookalike.csv
# One row per customer; 'lookalikes' holds the list of {cust_id, score} records.
lookalike_data = pd.DataFrame([
    {'cust_id': cust_id, 'lookalikes': [{'cust_id': sim_id, 'score': sim_score} for sim_id, sim_score in lookalikes]}
    for cust_id, lookalikes in recommendations.items()
])
lookalike_data.to_csv('Lookalike.csv', index=False)

print("Lookalike Model recommendations saved to Lookalike.csv!")


Lookalike Model recommendations saved to Lookalike.csv!


In [26]:
# Preview the first rows of the table that was written to Lookalike.csv.
lookalike_data.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[{'cust_id': 'C0174', 'score': 0.9921950933519..."
1,C0002,"[{'cust_id': 'C0159', 'score': 0.9856803265611..."
2,C0003,"[{'cust_id': 'C0031', 'score': 0.9381777182596..."
3,C0004,"[{'cust_id': 'C0012', 'score': 0.9780970207655..."
4,C0005,"[{'cust_id': 'C0140', 'score': 0.9922482880187..."
