In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [9]:
customers = pd.read_csv("/content/Customers.csv")
products = pd.read_csv("/content/Products.csv")
transactions = pd.read_csv("/content/Transactions.csv")

In [2]:
merged_data = pd.read_csv("/content/Cleaned_Merged_Data.csv")

In [3]:
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y,Year,Month,CustomerTenure
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,8,777.0
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,5,834.0
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,4,49.0
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,3,534.0
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,3,584.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86,2024,10,
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86,2024,6,
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86,2024,4,
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86,2024,9,


In [5]:
merged_data.rename(columns={"Price_x": "TransactionPrice", "Price_y": "ProductPrice"}, inplace=True)

In [6]:
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,TransactionPrice,CustomerName,Region,SignupDate,ProductName,Category,ProductPrice,Year,Month,CustomerTenure
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,8,777.0
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,5,834.0
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,4,49.0
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,3,534.0
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024,3,584.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86,2024,10,
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86,2024,6,
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86,2024,4,
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86,2024,9,


In [7]:
# Aggregating customer-level features
customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",  # Total spending
    "Quantity": "sum",    # Total quantity purchased
    "ProductID": "count", # Number of transactions
    "Category": lambda x: x.mode()[0],  # Most frequent category
    "TransactionPrice": "mean"          # Average transaction price
}).reset_index()

In [23]:
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,TransactionPrice,CustomerName,SignupDate,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,-0.061701,-0.122033,-0.011458,0.094670,Lawrence Carroll,2022-07-10,False,False,True,False,True,False
1,C0002,-0.877744,-0.448000,-0.467494,-0.904016,Elizabeth Lutz,2022-02-13,False,False,False,True,False,False
2,C0003,-0.405857,0.203934,-0.467494,-1.094109,Michael Rivera,2024-03-07,False,False,True,False,False,True
3,C0004,1.032547,1.670787,1.356650,-0.447702,Kathleen Rodriguez,2022-10-09,False,False,True,False,False,False
4,C0005,-0.783929,-0.936951,-0.923530,0.285581,Laura Weber,2022-08-15,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,0.829053,-0.122033,-0.467494,2.089604,Laura Watts,2022-06-07,True,False,False,False,False,True
195,C0197,-0.841689,-0.610984,-0.923530,-0.643077,Christina Harvey,2023-03-21,True,False,False,False,True,False
196,C0198,-1.386975,-1.588886,-1.379566,-0.461100,Rebecca Ray,2022-02-27,True,False,False,True,False,False
197,C0199,-0.813993,-0.610984,-0.467494,-0.304206,Andrea Jenkins,2022-12-03,True,False,False,False,True,False


In [10]:
# Add demographic features from the Customers dataset
customer_features = customer_features.merge(customers, on="CustomerID")

In [34]:
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,TransactionPrice,CustomerName,SignupDate,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,-0.061701,-0.122033,-0.011458,0.094670,Lawrence Carroll,2022-07-10,False,False,True,False,True,False
1,C0002,-0.877744,-0.448000,-0.467494,-0.904016,Elizabeth Lutz,2022-02-13,False,False,False,True,False,False
2,C0003,-0.405857,0.203934,-0.467494,-1.094109,Michael Rivera,2024-03-07,False,False,True,False,False,True
3,C0004,1.032547,1.670787,1.356650,-0.447702,Kathleen Rodriguez,2022-10-09,False,False,True,False,False,False
4,C0005,-0.783929,-0.936951,-0.923530,0.285581,Laura Weber,2022-08-15,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,0.829053,-0.122033,-0.467494,2.089604,Laura Watts,2022-06-07,True,False,False,False,False,True
195,C0197,-0.841689,-0.610984,-0.923530,-0.643077,Christina Harvey,2023-03-21,True,False,False,False,True,False
196,C0198,-1.386975,-1.588886,-1.379566,-0.461100,Rebecca Ray,2022-02-27,True,False,False,True,False,False
197,C0199,-0.813993,-0.610984,-0.467494,-0.304206,Andrea Jenkins,2022-12-03,True,False,False,False,True,False


In [11]:
# Normalize numerical features for similarity calculation
scaler = StandardScaler()
numerical_cols = ["TotalValue", "Quantity", "ProductID", "TransactionPrice"]
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

In [12]:
# Encode categorical variables (e.g., Region, Category)
customer_features = pd.get_dummies(customer_features, columns=["Region", "Category"], drop_first=True)

In [32]:
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,TransactionPrice,CustomerName,SignupDate,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,-0.061701,-0.122033,-0.011458,0.094670,Lawrence Carroll,2022-07-10,False,False,True,False,True,False
1,C0002,-0.877744,-0.448000,-0.467494,-0.904016,Elizabeth Lutz,2022-02-13,False,False,False,True,False,False
2,C0003,-0.405857,0.203934,-0.467494,-1.094109,Michael Rivera,2024-03-07,False,False,True,False,False,True
3,C0004,1.032547,1.670787,1.356650,-0.447702,Kathleen Rodriguez,2022-10-09,False,False,True,False,False,False
4,C0005,-0.783929,-0.936951,-0.923530,0.285581,Laura Weber,2022-08-15,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,0.829053,-0.122033,-0.467494,2.089604,Laura Watts,2022-06-07,True,False,False,False,False,True
195,C0197,-0.841689,-0.610984,-0.923530,-0.643077,Christina Harvey,2023-03-21,True,False,False,False,True,False
196,C0198,-1.386975,-1.588886,-1.379566,-0.461100,Rebecca Ray,2022-02-27,True,False,False,True,False,False
197,C0199,-0.813993,-0.610984,-0.467494,-0.304206,Andrea Jenkins,2022-12-03,True,False,False,False,True,False


In [13]:
# Computed cosine similarity for all
feature_matrix = customer_features.drop(columns=["CustomerID", "CustomerName", "SignupDate"]).values
similarity_matrix = cosine_similarity(feature_matrix)

In [26]:
# top-3 similar customers for the first 20 customers (C0001-C0020)
customer_ids = customer_features["CustomerID"].values[:20]  # Select only the first 20 customers
similarity_matrix_subset = similarity_matrix[:20, :]  # Slice the similarity matrix for the first 20 customers

In [27]:
customer_ids

array(['C0001', 'C0002', 'C0003', 'C0004', 'C0005', 'C0006', 'C0007',
       'C0008', 'C0009', 'C0010', 'C0011', 'C0012', 'C0013', 'C0014',
       'C0015', 'C0016', 'C0017', 'C0018', 'C0019', 'C0020'], dtype=object)

In [28]:
similarity_matrix_subset

array([[ 1.        ,  0.01158307,  0.33280652, ...,  0.06038232,
         0.4227504 , -0.03783943],
       [ 0.01158307,  1.        ,  0.44619905, ...,  0.78646749,
         0.46707677, -0.07266188],
       [ 0.33280652,  0.44619905,  1.        , ...,  0.24927496,
         0.21746276, -0.21419784],
       ...,
       [-0.07961429, -0.29682875,  0.05604602, ..., -0.50017357,
        -0.36903105,  0.41544539],
       [ 0.01528338,  0.38611162,  0.19030112, ...,  0.49127127,
         0.65145445, -0.41823892],
       [ 0.2982471 ,  0.42239927,  0.08878485, ...,  0.72600362,
         0.6546982 , -0.36952185]])

In [33]:
lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    # Get similarity scores for the current customer
    similarity_scores = similarity_matrix_subset[i, :]

    # Rank customers by similarity, excluding the current customer
    similar_indices = np.argsort(-similarity_scores)  # Sort in descending order
    similar_indices = similar_indices[similar_indices != i][:3]  # Exclude self and take top-3


    # Retrieve similar customers and their similarity scores
    similar_customers = [
        (customer_features["CustomerID"].iloc[idx], similarity_scores[idx]) for idx in similar_indices
    ]
    lookalike_map[customer_id] = similar_customers

# Convert the lookalike map into a DataFrame and save it
lookalike_data = []
for cust_id, lookalikes in lookalike_map.items():
    for lookalike, score in lookalikes:
        lookalike_data.append({"CustomerID": cust_id, "SimilarCustomerID": lookalike, "Score": score})

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [30]:
lookalike = pd.read_csv("/content/Lookalike.csv")

In [31]:
lookalike

Unnamed: 0,CustomerID,SimilarCustomerID,Score
0,C0001,C0181,0.892036
1,C0001,C0190,0.82765
2,C0001,C0192,0.820003
3,C0002,C0088,0.950441
4,C0002,C0077,0.907828
5,C0002,C0144,0.886983
6,C0003,C0031,0.872445
7,C0003,C0025,0.867928
8,C0003,C0052,0.854926
9,C0004,C0165,0.983761
