Import Importance libraries

In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

Load datasets

In [4]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

Step 1: Merge Datasets to Create a Customer Profile With Transaction History :

In [5]:
customer_transactions = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

Aggregate data to create customer-level profiles

In [6]:
customer_profiles = customer_transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',       # Total revenue generated by the customer
    'Quantity': 'sum',         # Total quantity purchased by the customer
    'Price_y': 'mean',         # Average price of products purchased
    'ProductID': 'nunique',    # Number of unique products purchased
    'Category': lambda x: ','.join(x)  # Categories of products purchased
}).reset_index()

One-hot encode categories to include in similarity calculation

In [11]:
category_dummies = customer_transactions.groupby('CustomerID')['Category'].apply(lambda x: ','.join(x))
category_dummies = category_dummies.str.get_dummies(sep=',')
print(category_dummies)

            Books  Clothing  Electronics  Home Decor
CustomerID                                          
C0001           1         0            1           1
C0002           0         1            0           1
C0003           0         1            1           1
C0004           1         0            1           1
C0005           0         0            1           1
...           ...       ...          ...         ...
C0196           1         1            0           1
C0197           0         0            1           1
C0198           0         1            1           0
C0199           0         0            1           1
C0200           1         1            1           1

[199 rows x 4 columns]


Combine numeric features and encoded categories

In [10]:
customer_features = pd.concat([
    customer_profiles[['TotalValue', 'Quantity', 'Price_y', 'ProductID']],
    category_dummies
], axis=1)
print(customer_features)

       TotalValue  Quantity     Price_y  ProductID  Books  Clothing  \
0         3354.52      12.0  278.334000        5.0    NaN       NaN   
1         1862.74      10.0  208.920000        4.0    NaN       NaN   
2         2725.38      14.0  195.707500        4.0    NaN       NaN   
3         5354.88      23.0  240.636250        8.0    NaN       NaN   
4         2034.24       7.0  291.603333        3.0    NaN       NaN   
...           ...       ...         ...        ...    ...       ...   
C0196         NaN       NaN         NaN        NaN    1.0       1.0   
C0197         NaN       NaN         NaN        NaN    0.0       0.0   
C0198         NaN       NaN         NaN        NaN    0.0       1.0   
C0199         NaN       NaN         NaN        NaN    0.0       0.0   
C0200         NaN       NaN         NaN        NaN    1.0       1.0   

       Electronics  Home Decor  
0              NaN         NaN  
1              NaN         NaN  
2              NaN         NaN  
3              

Replace NaN values with 0 for all features

In [12]:
customer_features_filled = customer_features.fillna(0)
print(customer_features_filled)

       TotalValue  Quantity     Price_y  ProductID  Books  Clothing  \
0         3354.52      12.0  278.334000        5.0    0.0       0.0   
1         1862.74      10.0  208.920000        4.0    0.0       0.0   
2         2725.38      14.0  195.707500        4.0    0.0       0.0   
3         5354.88      23.0  240.636250        8.0    0.0       0.0   
4         2034.24       7.0  291.603333        3.0    0.0       0.0   
...           ...       ...         ...        ...    ...       ...   
C0196        0.00       0.0    0.000000        0.0    1.0       1.0   
C0197        0.00       0.0    0.000000        0.0    0.0       0.0   
C0198        0.00       0.0    0.000000        0.0    0.0       1.0   
C0199        0.00       0.0    0.000000        0.0    0.0       0.0   
C0200        0.00       0.0    0.000000        0.0    1.0       1.0   

       Electronics  Home Decor  
0              0.0         0.0  
1              0.0         0.0  
2              0.0         0.0  
3              

Normalize numeric features for similarity calculations

In [14]:
scaler = StandardScaler()
customer_features_normalized = scaler.fit_transform(customer_features_filled)
print(customer_features_normalized)

[[ 0.74952582  0.72958459  0.985914   ... -0.71645515 -0.73663808
  -0.72853911]
 [ 0.05969099  0.4702057   0.5055149  ... -0.71645515 -0.73663808
  -0.72853911]
 [ 0.45859641  0.98896347  0.41407408 ... -0.71645515 -0.73663808
  -0.72853911]
 ...
 [-0.80168465 -0.82668874 -0.94037468 ...  1.39576078  1.35751874
  -0.72853911]
 [-0.80168465 -0.82668874 -0.94037468 ... -0.71645515  1.35751874
   1.37260991]
 [-0.80168465 -0.82668874 -0.94037468 ...  1.39576078  1.35751874
   1.37260991]]


Step 2: Calculate cosine similarity between customers

In [15]:
similarity_matrix = cosine_similarity(customer_features_normalized)
print(similarity_matrix)

[[ 1.          0.92027462  0.94146667 ... -0.58920897 -0.59504944
  -0.95770131]
 [ 0.92027462  1.          0.94417065 ... -0.45537197 -0.46135508
  -0.973     ]
 [ 0.94146667  0.94417065  1.         ... -0.52019054 -0.52607437
  -0.9608045 ]
 ...
 [-0.58920897 -0.45537197 -0.52019054 ...  1.          0.43477101
   0.52726989]
 [-0.59504944 -0.46135508 -0.52607437 ...  0.43477101  1.
   0.52283475]
 [-0.95770131 -0.973      -0.9608045  ...  0.52726989  0.52283475
   1.        ]]


Step 3: Find top 3 similar customers for the first 20 customers (C0001-C0020)

In [17]:
customer_ids = customer_profiles['CustomerID']
top_lookalikes = {}
print(customer_ids)

0      C0001
1      C0002
2      C0003
3      C0004
4      C0005
       ...  
194    C0196
195    C0197
196    C0198
197    C0199
198    C0200
Name: CustomerID, Length: 199, dtype: object


 Get similarity scores for the current customer,
Sort customers by similarity score (excluding the customer itself),
 Store the top 3 similar customers with their scores

In [18]:
for idx, customer_id in enumerate(customer_ids[:20]):
    scores = similarity_matrix[idx]
    similar_customers = np.argsort(scores)[::-1][1:4]  # Exclude the current customer
    top_lookalikes[customer_id] = [(customer_ids[sim_idx], scores[sim_idx]) for sim_idx in similar_customers]
     

Step 4: Save results to a CSV file

In [19]:
lookalike_results = []

In [None]:
for cust_id, lookalikes in top_lookalikes.items():
    for similar_cust_id, score in lookalikes:
        lookalike_results.append({"CustomerID": cust_id, "SimilarCustomerID": similar_cust_id, "Score": score})

In [21]:
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("Lookalike.csv", index=False)
print(lookalike_df)

Empty DataFrame
Columns: []
Index: []


In [None]:
print("Lookalike model results saved to Lookalike.csv")
     