In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [4]:
mergedata= transactions.merge(customers, on='CustomerID', how='left')
mergedata = transaction_details.merge(products, on='ProductID', how='left')


In [5]:
print(mergedata.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                     ProductName_x   Category_x  Price_y  \
0  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1  ComfortLiving Bluetooth Speak

# Aggregate data to create customer profiles

In [6]:
customer_profiles = transaction_details.groupby('CustomerID').agg({
    'Region': 'first', 
    'TotalValue': 'sum',  
    'Quantity': 'sum',  
    'ProductID': 'nunique'  
}).reset_index()

print(customer_profiles.head())

  CustomerID         Region  TotalValue  Quantity  ProductID
0      C0001  South America     3354.52        12          5
1      C0002           Asia     1862.74        10          4
2      C0003  South America     2725.38        14          4
3      C0004  South America     5354.88        23          8
4      C0005           Asia     2034.24         7          3


# Normalize the features

In [8]:
scaler = StandardScaler()
customer_profiles[['TotalValue', 'Quantity']] = scaler.fit_transform(customer_profiles[['TotalValue', 'Quantity']])
print(customer_profiles.head())

  CustomerID         Region  TotalValue  Quantity  ProductID
0      C0001  South America   -0.061701 -0.122033          5
1      C0002           Asia   -0.877744 -0.448000          4
2      C0003  South America   -0.405857  0.203934          4
3      C0004  South America    1.032547  1.670787          8
4      C0005           Asia   -0.783929 -0.936951          3


# Calculate cosine similarity between customers

In [9]:
similarity_matrix = cosine_similarity(customer_profiles[['TotalValue', 'Quantity', 'ProductID']], 
                                      customer_profiles[['TotalValue', 'Quantity', 'ProductID']])
similarities_for_first_20 = similarity_matrix[:20]

# Recommend top 3 lookalikes for first 20 customers

In [10]:
lookalikes = {}
for i in range(20):
    similar_customers = list(enumerate(similarities_for_first_20[i]))
    similar_customers = sorted(similar_customers, key=lambda x: x[1], reverse=True)[1:4]  # Exclude self
    lookalikes[customer_profiles['CustomerID'][i]] = [(customer_profiles['CustomerID'][j[0]], j[1]) for j in similar_customers]
lookalike_df = pd.DataFrame([(key, item[0], item[1]) for key, value in lookalikes.items() for item in value],
                            columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

lookalike_df.to_csv('Lookalike.csv', index=False)
print(lookalike_df.head())

  CustomerID LookalikeID  SimilarityScore
0      C0001       C0056         0.999824
1      C0001       C0184         0.999718
2      C0001       C0026         0.999705
3      C0002       C0029         0.999950
4      C0002       C0031         0.999827
