In [1]:
import pandas as pd


# Read the three input tables, in the same order as the variable names on
# the left-hand side, then show the first rows of each one as a sanity check.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

for table in (customers, products, transactions):
    print(table.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:

# Per-customer purchase summary: total spend, total units bought, and the
# number of distinct products purchased. Named aggregation produces the final
# column names directly, so no separate rename step is needed.
transaction_summary = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        TotalQuantity=('Quantity', 'sum'),
        UniqueProducts=('ProductID', 'nunique'),
    )
    .reset_index()
)

# Left join keeps every customer row, including customers that never appear
# in `transactions`; those rows get NaN in the three summary columns.
merged_data = pd.merge(customers, transaction_summary, on='CustomerID', how='left')

# Fill only the summary columns with 0. A blanket `fillna(0)` would also turn
# any missing value in the customer profile columns into the integer 0.
summary_columns = ['TotalSpending', 'TotalQuantity', 'UniqueProducts']
merged_data[summary_columns] = merged_data[summary_columns].fillna(0)


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Number of customers (taken from the top of `merged_data`) to produce
# recommendations for, and how many lookalikes to keep per customer.
N_QUERY_CUSTOMERS = 20
TOP_N = 3

features = ['TotalSpending', 'TotalQuantity', 'UniqueProducts']
customer_profiles = merged_data[features].values

# NOTE(review): the features are used on their raw scales, so the column with
# the largest magnitude will dominate the cosine similarity. Consider
# standardising the columns first — confirm with the analysis owner, since
# that changes the rankings.
similarity_matrix = cosine_similarity(customer_profiles)

# Positional lookup table: row i of `similarity_matrix` belongs to customer_ids[i].
customer_ids = merged_data['CustomerID'].to_numpy()

lookalikes = {}
for idx, customer_id in enumerate(customer_ids[:N_QUERY_CUSTOMERS]):
    scores = similarity_matrix[idx]

    # Rank all customers by descending similarity. The query customer is
    # removed explicitly rather than by skipping position 0: its own score is
    # not guaranteed to sort first (ties or rounding at 1.0, or an all-zero
    # profile), which would otherwise list a customer as their own lookalike.
    ranking = np.argsort(-scores, kind='stable')
    top_indices = ranking[ranking != idx][:TOP_N]

    # Cast to built-in float so the stringified list written to the CSV shows
    # plain numbers instead of NumPy scalar reprs.
    lookalikes[customer_id] = [
        (customer_ids[i], float(scores[i])) for i in top_indices
    ]

# One row per query customer; the lookalike list is stored as its string form.
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': str(similar_list)}
    for cust_id, similar_list in lookalikes.items()
])

lookalike_df.to_csv("Lookalike.csv", index=False)
