In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [30]:
# Load the three input tables (customers, products, transactions) from the
# working directory, in that order.
df_c, df_p, df_t = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [31]:
# Attach customer attributes to each transaction, then product attributes.
# Both joins are inner joins (the pandas default, spelled out here), so a
# transaction whose CustomerID or ProductID has no match is dropped.
c_t_merge = df_t.merge(right=df_c, how="inner", on="CustomerID")
c_t_p_merge = c_t_merge.merge(right=df_p, how="inner", on="ProductID")

In [32]:
# Keep a single price column: discard 'Price_y' and restore the plain
# 'Price' name on 'Price_x'.
c_t_p_merge = (
    c_t_p_merge
    .drop(columns='Price_y')
    .rename(columns={'Price_x': 'Price'})
)

In [33]:
# One row per customer: summed TotalValue and Quantity, mean Price, and the
# number of distinct products and categories. Output labels deliberately match
# the input column names.
customer_features = (
    c_t_p_merge
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price=('Price', 'mean'),
        ProductID=('ProductID', 'nunique'),
        Category=('Category', 'nunique'),
    )
    .reset_index()
)

In [34]:
# Show the per-customer aggregate table as the cell output.
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,Price,ProductID,Category
0,C0001,3354.52,12,278.334000,5,3
1,C0002,1862.74,10,208.920000,4,2
2,C0003,2725.38,14,195.707500,4,3
3,C0004,5354.88,23,240.636250,8,3
4,C0005,2034.24,7,291.603333,3,2
...,...,...,...,...,...,...
194,C0196,4982.88,12,416.992500,3,3
195,C0197,1928.65,9,227.056667,3,2
196,C0198,931.83,3,239.705000,2,2
197,C0199,1979.28,9,250.610000,4,2


In [35]:
# Relabel the aggregates by name rather than by position, so each label stays
# attached to the aggregate it describes even if the column order produced
# upstream changes. 'CustomerID' keeps its name and the column order is
# untouched. `rename` returns a new frame and ignores labels that are absent,
# so re-running this cell is a no-op.
customer_features = customer_features.rename(columns={
    'TotalValue': 'TotalSpend',
    'Quantity': 'TotalQuantity',
    'Price': 'AvgPrice',
    'ProductID': 'ProductVariety',
    'Category': 'CategoryVariety',
})

In [36]:
# Standardise every column after the first (the first column holds the
# customer identifier): fit the scaler, then apply it to the same data.
scaler = StandardScaler().fit(customer_features.iloc[:, 1:])
scaled_features = scaler.transform(customer_features.iloc[:, 1:])

In [37]:
# Pairwise cosine similarity between the rows of the scaled feature matrix;
# entry [i, j] compares row i with row j.
similarity_matrix = cosine_similarity(scaled_features)

In [38]:
# Customer IDs in row order, plus the reverse lookup from ID to row position.
customer_ids = customer_features['CustomerID'].tolist()
customer_index_map = dict(zip(customer_ids, range(len(customer_ids))))

In [39]:
# How many leading customers get a lookalike list, and how many lookalikes
# are kept for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

# Maps each target customer ID to a list of (other customer ID, similarity
# score rounded to 4 decimals), best match first.
similar = {}
for customer_id in customer_ids[:N_TARGET_CUSTOMERS]:
    idx = customer_index_map[customer_id]

    # Exclude the customer's own entry by index. Relying on it sorting to
    # position 0 breaks when another customer ties (or, through floating-point
    # noise, beats) the self-similarity, which would list the customer as
    # their own lookalike and drop a genuine one.
    similarity_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable descending sort: equal scores keep ascending row order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Convert to a built-in float before rounding: rounding a NumPy scalar
    # yields another NumPy scalar, whose repr under NumPy >= 2 is
    # "np.float64(...)" and would leak into any string built from this list.
    highest_similar = [
        (customer_ids[sim_idx], round(float(sim_score), 4))
        for sim_idx, sim_score in similarity_scores[:TOP_K]
    ]

    similar[customer_id] = highest_similar

In [40]:
# Column-oriented payload for the output table: one entry per target customer,
# with that customer's lookalike list rendered as a string.
similar_data = {
    'CustomerID': list(similar),
    'Similar_Customers': list(map(str, similar.values())),
}


In [41]:
# Build the lookalike table and write it to CSV without the row index.
lookalike_df = pd.DataFrame(data=similar_data)
lookalike_df.to_csv(path_or_buf='Gagan_Yadav_Lookalike.csv', index=False)