In [2]:
import pandas as pd

# Read the three raw CSV tables from the current working directory.
transactions = pd.read_csv("Transactions.csv")
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")

# Attach customer columns, then product columns, to each transaction row.
# Both joins use pandas' default join type (inner) on the shared key column.
merged_data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)


In [3]:
# Per-customer spending summary: total value, number of transactions, mean value.
customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
)

# Count of merged rows per customer and product category (one column per
# category). These are frequencies, not a 0/1 encoding.
category_preferences = pd.crosstab(
    index=merged_data['CustomerID'],
    columns=merged_data['Category'],
)

# Place the two feature tables side by side, aligned on the CustomerID index.
customer_profiles = pd.concat(objs=[customer_features, category_preferences], axis=1)
print(customer_profiles.head())


            total_spent  transaction_count  avg_transaction_value  Books  \
CustomerID                                                                 
C0001           3354.52                  5                670.904      1   
C0002           1862.74                  4                465.685      0   
C0003           2725.38                  4                681.345      0   
C0004           5354.88                  8                669.360      3   
C0005           2034.24                  3                678.080      0   

            Clothing  Electronics  Home Decor  
CustomerID                                     
C0001              0            3           1  
C0002              2            0           2  
C0003              1            1           2  
C0004              0            2           3  
C0005              0            2           1  


In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise every profile column so that large-magnitude features (e.g. total
# spend) do not dominate the similarity computation that follows.
scaler = StandardScaler()
scaler.fit(customer_profiles)
customer_profiles_scaled = scaler.transform(customer_profiles)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of scaled customer profiles.
similarity_matrix = cosine_similarity(customer_profiles_scaled)

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_profiles.index,
    columns=customer_profiles.index,
)


In [14]:
# Build the top-N lookalike list for the first customers in the profile table.
N_TARGET_CUSTOMERS = 20  # how many customers (in index order) get recommendations
TOP_N = 3                # how many lookalikes to keep per customer

lookalikes = {}
for customer_id in customer_profiles.index[:N_TARGET_CUSTOMERS]:
    # Remove the customer's own entry by label rather than assuming it sorts
    # first: a tie or floating-point rounding with an identical profile could
    # otherwise leave the customer in their own lookalike list.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(TOP_N)
    )
    # Store plain Python floats so the text written to the CSV does not depend
    # on how the installed NumPy version prints its scalar types.
    lookalikes[customer_id] = [
        (sim_id, float(sim_score)) for sim_id, sim_score in similar_customers.items()
    ]

# Create the final DataFrame for Lookalike.csv: one row per target customer,
# with the list of (lookalike CustomerID, similarity score) pairs.
# Uses the `lookalikes` dict built above; the previous code referenced an
# undefined name and failed with NameError on a fresh kernel.
lookalike_df = pd.DataFrame(list(lookalikes.items()), columns=['CustomerID', 'Top_3_Lookalikes'])

# Save the results to Lookalike.csv
lookalike_df.to_csv('Kuruba_Bhavyasree_Lookalike.csv', index=False)

# Show the results
lookalike_df.head()


Unnamed: 0,CustomerID,Top_3_Lookalikes
0,C0001,"[(C0069, 0.9407645268782764), (C0127, 0.846413..."
1,C0002,"[(C0103, 0.9042642927981942), (C0062, 0.899619..."
2,C0003,"[(C0166, 0.9735807428089185), (C0031, 0.891795..."
3,C0004,"[(C0122, 0.9053011527825046), (C0113, 0.901179..."
4,C0005,"[(C0197, 0.9973329264482712), (C0007, 0.926480..."
