In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the three raw tables (customers, products, transactions) from the data directory.
customers, products, transactions = map(
    pd.read_csv,
    (
        "../data/Customers.csv",
        "../data/Products.csv",
        "../data/Transactions.csv",
    ),
)

In [6]:
# Parse the date columns in place; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
for frame, date_col in ((customers, "SignupDate"), (transactions, "TransactionDate")):
    frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")

# Reference date for recency: the most recent transaction in the data.
latest_date = transactions["TransactionDate"].max()

In [7]:
# Attach product attributes to every transaction row.
#
# Any non-key column that already exists on `transactions` (e.g. Price) is
# dropped from the product side before merging, so the transaction's own value
# is kept and no `_x` / `_y` suffixes are created. This also makes the cell
# safe to re-run: a second execution adds no duplicate/suffixed columns.
product_attrs = products.drop(
    columns=[
        col for col in products.columns
        if col != "ProductID" and col in transactions.columns
    ]
)
transactions = transactions.merge(product_attrs, on="ProductID")

# NOTE(review): Monetary is taken from the per-row Price. If the transactions
# data also carries a quantity / total-value column, Price alone would
# understate spend - confirm against the Transactions.csv schema.
transactions["Monetary"] = transactions["Price"]

In [8]:
# RFM metrics, aggregated per CustomerID.
def _days_since_last_purchase(dates):
    """Whole days between `latest_date` and the most recent date in `dates`."""
    return (latest_date - dates.max()).days


rfm = (
    transactions
    .groupby("CustomerID")
    .agg(
        Recency=("TransactionDate", _days_since_last_purchase),
        Frequency=("TransactionID", "count"),  # non-null transaction IDs
        Monetary=("Monetary", "sum"),
    )
    .reset_index()
)

# Mean Monetary value per transaction for each customer.
rfm["AvgTransactionValue"] = rfm["Monetary"].div(rfm["Frequency"])

# Category preferences: for each customer, the share of their transactions
# that falls in each product category.
category_pivot = transactions.pivot_table(
    values="TransactionID",
    index="CustomerID",
    columns="Category",
    aggfunc="count",
    fill_value=0,  # categories a customer never bought from count as 0
)
transactions_per_customer = category_pivot.sum(axis=1)
category_proportions = category_pivot.div(transactions_per_customer, axis=0)

# Tenure: days from a customer's SignupDate to their most recent transaction.
# The left merge keeps customers without transactions; their Tenure is NaN here.
customer_last_transaction = (
    transactions
    .groupby("CustomerID")["TransactionDate"]
    .max()
    .reset_index()
)
customer_features = (
    customers
    .merge(customer_last_transaction, on="CustomerID", how="left")
    .assign(Tenure=lambda df: (df["TransactionDate"] - df["SignupDate"]).dt.days)
    # Keep only the columns used as model inputs (plus the ID).
    [["CustomerID", "Region", "Tenure"]]
)

In [9]:
# Combine demographic, RFM and category-share features into one table.
# Left merges keep every row of customer_features, including customers that
# have no transactions (their RFM / category columns are NaN at this point).
final_features = (
    customer_features
    .merge(rfm, on="CustomerID", how="left")
    .merge(category_proportions, on="CustomerID", how="left")
)

# One-hot encode Region
final_features = pd.get_dummies(final_features, columns=["Region"], prefix="Region")

# Customers with no transactions: a Recency of 0 would mean "purchased on the
# latest transaction date", making non-buyers look like the most recent
# buyers. Give them a Recency one day worse than the worst observed value.
no_purchase_recency = rfm["Recency"].max() + 1
final_features["Recency"] = final_features["Recency"].fillna(no_purchase_recency)

# Remaining gaps (Tenure, Frequency, Monetary, AvgTransactionValue, category
# shares) are filled with 0 for customers without transactions.
final_features = final_features.fillna(0)

In [17]:
# Display the assembled feature table to inspect the engineered columns.
final_features

Unnamed: 0,CustomerID,Tenure,Recency,Frequency,Monetary,AvgTransactionValue,Books,Clothing,Electronics,Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,846.0,55.0,5.0,1391.67,278.334000,0.200,0.00,0.600000,0.200000,False,False,False,True
1,C0002,1024.0,25.0,4.0,835.68,208.920000,0.000,0.50,0.000000,0.500000,True,False,False,False
2,C0003,170.0,125.0,4.0,782.83,195.707500,0.000,0.25,0.250000,0.500000,False,False,False,True
3,C0004,806.0,4.0,8.0,1925.09,240.636250,0.375,0.00,0.250000,0.375000,False,False,False,True
4,C0005,812.0,54.0,3.0,874.81,291.603333,0.000,0.00,0.666667,0.333333,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,C0196,922.0,13.0,4.0,1667.97,416.992500,0.250,0.25,0.000000,0.500000,False,True,False,False
196,C0197,647.0,0.0,3.0,681.17,227.056667,0.000,0.00,0.666667,0.333333,False,True,False,False
197,C0198,950.0,84.0,2.0,479.41,239.705000,0.000,0.50,0.500000,0.000000,False,True,False,False
198,C0199,693.0,63.0,4.0,1002.44,250.610000,0.000,0.00,0.500000,0.500000,False,True,False,False


In [11]:
# Split the identifier column from the numeric/boolean feature columns:
# CustomerID is kept aside for labelling results and must not be scaled.
features_to_scale = final_features.drop(columns=["CustomerID"])
customer_ids = final_features["CustomerID"]

# Standardise every feature column (zero mean, unit variance).
scaler = StandardScaler()
scaled_features = scaler.fit(features_to_scale).transform(features_to_scale)

In [12]:
# Pairwise cosine similarity between all customers' scaled feature vectors;
# row/column order follows `customer_ids`.
similarity_matrix = cosine_similarity(scaled_features)

top_n = 3  # number of lookalikes kept per target customer
target_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]
target_lookup = set(target_customers)  # O(1) membership test inside the loop

lookalike_map = {}
for idx, cust_id in enumerate(customer_ids):
    if cust_id not in target_lookup:
        continue

    row = similarity_matrix[idx]
    # Rank all customers by descending similarity. A stable sort keeps the
    # original order among ties.
    ranked = np.argsort(-row, kind="stable")
    # Exclude the customer itself by position rather than assuming it sorts
    # first: another customer with an identical feature vector ties at the
    # top and could otherwise push the customer into its own lookalike list.
    ranked = ranked[ranked != idx][:top_n]

    lookalike_map[cust_id] = [
        # .iloc: positional lookup, independent of customer_ids' index labels.
        # float(): store plain Python floats so the exported text shows
        # 0.912 rather than a NumPy scalar repr.
        (customer_ids.iloc[pos], round(float(row[pos]), 3))
        for pos in ranked
    ]

In [13]:
# Write the lookalike map to CSV: one row per target customer, with the list
# of (similar customer ID, similarity score) pairs in a single column.
output_path = "../outputs/Subhadip_Mondal_Lookalike.csv"

lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "SimilarCustomers": list(lookalike_map.values()),
})

lookalike_df.to_csv(output_path, index=False)
# The message is built from the same path that was written, so the two cannot
# drift apart (the previous message was missing the "/" after "..").
print(f"{output_path} saved successfully!")

..outputs/Subhadip_Mondal_Lookalike.csv saved successfully!
