In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables (in this order) from the current working directory.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Transactions.csv", "Products.csv")
)

# Left-join the product columns onto every transaction row via ProductID;
# how="left" keeps transactions whose ProductID has no match in products.
transactions = pd.merge(transactions, products, on="ProductID", how="left")


In [7]:
# Per-customer purchase summary built from the transaction rows:
#   TotalSpend    - sum of TotalValue
#   Frequency     - number of transactions (count of TransactionID)
#   AvgOrderValue - mean of TotalValue
customer_txn = transactions.groupby("CustomerID").agg(
    TotalSpend=("TotalValue", "sum"),
    Frequency=("TransactionID", "count"),
    AvgOrderValue=("TotalValue", "mean")
).reset_index()

# Left merge keeps every customer; customers without transactions get NaN
# aggregates, which are treated as zero activity. Only the three aggregate
# columns are filled, so a missing customer attribute (e.g. Region) is not
# silently replaced by the number 0 and later encoded as a bogus category.
df = customers.merge(customer_txn, on="CustomerID", how="left").fillna(
    {"TotalSpend": 0, "Frequency": 0, "AvgOrderValue": 0}
)


In [8]:
# One-hot encode the categorical Region column into numeric indicator columns.
# drop="first" omits the indicator for the first category, so that category is
# represented by all remaining Region_* columns being 0.
# NOTE: this cell overwrites df and removes Region, so it cannot be re-run
# without first re-running the cell that builds df.
encoder = OneHotEncoder(sparse_output=False, drop="first")
region_encoded = encoder.fit_transform(df[["Region"]])
# Reuse df's index so the axis=1 concat below pairs rows by construction,
# instead of relying on df happening to carry a default 0..n-1 index.
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    index=df.index,
)
df = pd.concat([df.drop(columns=["Region", "CustomerName", "SignupDate"]), region_df], axis=1)

# Standardize the numeric features (zero mean, unit variance per column) so no
# single feature dominates the similarity computation purely because of its
# magnitude. StandardScaler does NOT rescale values into the 0-1 range.
scaler = StandardScaler()
numerical_cols = ["TotalSpend", "Frequency", "AvgOrderValue"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [None]:
# Pairwise cosine similarity between customers, computed over every column of
# df except the CustomerID identifier. Row/column i of the result corresponds
# to the customer at position i in df.
# The features compared are the encoded region plus total spend, purchase
# frequency and average order value - basic attributes for customer-to-customer
# comparison.
similarity_matrix = cosine_similarity(df.drop("CustomerID", axis=1))


In [10]:
# For each of the first 20 customers (by position in df), collect the 3 most
# similar OTHER customers with their cosine-similarity scores, then write the
# mapping to Lookalike.csv (columns: CustomerID, Lookalikes).
lookalikes = {}
customer_ids = df["CustomerID"].tolist()

for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Exclude the customer itself by position instead of assuming it sorts
    # first: another customer with a tied (or, through float rounding, slightly
    # higher) score would otherwise push the customer into its own list.
    candidates = [
        (i, score) for i, score in enumerate(similarity_matrix[idx]) if i != idx
    ]
    # Stable descending sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # float(...) turns the NumPy scalar into a plain Python float so the list
    # written to the CSV reads "0.1234" rather than "np.float64(0.1234)"
    # (the scalar repr used by NumPy >= 2).
    top_3 = [(customer_ids[i], round(float(score), 4)) for i, score in candidates[:3]]
    lookalikes[customer_id] = top_3

lookalike_df = pd.DataFrame(list(lookalikes.items()), columns=["CustomerID", "Lookalikes"])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Model execution complete. Results saved in Lookalike.csv ✅")


Lookalike Model execution complete. Results saved in Lookalike.csv ✅
