In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


# Read the three raw tables from the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Normalise every header (trim surrounding whitespace, lower-case) so the
# join keys used below are spelled the same way in all three tables.
for table in (customers, products, transactions):
    table.columns = table.columns.str.strip().str.lower()

# One row per transaction, enriched with the matching customer and product
# columns. Left joins keep transactions whose customer/product is not found
# in the lookup tables (their added columns are left as NaN).
data = transactions.merge(customers, on="customerid", how="left").merge(
    products, on="productid", how="left"
)


def _most_frequent(values):
    """Return the most frequent non-null value of *values*.

    ``Series.mode()`` ignores missing values, so a group whose values are all
    NaN (possible after the left joins) yields an empty result and indexing
    it with ``[0]`` would raise. Fall back to an explicit "Unknown" label
    instead of crashing the whole aggregation.
    """
    modes = values.mode()
    if modes.empty:
        return "Unknown"
    return modes.iloc[0]


# Build one feature row per customer: total spend, total units bought, and the
# customer's most frequent region and product category.
# NOTE: a mean "price" feature was sketched here previously but is disabled.
customer_features = data.groupby("customerid").agg({
    "totalvalue": "sum",
    "quantity": "sum",
    "region": _most_frequent,
    "category": _most_frequent,
}).reset_index()

# One-hot encode the two categorical features; drop_first removes one level
# per feature to avoid a redundant (perfectly collinear) column.
customer_features = pd.get_dummies(
    customer_features, columns=["region", "category"], drop_first=True
)

# Standardise every feature column (everything except the id) so that the
# large-magnitude spend/quantity totals do not dominate the similarity.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop("customerid", axis=1))

# Pairwise cosine similarity; row/column k corresponds to row k of
# customer_features.
similarity_matrix = cosine_similarity(scaled_features)

# How many customers (from the top of customer_features) get a lookalike list,
# and how many lookalikes are kept for each of them.
NUM_TARGET_CUSTOMERS = 20
TOP_K = 3

customer_ids = customer_features["customerid"].tolist()

# Map each target customer id to its TOP_K most similar *other* customers as
# (customer_id, similarity) pairs, best match first.
lookalikes = {}
for i, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Exclude the customer itself by index rather than assuming it sorts
    # first: an identical customer (similarity 1.0) or floating-point rounding
    # can place another row ahead of self, which would otherwise let a
    # customer show up as their own lookalike.
    candidates = [
        (j, score)
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalikes[customer_id] = [
        # float(...) turns the NumPy scalar into a built-in float so the
        # stringified list below reads "0.9981" and not "np.float64(0.9981)".
        (customer_ids[j], float(round(score, 4)))
        for j, score in candidates[:TOP_K]
    ]

# One row per target customer; the lookalike list is stored as its string
# representation in a single CSV column.
lookalike_df = pd.DataFrame([
    {"cust_id": cust_id, "lookalikes": str(lookalike_list)}
    for cust_id, lookalike_list in lookalikes.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)


print(lookalike_df)


   cust_id                                         lookalikes
0    C0001  [('C0184', 0.9981), ('C0048', 0.9948), ('C0190...
1    C0002  [('C0088', 0.9983), ('C0092', 0.9667), ('C0106...
2    C0003  [('C0076', 0.9815), ('C0052', 0.9714), ('C0031...
3    C0004  [('C0169', 0.9794), ('C0087', 0.9738), ('C0165...
4    C0005  [('C0186', 0.9985), ('C0140', 0.9938), ('C0146...
5    C0006  [('C0126', 0.9929), ('C0187', 0.9921), ('C0011...
6    C0007  [('C0146', 1.0), ('C0115', 0.9921), ('C0005', ...
7    C0008  [('C0160', 0.9221), ('C0059', 0.9148), ('C0079...
8    C0009  [('C0198', 1.0), ('C0061', 0.9756), ('C0062', ...
9    C0010  [('C0062', 0.9768), ('C0111', 0.9742), ('C0103...
10   C0011  [('C0006', 0.9875), ('C0137', 0.9804), ('C0126...
11   C0012  [('C0163', 0.996), ('C0113', 0.9859), ('C0104'...
12   C0013  [('C0099', 0.9875), ('C0108', 0.9764), ('C0107...
13   C0014  [('C0060', 0.9995), ('C0089', 0.9278), ('C0172...
14   C0015  [('C0131', 0.981), ('C0036', 0.9724), ('C0094'...
15   C00