In [24]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from scipy.sparse.linalg import svds

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")


In [25]:
# Preprocessing: Aggregate transaction data
transactions = transactions.merge(products, on="ProductID", suffixes=('', '_product'))
transactions['TotalValue'] = transactions['TotalValue']
customer_summary = transactions.groupby("CustomerID").agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0] if len(x.mode()) > 0 else None
}).reset_index()
customer_summary.rename(columns={
    'TotalValue': 'TotalSpend',
    'TransactionID': 'TransactionCount',
    'Category': 'FavoriteCategory'
}, inplace=True)

In [26]:
# Merge with customer data
customer_data = customers.merge(customer_summary, on="CustomerID", how="left")
customer_data["SignupYear"] = pd.to_datetime(customer_data["SignupDate"]).dt.year

In [27]:
# One-hot encoding for categorical features
customer_data = pd.get_dummies(customer_data, columns=["Region", "FavoriteCategory"], drop_first=True)

We perform one-hot encoding on the categorical columns Region and FavoriteCategory to convert them into binary (0/1) features. This is necessary for machine learning algorithms, which do not natively handle categorical data

In [28]:
# Fill missing values
customer_data.fillna(0, inplace=True)


In [29]:
# Feature selection
features = ["TotalSpend", "TransactionCount"] + [col for col in customer_data.columns if "Region_" in col or "FavoriteCategory_" in col]
X = customer_data[features]



We select the features that will be used in our model to represent our customer.
- TotalSpend, TransactionCount represent important customer behavior metrics on their purchases.
- The one-hot encoded columns for Region and FavoriteCategory represent categorical characteristics of the customers.

In [None]:
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 1: Dimensionality Reduction with SVD
U, sigma, Vt = svds(X_scaled, k=5)  # Reduce to 5 latent dimensions
latent_features = U @ np.diag(sigma)

# Step 2: Clustering in Latent Space
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust number of clusters as needed
customer_data["Cluster"] = kmeans.fit_predict(latent_features)

- To ensure that all features contribute equally to the model, we normalize
- We apply SVD to reduce the dimensionality of the dataset. SVD decomposes the dataset into three matrices, and by retaining only the top k=5 singular values.- we reduce the number of features while preserving the most significant patterns.
- latent_features represents a compressed version of the original data, which can be used for clustering.



In [30]:
# Step 3: Compute Similarity within Clusters
lookalike_map = {}
customer_ids = customer_data["CustomerID"].values
for i, customer_id in enumerate(customer_ids[:20]):
    cluster = customer_data.loc[customer_data["CustomerID"] == customer_id, "Cluster"].values[0]
    cluster_members = customer_data[customer_data["Cluster"] == cluster]
    similarities = cosine_similarity([latent_features[i]], latent_features[cluster_members.index])[0]
    cluster_members["Similarity"] = similarities
    top_3 = cluster_members.nlargest(4, "Similarity")[1:4][["CustomerID", "Similarity"]]  # Exclude self
    lookalike_map[customer_id] = [(row.CustomerID, round(row.Similarity, 4)) for _, row in top_3.iterrows()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_members["Similarity"] = similarities
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_members["Similarity"] = similarities
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_members["Similarity"] = similarities
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [31]:
# Save to Lookalike.csv
lookalike_df = pd.DataFrame({
    "CustomerID": lookalike_map.keys(),
    "Lookalikes": [str(v) for v in lookalike_map.values()]
})
lookalike_df.to_csv("Lookalike.csv", index=False)

# Print a sample of the map
print(lookalike_df.head())

  CustomerID                                         Lookalikes
0      C0001  [('C0190', 0.9985), ('C0048', 0.9973), ('C0091...
1      C0002  [('C0088', 0.9825), ('C0092', 0.9813), ('C0040...
2      C0003  [('C0052', 0.9995), ('C0076', 0.9981), ('C0031...
3      C0004  [('C0087', 0.9991), ('C0155', 0.9948), ('C0082...
4      C0005  [('C0186', 0.9993), ('C0007', 0.9966), ('C0140...
