In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
# All three CSV files are read from one folder. Keeping that folder in a single
# constant means a different machine only needs this one line changed.
DATA_DIR = 'D:/Zeotap Assignment'
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [2]:
# Replace the Region labels with integer codes so the column can be used as a
# numeric feature. The fitted encoder is kept in `label_enc` so the codes can be
# mapped back to the original labels later if needed.
label_enc = LabelEncoder()
label_enc.fit(customers["Region"])
customers["Region"] = label_enc.transform(customers["Region"])

<IPython.core.display.Javascript object>

In [3]:
# Derive a numeric signup feature: parse SignupDate as datetimes, then measure
# each customer's signup as whole days after the earliest signup in the data.
customers["SignupDate"] = pd.to_datetime(customers["SignupDate"])
min_date = customers["SignupDate"].min()
customers["DaysSinceSignup"] = customers["SignupDate"].sub(min_date).dt.days

In [4]:
# Total transaction value per customer, as a two-column frame
# (CustomerID, TotalSpent) ready to be merged onto the customer table.
customer_spending = (
    transactions
    .groupby("CustomerID", as_index=False)["TotalValue"]
    .sum()
    .rename(columns={"TotalValue": "TotalSpent"})
)

In [5]:
# Merge customer data with spending data
# Drop any TotalSpent column left over from an earlier run of this cell first;
# otherwise a re-run would produce TotalSpent_x / TotalSpent_y and the lookup
# below would fail with a KeyError.
customers = (
    customers
    .drop(columns="TotalSpent", errors="ignore")
    .merge(customer_spending, on="CustomerID", how="left")
)
# Customers with no transactions have no match in the left merge and get NaN;
# treat them as having spent 0. The result is assigned back to the column
# instead of calling fillna(..., inplace=True) on the column selection, which
# may not update `customers` when pandas Copy-on-Write is active.
customers["TotalSpent"] = customers["TotalSpent"].fillna(0)

In [6]:
# Build the feature matrix used for similarity and standardise each column so
# that no single feature dominates purely because of its scale.
# NOTE(review): Region is an integer label code at this point, so it is scaled
# as if it were an ordered numeric value - confirm that this is intended.
features = customers.loc[:, ["Region", "DaysSinceSignup", "TotalSpent"]]
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

<IPython.core.display.Javascript object>

In [9]:
# Pairwise cosine similarity between every pair of rows of the scaled feature
# matrix; entry [i, j] is the similarity between row i and row j.
similarity_matrix = cosine_similarity(X=features_scaled)

In [10]:
# Get top 3 similar customers for first 20 customers
N_TARGET_CUSTOMERS = 20  # how many customers (from the top of the table) to score
TOP_K = 3                # how many lookalikes to keep for each of them

# Pull the IDs out once as plain Python values instead of building a row
# Series with .iloc on every lookup.
customer_ids = customers["CustomerID"].tolist()

lookalike_map = {}

# Never index past the end of the table when it holds fewer customers than
# N_TARGET_CUSTOMERS.
for i in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    scores = similarity_matrix[i]
    # Rank all customers by descending similarity. The stable sort keeps ties
    # in table order, matching a stable descending sort of the raw scores.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer itself by position rather than assuming it is ranked
    # first: another customer with identical features ties with it and may be
    # ranked ahead of it.
    top_matches = ranked[ranked != i][:TOP_K]
    # float(...) turns the NumPy scalar into a plain Python float so that the
    # string form of this list contains bare numbers.
    lookalike_map[customer_ids[i]] = [
        (customer_ids[j], round(float(scores[j]), 4)) for j in top_matches
    ]

In [11]:
# Write the lookalike map to Lookalike.csv: one row per customer, with that
# customer's list of (lookalike id, score) pairs stored as its string form.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": [str(matches) for matches in lookalike_map.values()],
})
lookalike_df.to_csv("Lookalike.csv", index=False)

In [12]:
# Report completion to the notebook output.
print("Lookalike.csv generated successfully!")

Lookalike.csv generated successfully!
