# In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables from the working directory, in the order
# Customers -> Products -> Transactions.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

# Parse the date columns in place so later steps get real datetimes
# (e.g. the "max" aggregation over TransactionDate).
for frame, date_column in (
    (customers, "SignupDate"),
    (transactions, "TransactionDate"),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Aggregate transaction features: one row per CustomerID with
#   TotalValue        - sum of TotalValue over the customer's transactions
#   TotalTransactions - number of transactions (count of TransactionID)
#   TransactionDate   - most recent transaction date
customer_spending = (
    transactions.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        TotalTransactions=("TransactionID", "count"),
        TransactionDate=("TransactionDate", "max"),
    )
    .reset_index()
)

# Merge with customer data. The left join keeps customers that have no
# transactions; their aggregate columns come back missing.
# Only the two spend aggregates are zero-filled: a frame-wide fillna(0) would
# also write the integer 0 into the datetime TransactionDate column and into
# any missing customer attribute.
customer_profiles = customers.merge(
    customer_spending, on="CustomerID", how="left"
).fillna({"TotalValue": 0, "TotalTransactions": 0})

# One-hot encode Region, dropping the first level as the baseline.
# NOTE: the resulting indicator columns are kept on customer_profiles but are
# not part of the similarity input below, which uses numeric_features only.
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=["Region"],
    drop_first=True,
)

# Standardize the spend features (zero mean, unit variance) in place.
numeric_features = ["TotalValue", "TotalTransactions"]
scaler = StandardScaler()
customer_profiles[numeric_features] = scaler.fit_transform(
    customer_profiles[numeric_features]
)

# Pairwise cosine similarity between customers; row/column i corresponds to
# customer_ids[i].
customer_ids = customer_profiles["CustomerID"].to_list()
similarity_matrix = cosine_similarity(customer_profiles[numeric_features])

# Generate Lookalike.csv
# For each of the first 20 customers, keep the 3 most similar *other*
# customers as (CustomerID, score rounded to 4 decimals) pairs.
top_lookalikes = {}
for i, cust_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Exclude the customer's own entry by position instead of assuming it
    # sorts first: another customer can tie at the maximum score, and the
    # stable sort would then place that tie ahead of self, so slicing [1:4]
    # could drop a real match and report the customer as its own lookalike.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    top_lookalikes[cust_id] = [
        (customer_ids[idx], round(score, 4)) for idx, score in ranked
    ]

# Flatten each customer's [(lookalike_id, score), ...] list into one row:
# [CustomerID, id1, score1, id2, score2, id3, score3].
lookalike_data = [
    [cust_id, *(value for pair in lookalikes for value in pair)]
    for cust_id, lookalikes in top_lookalikes.items()
]

# Build the output table and write it without the index column.
columns = [
    "CustomerID",
    "Lookalike1", "Score1",
    "Lookalike2", "Score2",
    "Lookalike3", "Score3",
]
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)
lookalike_df.to_csv("Lookalike.csv", index=False)