In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
import pandas as pd

# Read the two input tables: the transaction log and the customer master data.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs on a machine where that directory exists.
transactions_file = r"D:\Company assignments\Zeotap_Data_science_assignment\data\Transactions.csv"
customers_file = r"D:\Company assignments\Zeotap_Data_science_assignment\data\Customers.csv"

# Transactions are read first, then customers (same order as the paths below).
transactions, customers = (
    pd.read_csv(csv_path) for csv_path in (transactions_file, customers_file)
)

### Feature Engineering

In [4]:
# 1. Total spend per customer: sum TotalValue over each customer's transactions.
customer_spending = (
    transactions
    .groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpend')   # name the aggregated Series before it becomes a column
    .reset_index()          # CustomerID back to a regular column for merging
)

In [5]:
# 2. Purchase history: total Quantity bought per (customer, product) pair,
#    pivoted so that each ProductID becomes its own column.
product_purchase = (
    transactions
    .groupby(['CustomerID', 'ProductID'])['Quantity']
    .sum()
    .unstack(fill_value=0)   # products a customer never bought get quantity 0
    .reset_index()           # CustomerID back to a regular column for merging
)

In [6]:
# 3. Combine customer demographic info and transaction data

# Engineered numeric features used for similarity: total spend plus one
# quantity column per product (every product_purchase column except the key).
feature_columns = ['TotalSpend'] + list(product_purchase.columns.drop('CustomerID'))

# Left joins keep every row of `customers`; customers without transactions get
# NaN in the engineered columns.
customer_data = (
    customers
    .merge(customer_spending, on='CustomerID', how='left')
    .merge(product_purchase, on='CustomerID', how='left')
    # Fill ONLY the engineered features. A blanket fillna(0) would also write
    # the integer 0 into any missing value of the non-feature columns coming
    # from `customers`, silently mixing types there.
    .fillna(dict.fromkeys(feature_columns, 0))
)

# Standardise each feature (zero mean, unit variance) so that no single column
# dominates the similarity purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_data[feature_columns])

# Pairwise cosine similarity between customers; row i / column j compares the
# i-th and j-th rows of customer_data.
cos_sim_matrix = cosine_similarity(scaled_features)

In [7]:
# Extract the top 3 lookalike customers for the first 20 customers
TOP_K = 3                 # lookalikes reported per customer
N_TARGET_CUSTOMERS = 20   # only the first N rows of customer_data are scored

# Positional array of IDs: row i of cos_sim_matrix corresponds to customer_ids[i].
customer_ids = customer_data['CustomerID'].to_numpy()

lookalikes = {}
# Clamp the loop so a dataset with fewer than N_TARGET_CUSTOMERS rows does not
# raise an IndexError.
for customer_idx in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    sim_scores = cos_sim_matrix[customer_idx]

    # Rank all customers by descending similarity. A stable sort on the negated
    # scores makes tie-breaking deterministic (lower row index first).
    ranked = (-sim_scores).argsort(kind='stable')

    # Remove the customer's own row by index rather than assuming it is the
    # single highest score: another customer with an identical feature vector,
    # or floating-point rounding, can tie with (or exceed) the self-similarity,
    # in which case slicing off "the last element" would drop a real neighbour
    # and report the customer as their own lookalike.
    similar_indices = ranked[ranked != customer_idx][:TOP_K]

    lookalikes[customer_ids[customer_idx]] = [
        (customer_ids[i], sim_scores[i]) for i in similar_indices
    ]

In [8]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into a table with
# one row per (customer, lookalike) pair.
lookalike_df = pd.DataFrame(
    [
        [customer_id, similar_customer, score]
        for customer_id, similar_customers in lookalikes.items()
        for similar_customer, score in similar_customers
    ],
    columns=["CustomerID", "LookalikeID", "SimilarityScore"],
)

In [9]:
# Save the lookalike table as "Lookalike_Model.csv" (relative path, so it lands
# in the current working directory); index=False keeps the DataFrame's row index
# out of the file.
lookalike_df.to_csv("Lookalike_Model.csv", index=False)