In [5]:
import csv
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [6]:
# Read the raw transaction and customer tables from the working directory.
transactions = pd.read_csv('Transactions.csv')
customers = pd.read_csv('Customers.csv')

# Feature engineering starts from one row per transaction, enriched with the
# columns of the matching customer. This is an inner join on CustomerID, so
# rows without a match on the other side are dropped.
merged_data = transactions.merge(customers, how='inner', on='CustomerID')

In [7]:
# Create customer profiles: collapse the merged rows to one row per CustomerID.
customer_profiles = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',         # total spend across all of the customer's rows
    'Quantity': 'sum',           # total units across all of the customer's rows
    'TransactionID': 'nunique',  # number of distinct transactions (orders)
    'Region': 'first',           # value taken from the customer's first row
    'SignupDate': 'first'        # value taken from the customer's first row
}).reset_index()

# Average order value = total spend divided by the number of distinct orders.
# Dividing by the summed Quantity instead would give the average price per
# unit, and would divide by zero for a customer whose quantities sum to 0.
# After the aggregation above, 'TransactionID' holds the distinct-order count.
customer_profiles['AvgOrderValue'] = (
    customer_profiles['TotalValue'] / customer_profiles['TransactionID']
)

# Convert SignupDate to whole days elapsed since signup.
# The reference point is the moment this cell runs, so the raw values differ
# from run to run; every customer is shifted by the same amount.
snapshot_time = datetime.now()
signup_dates = pd.to_datetime(customer_profiles['SignupDate'])
customer_profiles['DaysSinceSignup'] = (snapshot_time - signup_dates).dt.days

# One-hot encode Region into Region_* indicator columns. dtype=float keeps the
# indicators numeric (0.0 / 1.0) regardless of the pandas version's default
# dummy dtype, so they combine cleanly with the float feature columns.
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], dtype=float)

In [8]:
# Normalize features: standardize the numeric profile columns with
# StandardScaler. The Region_* indicator columns are not listed here and are
# therefore left untouched.
numerical_features = [
    'TotalValue',
    'Quantity',
    'TransactionID',
    'AvgOrderValue',
    'DaysSinceSignup',
]
scaler = StandardScaler()
unscaled_numeric = customer_profiles[numerical_features]
# Overwrite the selected columns in place with their standardized values.
customer_profiles[numerical_features] = scaler.fit_transform(unscaled_numeric)

In [9]:
# Calculate similarity scores.
# The feature matrix is the numeric feature columns followed by every
# one-hot Region_* column; CustomerID and SignupDate are left out.
region_features = [col for col in customer_profiles.columns if col.startswith('Region_')]
feature_columns = numerical_features + region_features

# Pairwise cosine similarity between all customer rows: entry [i, j] compares
# row i of customer_profiles with row j.
similarity_matrix = cosine_similarity(customer_profiles[feature_columns])

In [10]:
#  Generate Recommendations
def build_lookalike_map(profile_ids, similarity, n_targets=20, top_k=3):
    """Return the most similar other customers for the first ``n_targets`` rows.

    Parameters
    ----------
    profile_ids : sequence
        Customer identifiers, in the same row order as ``similarity``.
    similarity : 2-D array-like
        Square matrix where ``similarity[i][j]`` scores row ``i`` against row ``j``.
    n_targets : int, default 20
        How many leading rows to produce recommendations for. Clamped to the
        number of available rows so a short table does not raise IndexError.
    top_k : int, default 3
        Maximum number of lookalikes per customer; fewer are returned when
        there are not enough other customers.

    Returns
    -------
    dict
        ``{customer_id: [(lookalike_id, score), ...]}`` with each list ordered
        from most to least similar.
    """
    profile_ids = np.asarray(profile_ids, dtype=object)
    similarity = np.asarray(similarity)

    result = {}
    for row in range(min(n_targets, len(profile_ids))):
        scores = similarity[row]
        # Highest score first; the stable sort keeps tied rows in table order.
        ranked = np.argsort(-scores, kind='stable')
        # Drop the customer's own row by index. Slicing off the single best
        # score is not safe: a tie or floating-point rounding can rank another
        # row above the self-match, which would then leak into the results.
        ranked = ranked[ranked != row][:top_k]
        result[profile_ids[row]] = list(zip(profile_ids[ranked], scores[ranked]))
    return result


# NOTE(review): "first 20" means the first 20 rows of customer_profiles, i.e.
# the first 20 CustomerIDs that survived the merge/groupby - confirm that this
# matches the intended set of target customers.
lookalike_map = build_lookalike_map(
    customer_profiles['CustomerID'].to_numpy(),
    similarity_matrix,
)

In [11]:
#  Save Results to CSV
# One output row per (customer, lookalike) pair, in lookalike_map order.
# csv.writer quotes any field that contains a comma, quote or line break,
# which manual string formatting would write out unescaped. lineterminator
# is pinned to '\n' so the file is byte-identical across platforms.
with open('Lookalike.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file, lineterminator='\n')
    writer.writerow(['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
    writer.writerows(
        (cust_id, lookalike_id, score)
        for cust_id, lookalikes in lookalike_map.items()
        for lookalike_id, score in lookalikes
    )

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
