In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
# Read the three input tables, in this order, into one DataFrame each.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Attach the customer row and then the product row to every transaction.
# merge() performs an inner join by default, so transactions whose CustomerID
# or ProductID has no match in the other table are left out.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Collapse the joined rows into one profile row per CustomerID.
customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),  # summed over the customer's rows
        Quantity=('Quantity', 'sum'),      # summed over the customer's rows
        Region=('Region', 'first'),        # value from the customer's first row
        # Most frequent Category among the customer's rows; mode() can return
        # several values on a tie and [0] keeps the first one it lists.
        Category=('Category', lambda x: x.mode()[0]),
    )
    .reset_index()
)

In [4]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Column groups that feed the feature matrix.
categorical_columns = ['Region', 'Category']
numeric_columns = ['TotalValue', 'Quantity']

# Rescale the numeric columns with a default-configured MinMaxScaler.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(customer_profiles[numeric_columns])

# Expand the categorical columns into one-hot indicator columns and turn the
# encoder's output into a dense array with .toarray().
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(customer_profiles[categorical_columns]).toarray()

# Final feature matrix: numeric columns first, one-hot columns after them.
features = np.concatenate([scaled_features, encoded_features], axis=1)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix: cosine similarity between every pair of rows of
# `features`. With a single argument, cosine_similarity compares the rows of
# `features` against each other, so entry [i, j] scores row i against row j.
similarity_matrix = cosine_similarity(features)

# Function to get top 3 lookalikes
def get_lookalikes(customer_index, similarity_matrix, top_n=3):
    """Return the ``top_n`` most similar *other* customers for one customer.

    Parameters
    ----------
    customer_index : int
        Positional row of the query customer. It indexes
        ``similarity_matrix`` and is also the position used to look rows up
        in the module-level ``customer_profiles`` frame via ``.iloc``.
    similarity_matrix : numpy.ndarray
        Indexed as ``similarity_matrix[customer_index]`` to obtain the query
        customer's score against every customer position.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of tuple
        ``(CustomerID, score)`` pairs ordered from highest to lowest score.
        The query customer itself is never included. Fewer than ``top_n``
        pairs are returned when there are not enough other customers.
    """
    similarities = similarity_matrix[customer_index]

    # Normalise a negative index so it can be compared with sorted positions.
    self_position = customer_index % len(similarities)

    # Rank positions from highest to lowest score. The stable sort keeps tied
    # scores in ascending position order, so the result is deterministic.
    ranked = (-similarities).argsort(kind='stable')

    # Remove the query customer by position instead of assuming it sorts
    # last: another customer can tie with (or, through rounding, exceed) the
    # self-score, in which case slicing off the final element would drop a
    # real match and report the customer as its own lookalike.
    top_indices = ranked[ranked != self_position][:top_n]

    return [(customer_profiles.iloc[i]['CustomerID'], similarities[i]) for i in top_indices]

# Generate lookalikes for the first 20 customers (by row position in
# customer_profiles), or for all of them when there are fewer than 20.
# Clamping the range avoids the IndexError that .iloc[i] raises once i runs
# past the last profile row.
N_TARGET_CUSTOMERS = 20

# Maps each target CustomerID to the list returned by get_lookalikes.
lookalike_map = {}
for i in range(min(N_TARGET_CUSTOMERS, len(customer_profiles))):
    customer_id = customer_profiles.iloc[i]['CustomerID']
    lookalike_map[customer_id] = get_lookalikes(i, similarity_matrix)

# Save to CSV: one header row, then one row per (customer, lookalike) pair in
# the iteration order of lookalike_map.
import csv

# Lazily flatten {customer: [(lookalike, score), ...]} into CSV rows.
lookalike_rows = (
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
)

with open('Lookalike.csv', 'w', newline='') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['CustomerID', 'LookalikeID', 'SimilarityScore'])
    csv_writer.writerows(lookalike_rows)