In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the three source tables from the working directory
customers, transactions, products = (
    pd.read_csv(path)
    for path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use the default inner join, so a transaction whose CustomerID or
# ProductID has no match in the corresponding table is dropped.
data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# Feature Engineering
# All three numeric aggregates are computed over the same per-customer grouping
per_customer = data.groupby('CustomerID')

# Total revenue per customer (sum of TotalValue)
customer_revenue = per_customer['TotalValue'].sum().rename('TotalRevenue').reset_index()

# Number of transactions per customer (count of TransactionID)
customer_transactions = per_customer['TransactionID'].count().rename('NumTransactions').reset_index()

# Average transaction value per customer (mean of TotalValue)
customer_avg_value = per_customer['TotalValue'].mean().rename('AvgTransactionValue').reset_index()

# Preferred product category per customer: count rows per (customer, category)
# pair, then keep the row with the highest count for each customer.
# idxmax returns the first maximal row, so ties go to whichever category
# appears first in the grouped counts.
category_counts = data.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
top_rows = category_counts.groupby('CustomerID')['Count'].idxmax()
preferred_category = category_counts.loc[top_rows, ['CustomerID', 'Category']]

# Merge features into a single DataFrame, one row per customer.
# Inner joins: customers that have no rows in `data` are not carried over.
customer_features = (
    customers
    .merge(customer_revenue, on='CustomerID')
    .merge(customer_transactions, on='CustomerID')
    .merge(customer_avg_value, on='CustomerID')
    .merge(preferred_category, on='CustomerID')
)

# Encode categorical variables: one indicator column per distinct Region
# and per distinct preferred Category
categorical_input = customer_features[['Region', 'Category']]
encoder = OneHotEncoder()
encoded = encoder.fit_transform(categorical_input)
# Densify so the encoded block can be wrapped in a DataFrame below
categorical_features = encoded.toarray()

# Scale numerical features so the three spend metrics are standardized
numerical_input = customer_features[['TotalRevenue', 'NumTransactions', 'AvgTransactionValue']]
scaler = StandardScaler()
numerical_features = scaler.fit_transform(numerical_input)

# Combine numerical and categorical features into a single matrix
# (numerical columns first, then the one-hot columns; rows stay in
# customer_features order)
features = pd.concat(
    [pd.DataFrame(block) for block in (numerical_features, categorical_features)],
    axis=1,
)

# Compute pairwise similarity scores: entry [i][j] compares row i with row j
similarity_matrix = cosine_similarity(features)

# Get top 3 lookalikes for the first 20 customers
# Row i of similarity_matrix corresponds to the i-th row of customer_features,
# so IDs are looked up by position (a plain list) rather than by index label.
customer_ids = customer_features['CustomerID'].tolist()
lookalike_map = {}
for i, customer_id in enumerate(customer_ids[:20]):
    # Exclude the customer itself by position, not by assuming it sorts first:
    # another customer can tie at 1.0 (or edge past the self score through
    # floating-point error), in which case slicing off the first sorted entry
    # would drop a real match and keep the customer as its own lookalike.
    scores = [(j, score) for j, score in enumerate(similarity_matrix[i]) if j != i]
    # Sort by score in descending order and keep the three best matches
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:3]
    # Map top 3 lookalikes with similarity scores; convert to a built-in float
    # so the tuples written to the CSV contain plain numbers rather than the
    # repr of a NumPy scalar.
    lookalike_map[customer_id] = [(customer_ids[j], round(float(score), 2)) for j, score in scores]

# Create a DataFrame for better output formatting: one row per customer,
# each cell a (lookalike CustomerID, similarity score) tuple
lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])

# Print the DataFrame
print(lookalike_df)

# Save the lookalike map to a CSV file
lookalike_df.to_csv('Lookalike.csv', index_label='CustomerID')

          Lookalike1     Lookalike2     Lookalike3
C0001  (C0190, 0.97)  (C0048, 0.94)  (C0181, 0.91)
C0002  (C0088, 0.97)  (C0134, 0.94)   (C0106, 0.9)
C0003  (C0052, 0.98)  (C0152, 0.93)  (C0031, 0.89)
C0004  (C0165, 0.97)  (C0155, 0.96)  (C0169, 0.89)
C0005  (C0186, 0.98)  (C0146, 0.96)   (C0007, 0.9)
C0006  (C0168, 0.97)  (C0171, 0.95)  (C0187, 0.94)
C0007  (C0140, 0.98)  (C0115, 0.93)   (C0005, 0.9)
C0008  (C0109, 0.87)  (C0139, 0.81)  (C0098, 0.79)
C0009  (C0010, 0.98)  (C0198, 0.95)  (C0062, 0.93)
C0010  (C0009, 0.98)  (C0111, 0.97)  (C0103, 0.96)
C0011  (C0137, 0.96)  (C0169, 0.92)  (C0126, 0.92)
C0012  (C0104, 0.97)  (C0113, 0.93)  (C0195, 0.91)
C0013  (C0099, 0.99)  (C0108, 0.92)  (C0141, 0.84)
C0014  (C0060, 0.98)  (C0151, 0.91)   (C0097, 0.9)
C0015  (C0036, 0.98)  (C0131, 0.97)  (C0058, 0.84)
C0016   (C0183, 1.0)  (C0067, 0.92)  (C0042, 0.75)
C0017  (C0075, 0.97)  (C0081, 0.86)  (C0057, 0.85)
C0018  (C0117, 0.95)  (C0185, 0.84)  (C0046, 0.83)
C0019   (C0121, 0.9)  (C0081, 0