In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge transactions with products to get product category and price
transactions = pd.merge(transactions, products, on='ProductID')

# Feature Engineering: Create customer features
# 1. Total spending per customer
total_spending = transactions.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spending.columns = ['CustomerID', 'TotalSpending']

# 2. Average transaction value per customer
avg_transaction_value = transactions.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_transaction_value.columns = ['CustomerID', 'AvgTransactionValue']

# 3. Favorite product category (most purchased category)
favorite_category = transactions.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
favorite_category = favorite_category.loc[favorite_category.groupby('CustomerID')['Count'].idxmax()]
favorite_category = favorite_category[['CustomerID', 'Category']]
favorite_category.columns = ['CustomerID', 'FavoriteCategory']

# 4. Number of transactions per customer
num_transactions = transactions.groupby('CustomerID').size().reset_index(name='NumTransactions')

# 5. Total quantity purchased per customer
total_quantity = transactions.groupby('CustomerID')['Quantity'].sum().reset_index()
total_quantity.columns = ['CustomerID', 'TotalQuantity']

# Merge all features into a single customer feature dataframe
customer_features = pd.merge(customers, total_spending, on='CustomerID', how='left')
customer_features = pd.merge(customer_features, avg_transaction_value, on='CustomerID', how='left')
customer_features = pd.merge(customer_features, favorite_category, on='CustomerID', how='left')
customer_features = pd.merge(customer_features, num_transactions, on='CustomerID', how='left')
customer_features = pd.merge(customer_features, total_quantity, on='CustomerID', how='left')

# Handle missing values (if any)
customer_features.fillna(0, inplace=True)

# Encode categorical features (FavoriteCategory and Region)
customer_features = pd.get_dummies(customer_features, columns=['FavoriteCategory', 'Region'], drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['TotalSpending', 'AvgTransactionValue', 'NumTransactions', 'TotalQuantity']
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

# Drop non-relevant columns for similarity calculation
customer_features.set_index('CustomerID', inplace=True)
customer_features.drop(columns=['CustomerName', 'SignupDate'], inplace=True)

# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(customer_features)

# Convert similarity matrix to a dataframe
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features.index, columns=customer_features.index)

# Function to get top 3 similar customers
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    # Exclude the customer itself (similarity score = 1)
    similar_customers = similarity_df[customer_id].drop(customer_id).sort_values(ascending=False).head(top_n)
    return similar_customers

# Generate recommendations for the first 20 customers (C0001 - C0020)
lookalike_map = {}
for customer_id in customer_features.index[:20]:
    similar_customers = get_top_similar_customers(customer_id, similarity_df)
    # Convert numpy.float64 to native Python float for better CSV compatibility
    lookalike_map[customer_id] = [[similar_customer, float(score)] for similar_customer, score in zip(similar_customers.index, similar_customers.values)]

# Convert the map to a dataframe
lookalike_df = pd.DataFrame(lookalike_map.items(), columns=['CustomerID', 'Lookalikes'])

# Save the lookalike recommendations to a CSV file
lookalike_df.to_csv('Ankur_Bhadauria_Lookalike.csv', index=False)

# Print the lookalike recommendations
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [[C0190, 0.9449796330312132], [C0048, 0.943297...
1       C0002  [[C0088, 0.9567950735454178], [C0077, 0.908192...
2       C0003  [[C0052, 0.9054432967127763], [C0152, 0.837458...
3       C0004  [[C0165, 0.9764434526284471], [C0169, 0.928839...
4       C0005  [[C0186, 0.9780999104551312], [C0146, 0.962233...
5       C0006  [[C0168, 0.9735222664202972], [C0187, 0.941447...
6       C0007  [[C0140, 0.9807378835067342], [C0115, 0.930148...
7       C0008  [[C0109, 0.8455416700232936], [C0084, 0.815627...
8       C0009  [[C0198, 0.9643397739032491], [C0062, 0.929478...
9       C0010  [[C0111, 0.9230243277923992], [C0062, 0.904731...
10      C0011  [[C0137, 0.9378688431670885], [C0126, 0.920115...
11      C0012  [[C0104, 0.9736188526206875], [C0113, 0.939148...
12      C0013  [[C0099, 0.9830898690500509], [C0108, 0.927453...
13      C0014  [[C0060, 0.9784678112789252], [C0128, 0.958410...
14      C0015  [[C0131, 0