In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Read the datasets
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Convert dates to datetime
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Create customer feature matrix
def create_customer_features(customers_df, transactions_df, products_df):
    # Customer transaction metrics
    customer_metrics = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean'],
        'Quantity': ['sum', 'mean']
    }).round(2)
    
    # Flatten column names
    customer_metrics.columns = ['transaction_count', 'total_spend', 'avg_transaction_value', 
                              'total_quantity', 'avg_quantity']
    
    # Customer product category preferences
    trans_with_products = transactions_df.merge(products_df, on='ProductID')
    category_pivot = pd.crosstab(
        trans_with_products['CustomerID'], 
        trans_with_products['Category'],
        values=trans_with_products['TotalValue'],
        aggfunc='sum'
    ).fillna(0)
    
    # Normalize category preferences
    category_pivot = category_pivot.div(category_pivot.sum(axis=1), axis=0)
    
    # Customer recency and account age
    latest_date = transactions_df['TransactionDate'].max()
    customer_recency = transactions_df.groupby('CustomerID')['TransactionDate'].max()
    customer_recency = (latest_date - customer_recency).dt.days
    
    account_age = (latest_date - customers_df['SignupDate']).dt.days
    
    # Combine all features
    feature_matrix = pd.concat([
        customer_metrics,
        category_pivot,
        customer_recency.rename('recency'),
        account_age.rename('account_age')
    ], axis=1).fillna(0)
    
    return feature_matrix

# Create feature matrix
feature_matrix = create_customer_features(customers_df, transactions_df, products_df)

# Scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)
scaled_features_df = pd.DataFrame(scaled_features, index=feature_matrix.index, 
                                columns=feature_matrix.columns)

def get_lookalikes(customer_id, n_recommendations=3):
    """Get top n similar customers for a given customer ID"""
    if customer_id not in scaled_features_df.index:
        return []
    
    # Calculate cosine similarity
    customer_vector = scaled_features_df.loc[customer_id].values.reshape(1, -1)
    similarities = cosine_similarity(customer_vector, scaled_features_df)
    
    # Get top similar customers (excluding self)
    similar_indices = similarities[0].argsort()[::-1][1:n_recommendations+1]
    similar_scores = similarities[0][similar_indices]
    
    # Get customer IDs and scores
    similar_customers = scaled_features_df.index[similar_indices]
    
    return list(zip(similar_customers, similar_scores))

# Generate recommendations for first 20 customers
results = {}
for cust_id in customers_df['CustomerID'][:20]:
    lookalikes = get_lookalikes(cust_id)
    results[cust_id] = [
        {'customer_id': rec[0], 'similarity_score': round(rec[1], 4)}
        for rec in lookalikes
    ]

# Create output DataFrame
output_rows = []
for cust_id, recommendations in results.items():
    rec_str = '; '.join([
        f"{rec['customer_id']}({rec['similarity_score']})"
        for rec in recommendations
    ])
    output_rows.append({
        'CustomerID': cust_id,
        'Recommendations': rec_str
    })

output_df = pd.DataFrame(output_rows)

# Save to CSV
output_df.to_csv('Lookalike.csv', index=False)

# Display results
print("Lookalike recommendations for first 20 customers:")
print("\nFormat: CustomerID -> [Recommended_Customer1(similarity_score), ...]")
print("-" * 80)
for cust_id, recs in results.items():
    print(f"{cust_id} -> {recs}")

Lookalike recommendations for first 20 customers:

Format: CustomerID -> [Recommended_Customer1(similarity_score), ...]
--------------------------------------------------------------------------------
C0001 -> [{'customer_id': 'C0069', 'similarity_score': 0.9682}, {'customer_id': 'C0072', 'similarity_score': 0.9401}, {'customer_id': 'C0157', 'similarity_score': 0.9276}]
C0002 -> [{'customer_id': 'C0159', 'similarity_score': 0.9597}, {'customer_id': 'C0134', 'similarity_score': 0.9541}, {'customer_id': 'C0178', 'similarity_score': 0.935}]
C0003 -> [{'customer_id': 'C0166', 'similarity_score': 0.9559}, {'customer_id': 'C0007', 'similarity_score': 0.946}, {'customer_id': 'C0026', 'similarity_score': 0.9225}]
C0004 -> [{'customer_id': 'C0075', 'similarity_score': 0.9893}, {'customer_id': 'C0065', 'similarity_score': 0.9705}, {'customer_id': 'C0041', 'similarity_score': 0.9582}]
C0005 -> [{'customer_id': 'C0085', 'similarity_score': 0.9525}, {'customer_id': 'C0007', 'similarity_score': 0.92