In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three raw tables (customers, product catalogue, transactions)
# in one pass; the tuple order below fixes which frame gets which file.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Customers.csv',
        'data/Products.csv',
        'data/Transactions.csv',
    )
)

In [3]:
def create_customer_features(transactions=None, products=None):
    """Build one row of purchase-behaviour features per customer.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Transaction records; the columns ``CustomerID``, ``ProductID``,
        ``TotalValue``, ``Quantity`` and ``TransactionDate`` are read.
        Defaults to the notebook-level ``transactions_df``.
    products : pandas.DataFrame, optional
        Product catalogue; the columns ``ProductID`` and ``Category`` are
        read. Defaults to the notebook-level ``products_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID``. Columns, in order: ``total_spend``,
        ``avg_transaction_value``, ``transaction_count``, ``total_quantity``,
        ``avg_quantity``, one spend column per product ``Category``, and
        ``recency`` (whole days between the customer's latest transaction and
        the latest transaction in the whole table).

    Notes
    -----
    The input frames are not modified: dates are parsed into a local Series
    instead of being written back to ``transactions``.
    """
    # Fall back to the notebook globals so existing zero-argument calls work.
    if transactions is None:
        transactions = transactions_df
    if products is None:
        products = products_df

    # Spend / quantity aggregates per customer.
    purchase_features = transactions.groupby('CustomerID').agg(
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        transaction_count=('TotalValue', 'count'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
    ).fillna(0)

    # Spend per product category. The merge is an inner join, so transactions
    # whose ProductID is missing from the catalogue are dropped and a customer
    # with only such transactions would be absent from the pivot. Reindexing
    # gives those customers explicit zero spend instead of NaN, which would
    # otherwise propagate into scaling and the similarity computation.
    trans_products = transactions.merge(products, on='ProductID')
    category_spending = trans_products.pivot_table(
        index='CustomerID',
        columns='Category',
        values='TotalValue',
        aggfunc='sum',
        fill_value=0,
    ).reindex(purchase_features.index, fill_value=0)

    # Recency, measured against the most recent transaction in the data.
    # Dates are parsed locally so the caller's frame keeps its original dtype.
    transaction_dates = pd.to_datetime(transactions['TransactionDate'])
    latest_transaction = transaction_dates.groupby(transactions['CustomerID']).max()
    recency = (latest_transaction.max() - latest_transaction).dt.days

    return purchase_features.join(category_spending).join(recency.rename('recency'))

In [4]:
def find_lookalikes(customer_id, feature_matrix, n_recommendations=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the query customer in ``feature_matrix.index``.
    feature_matrix : pandas.DataFrame
        One row of numeric features per customer, indexed by customer id.
        The index is assumed to be unique, so that ``get_loc`` yields a
        single integer position.
    n_recommendations : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer id, similarity) tuples
        Ordered by descending cosine similarity. The query customer is never
        included. Fewer than ``n_recommendations`` entries are returned when
        the matrix does not hold enough other customers.

    Raises
    ------
    KeyError
        If ``customer_id`` is not in ``feature_matrix.index``.
    """
    customer_index = feature_matrix.index.get_loc(customer_id)
    similarities = cosine_similarity(
        feature_matrix.iloc[[customer_index]], feature_matrix
    )[0]

    # Rank every row by descending similarity. The stable sort keeps ties in
    # index order so the result is deterministic.
    ranked = np.argsort(-similarities, kind='stable')

    # Drop the query row by position rather than assuming it sorts first:
    # another customer with an identical feature vector ties at 1.0 and could
    # otherwise push the query customer into the returned slice.
    top = [idx for idx in ranked if idx != customer_index][:max(n_recommendations, 0)]

    return [(feature_matrix.index[idx], similarities[idx]) for idx in top]

In [5]:
# Build the per-customer feature table.
customer_features = create_customer_features()

# Standardise every feature (zero mean, unit variance) so that no single
# column dominates the cosine similarity purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)
scaled_features_df = pd.DataFrame(
    scaled_features,
    index=customer_features.index,
    columns=customer_features.columns,
)

# Lookalikes for the first 20 customers listed in customers_df.
results = {}
for customer_id in customers_df['CustomerID'][:20]:
    if customer_id not in scaled_features_df.index:
        # A customer without any transaction has no feature row, and
        # find_lookalikes would raise KeyError for it. Record an empty
        # list so the customer still appears in the output.
        results[customer_id] = []
        continue
    recommendations = find_lookalikes(customer_id, scaled_features_df)
    results[customer_id] = [(cust_id, float(score)) for cust_id, score in recommendations]

# One output row per customer; the recommendations are rendered as
# "[(id,score),(id,score),...]" with scores rounded to 4 decimals.
lookalike_rows = [
    {
        'CustomerID': cust_id,
        'Recommendations': '[' + ','.join(f"({cid},{score:.4f})" for cid, score in recs) + ']',
    }
    for cust_id, recs in results.items()
]

# The Recommendations text contains commas, so it has to be quoted to remain
# a single CSV field; to_csv applies that quoting automatically, which the
# previous hand-written file did not.
pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Recommendations']).to_csv(
    'Anurag_Pathak_Lookalike.csv', index=False
)

In [6]:
def _customer_row(cust_id):
    """Return the first row of customers_df whose CustomerID equals cust_id."""
    matches = customers_df.loc[customers_df['CustomerID'].eq(cust_id)]
    return matches.iloc[0]


# Human-readable report: each customer followed by its lookalikes and scores.
for customer_id, recommendations in results.items():
    current_customer = _customer_row(customer_id)
    print(f"\nCustomer {customer_id} - {current_customer['CustomerName']} ({current_customer['Region']}) lookalikes:")

    for rec_id, score in recommendations:
        rec_customer = _customer_row(rec_id)
        print(f"  {rec_id} - {rec_customer['CustomerName']} ({rec_customer['Region']}): {score:.4f}")


Customer C0001 - Lawrence Carroll (South America) lookalikes:
  C0069 - Stacy Foster (Europe): 0.9234
  C0072 - Sarah Scott (North America): 0.8632
  C0183 - Kimberly Johnson (North America): 0.8217

Customer C0002 - Elizabeth Lutz (Asia) lookalikes:
  C0029 - Erin Manning (North America): 0.8631
  C0036 - Brian Aguilar DDS (North America): 0.8521
  C0159 - Austin Miller (Asia): 0.8472

Customer C0003 - Michael Rivera (South America) lookalikes:
  C0144 - Andrea Hart (North America): 0.7973
  C0026 - Sara Miller (North America): 0.7181
  C0166 - John Rogers (Europe): 0.7046

Customer C0004 - Kathleen Rodriguez (South America) lookalikes:
  C0075 - Misty Higgins (Europe): 0.9735
  C0065 - Gerald Hines (North America): 0.9230
  C0113 - Joseph Ortiz Jr. (South America): 0.8727

Customer C0005 - Laura Weber (Asia) lookalikes:
  C0085 - Richard Brown (South America): 0.8880
  C0123 - Jason Johnston (Asia): 0.8496
  C0095 - William Walker (South America): 0.8317

Customer C0006 - Brittany P