In [2]:
# Import required libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Set plotting style
sns.set_theme()

In [3]:
# Function to load data
def load_data(data_dir='../data'):
    """Load the customer, product and transaction tables from CSV files.

    Parameters
    ----------
    data_dir : str or path-like, default '../data'
        Directory containing ``Customers.csv``, ``Products.csv`` and
        ``Transactions.csv``. The default matches the original hard-coded
        location, so existing zero-argument calls behave as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers_df, products_df, transactions_df)``, with
        ``customers_df['SignupDate']`` and
        ``transactions_df['TransactionDate']`` converted to datetime.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` if any of the three files is missing.
    """
    data_dir = Path(data_dir)

    customers_df = pd.read_csv(data_dir / 'Customers.csv')
    products_df = pd.read_csv(data_dir / 'Products.csv')
    transactions_df = pd.read_csv(data_dir / 'Transactions.csv')

    # Dates arrive as strings; convert them so date arithmetic works downstream
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    return customers_df, products_df, transactions_df

# Load the customer, product and transaction tables into notebook-level
# DataFrames that the following cells read from
customers_df, products_df, transactions_df = load_data()

In [4]:
def create_customer_features(customers_df, products_df, transactions_df):
    """Build one feature row per customer from their transaction history.

    The result is indexed by CustomerID and holds one column per product
    category (total quantity bought in that category) followed by
    ``transaction_count``, ``total_spend``, ``avg_spend`` and ``total_items``.
    ``customers_df`` is accepted for interface symmetry but is not read.
    """
    # Per-customer spending summary; keyword order fixes the column order
    spend_summary = transactions_df.groupby('CustomerID').agg(
        transaction_count=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_spend=('TotalValue', 'mean'),
        total_items=('Quantity', 'sum'),
    )

    # Attach each transaction's product category, then total the quantity
    # bought per (customer, category); unseen combinations become 0
    category_lookup = products_df[['ProductID', 'Category']]
    quantity_by_category = (
        transactions_df
        .merge(category_lookup, on='ProductID')
        .pivot_table(
            values='Quantity',
            index='CustomerID',
            columns='Category',
            aggfunc='sum',
            fill_value=0,
        )
    )

    # Index-on-index join keeps only customers present in both tables
    return quantity_by_category.merge(
        spend_summary,
        left_index=True,
        right_index=True,
    )

# Build the per-customer feature matrix, then report its size and columns
customer_features = create_customer_features(
    customers_df=customers_df,
    products_df=products_df,
    transactions_df=transactions_df,
)
print("Feature matrix shape:", customer_features.shape)
print("\nFeatures included:", list(customer_features.columns))

Feature matrix shape: (200, 8)

Features included: ['Books', 'Clothing', 'Electronics', 'Home & Garden', 'transaction_count', 'total_spend', 'avg_spend', 'total_items']


In [5]:
def find_similar_customers(customer_features, customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Features are standardized column-wise and compared with cosine
    similarity.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        Numeric feature matrix indexed by customer ID (one row per customer).
        The index is expected to be unique.
    customer_id : hashable
        Index label of the target customer.
    n : int, default 3
        Number of lookalikes to return. Fewer rows are returned when the
        matrix holds fewer than ``n`` other customers.

    Returns
    -------
    pandas.DataFrame
        Columns ``similar_customer_id`` and ``similarity_score``, ordered from
        most to least similar. The target customer is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not in ``customer_features.index``.
    """
    # Resolve the target first so an unknown ID fails before any computation
    customer_idx = customer_features.index.get_loc(customer_id)

    # Standardize so large-magnitude columns do not dominate the angle
    scaled_features = StandardScaler().fit_transform(customer_features)

    # Only the target's row of the similarity matrix is needed
    target_similarities = cosine_similarity(
        scaled_features[customer_idx:customer_idx + 1],
        scaled_features,
    )[0]

    # Rank by descending similarity and drop the target by position. Skipping
    # "the first result" is not safe: a customer with an identical feature
    # vector ties at 1.0 and may sort ahead of the target itself.
    ranked = target_similarities.argsort()[::-1]
    similar_indices = ranked[ranked != customer_idx][:n]

    similar_customers = pd.DataFrame({
        'similar_customer_id': customer_features.index[similar_indices],
        'similarity_score': target_similarities[similar_indices]
    })

    return similar_customers

# Limit the lookalike search to the first 20 customers in the customer table
target_customers = customers_df['CustomerID'].head(20)

# Pair each target customer with the table of its nearest neighbours
results = []
for customer_id in target_customers:
    similar_customers = find_similar_customers(customer_features, customer_id)
    results.append(dict(customer_id=customer_id, similar_customers=similar_customers))

In [6]:
# Destination of the exported lookalike table
OUTPUT_PATH = '../reports/Lookalike.csv'


def _lookalike_row(result):
    """Flatten one result into a wide row.

    The row starts with ``target_customer`` and is followed by a
    ``similar_customer_<k>`` / ``similarity_score_<k>`` pair for every
    neighbour found, numbered from 1 in ranking order. Driving the columns
    from the neighbours actually present avoids an IndexError when a target
    has fewer matches than expected.
    """
    row = {'target_customer': result['customer_id']}
    matches = result['similar_customers']
    ranked_matches = zip(matches['similar_customer_id'], matches['similarity_score'])
    for rank, (match_id, score) in enumerate(ranked_matches, start=1):
        row[f'similar_customer_{rank}'] = match_id
        row[f'similarity_score_{rank}'] = score
    return row


# Create output DataFrame: one row per target customer
output_df = pd.DataFrame([_lookalike_row(r) for r in results])

# Save results, creating the reports directory on a fresh checkout
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(OUTPUT_PATH, index=False)
print(f"Results saved to '{OUTPUT_PATH}'")
display(output_df)

Results saved to '../reports/Lookalike.csv'


Unnamed: 0,target_customer,similar_customer_1,similarity_score_1,similar_customer_2,similarity_score_2,similar_customer_3,similarity_score_3
0,C0001,C0025,0.941557,C0109,0.828547,C0063,0.827273
1,C0002,C0066,0.963468,C0087,0.960637,C0020,0.948667
2,C0003,C0184,0.895497,C0079,0.789857,C0076,0.764142
3,C0004,C0023,0.888469,C0103,0.839457,C0127,0.813953
4,C0005,C0144,0.858129,C0083,0.851183,C0029,0.805656
5,C0006,C0089,0.925869,C0088,0.897347,C0100,0.896274
6,C0007,C0070,0.959461,C0186,0.882516,C0159,0.857345
7,C0008,C0145,0.954601,C0090,0.938301,C0187,0.937118
8,C0009,C0023,0.891941,C0101,0.764511,C0094,0.751601
9,C0010,C0035,0.788125,C0185,0.764845,C0194,0.76418


In [7]:
# Read the exported file back to confirm what was actually written to disk
print("Verifying Lookalike.csv format...")
lookalike_df = pd.read_csv('../reports/Lookalike.csv')
print("\nColumns:", list(lookalike_df.columns))
print("\nFirst few rows:")
print(lookalike_df.head(5))

Verifying Lookalike.csv format...

Columns: ['target_customer', 'similar_customer_1', 'similarity_score_1', 'similar_customer_2', 'similarity_score_2', 'similar_customer_3', 'similarity_score_3']

First few rows:
  target_customer similar_customer_1  similarity_score_1 similar_customer_2  \
0           C0001              C0025            0.941557              C0109   
1           C0002              C0066            0.963468              C0087   
2           C0003              C0184            0.895497              C0079   
3           C0004              C0023            0.888469              C0103   
4           C0005              C0144            0.858129              C0083   

   similarity_score_2 similar_customer_3  similarity_score_3  
0            0.828547              C0063            0.827273  
1            0.960637              C0020            0.948667  
2            0.789857              C0076            0.764142  
3            0.839457              C0127            0.813953