In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [24]:
def load_and_prepare_data():
    """Load customer, product, and transaction data from Google Drive links"""
    customers_df = pd.read_csv('https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE&export=download')
    products_df = pd.read_csv('https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0&export=download')
    transactions_df = pd.read_csv('https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF&export=download')

    # Convert date columns to datetime
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    return customers_df, products_df, transactions_df


In [25]:
def create_customer_features(customers_df, transactions_df, products_df):
    """Create comprehensive feature matrix for customers"""
    # Calculate account age
    customers_df['AccountAge'] = (pd.Timestamp.now() - customers_df['SignupDate']).dt.days

    # One-hot encode region
    region_dummies = pd.get_dummies(customers_df['Region'], prefix='Region')

    # Transaction-based features - Fixed aggregation
    transaction_features = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'Quantity': ['sum', 'mean'],
        'TotalValue': ['sum', 'mean'],
        'Price': 'mean'
    })

    # Flatten column names
    transaction_features.columns = [
        f"{col[0]}_{col[1]}" if col[1] else col[0]
        for col in transaction_features.columns
    ]

    # Rename columns for clarity
    transaction_features.columns = [
        'TransactionCount',
        'TotalQuantity',
        'AvgQuantity',
        'TotalSpend',
        'AvgSpend',
        'AvgPrice'
    ]

    # Fill NaN values with 0 for customers with no transactions
    transaction_features = transaction_features.fillna(0)

    # Product category preferences
    product_categories = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
    category_pivot = pd.pivot_table(
        product_categories,
        values='Quantity',
        index='CustomerID',
        columns='Category',
        aggfunc='sum',
        fill_value=0
    )

    # Normalize category preferences
    category_sums = category_pivot.sum(axis=1)
    # Handle zero sums to avoid division by zero
    category_sums = category_sums.replace(0, 1)
    category_preferences = category_pivot.div(category_sums, axis=0)
    category_preferences.columns = [f'Category_{col}' for col in category_preferences.columns]

    # Combine all features
    base_features = customers_df[['CustomerID', 'AccountAge']].set_index('CustomerID')

    # Ensure all customers are included even if they have no transactions
    all_customers = pd.DataFrame(index=customers_df['CustomerID'])

    feature_matrix = pd.concat([
        all_customers,
        base_features,
        region_dummies,
        transaction_features,
        category_preferences
    ], axis=1)

    # Fill any remaining NaN values with 0
    feature_matrix = feature_matrix.fillna(0)

    return feature_matrix

In [26]:

def calculate_similarity_matrix(feature_matrix):
    """Calculate similarity scores between customers"""
    # Standardize features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(feature_matrix)

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(scaled_features)

    # Convert to DataFrame with customer IDs
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=feature_matrix.index,
        columns=feature_matrix.index
    )

    return similarity_df

In [27]:
def get_lookalikes(similarity_df, customer_id, n=3):
    """Get top N similar customers for a given customer ID"""
    if customer_id not in similarity_df.index:
        return []

    # Get similarities for the customer and sort
    customer_similarities = similarity_df[customer_id].sort_values(ascending=False)

    # Remove the customer themselves and get top N
    top_similar = customer_similarities[customer_similarities.index != customer_id][:n]

    return [
        {'customer_id': cust_id, 'similarity_score': score}
        for cust_id, score in top_similar.items()
    ]

In [28]:

def generate_lookalike_recommendations():
    """Generate lookalike recommendations for customers C0001-C0020"""
    # Load and prepare data
    print("Loading data...")
    customers_df, products_df, transactions_df = load_and_prepare_data()

    # Create feature matrix
    print("Creating customer features...")
    feature_matrix = create_customer_features(customers_df, transactions_df, products_df)

    # Calculate similarity matrix
    print("Calculating similarity scores...")
    similarity_df = calculate_similarity_matrix(feature_matrix)

    # Generate recommendations for C0001-C0020
    print("Generating lookalike recommendations...")
    recommendations = {}
    for i in range(1, 21):
        customer_id = f'C{i:04d}'
        lookalikes = get_lookalikes(similarity_df, customer_id)
        if lookalikes:
            recommendations[customer_id] = lookalikes

    # Create output DataFrame
    output_rows = []
    for cust_id, lookalikes in recommendations.items():
        lookalike_str = '|'.join(
            f"{rec['customer_id']}:{rec['similarity_score']:.4f}"
            for rec in lookalikes
        )
        output_rows.append({
            'CustomerID': cust_id,
            'Lookalikes': lookalike_str
        })

    # Save to CSV
    output_df = pd.DataFrame(output_rows)
    output_df.to_csv('Akanshu_Aich_Lookalike.csv', index=False)


    # Display first few recommendations
    print("\nFirst few recommendations:")
    print(output_df.head())

    return output_df



In [29]:

recommendations = generate_lookalike_recommendations()

Loading data...
Creating customer features...
Calculating similarity scores...
Generating lookalike recommendations...

First few recommendations:
  CustomerID                              Lookalikes
0      C0001  C0023:0.9462|C0045:0.9141|C0005:0.9091
1      C0002  C0159:0.9685|C0134:0.9301|C0133:0.9103
2      C0003  C0195:0.9101|C0031:0.9026|C0170:0.8836
3      C0004  C0113:0.9720|C0075:0.9682|C0017:0.9450
4      C0005  C0007:0.9779|C0197:0.9529|C0095:0.9164
