<a href="https://colab.research.google.com/github/003Palkush/Zeotap/blob/main/Palkush_Dave_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies and Loading Files

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables (customer profiles, product catalogue and
# transaction log) from CSV files in the working directory.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)




In [3]:
# Attach each transaction's product category so purchases can later be
# aggregated per category.
# Any 'Category' column left over from a previous run of this cell is dropped
# first: merging a second time would otherwise produce 'Category_x' and
# 'Category_y' instead of a single 'Category' column and break the
# category-based feature engineering further down.
# merge() defaults to an inner join, so transactions whose ProductID is not
# present in `products` are discarded.
transactions = (
    transactions
    .drop(columns='Category', errors='ignore')
    .merge(products[['ProductID', 'Category']], on='ProductID')
)


Feature engineering

In [4]:
def create_customer_features(customers, transactions, reference_date=None):
    """Build one numeric feature row per customer from profile and purchase data.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``CustomerName``, ``Region``
        and ``SignupDate``. Any further columns are passed through unchanged.
    transactions : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``TransactionID``,
        ``TotalValue``, ``Quantity`` and ``Category``.
    reference_date : datetime-like, optional
        Moment against which ``DaysSinceSignup`` is measured. Defaults to the
        current time; pass a fixed date to get the same feature values on
        every run.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID`` with, in this order: ``DaysSinceSignup``,
        one float ``Region_<value>`` indicator per region, ``TotalSpend``,
        ``AvgOrderValue``, ``TotalQuantity``, ``AvgQuantity``,
        ``TransactionCount`` and one column per category holding that
        category's share of the customer's purchased quantity.
        Customers without any transaction have NaN in the transaction and
        category columns; the caller decides how to impute them.
    """
    if reference_date is None:
        reference_date = pd.Timestamp.now()
    else:
        reference_date = pd.Timestamp(reference_date)

    # Customer profile features (set_index returns a new frame, so the
    # caller's `customers` is left untouched)
    customer_features = customers.set_index('CustomerID')
    customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])
    customer_features['DaysSinceSignup'] = (reference_date - customer_features['SignupDate']).dt.days

    # One-hot encode Region; an explicit float dtype keeps the indicator
    # columns numeric regardless of the pandas default (uint8 or bool)
    region_dummies = pd.get_dummies(customer_features['Region'], prefix='Region', dtype=float)
    customer_features = pd.concat([customer_features, region_dummies], axis=1)

    # Transaction-based features; named aggregation ties every output column
    # to its source column and function instead of relying on column order
    transaction_features = transactions.groupby('CustomerID').agg(
        TotalSpend=('TotalValue', 'sum'),
        AvgOrderValue=('TotalValue', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
        AvgQuantity=('Quantity', 'mean'),
        TransactionCount=('TransactionID', 'count'),
    )

    # Category preferences: quantity per (customer, category), converted to
    # the share of each customer's total quantity
    category_quantities = transactions.groupby(['CustomerID', 'Category'])['Quantity'].sum().unstack(fill_value=0)
    category_preferences = category_quantities.div(category_quantities.sum(axis=1), axis=0)

    # Combine all features (left joins on the CustomerID index) and drop the
    # raw profile columns that are not numeric features
    all_features = customer_features.join(transaction_features).join(category_preferences)
    all_features = all_features.drop(['CustomerName', 'SignupDate', 'Region'], axis=1)

    return all_features

# Build the per-customer feature table and fill every remaining gap with the
# mean of its column.
# NOTE(review): this also mean-imputes the transaction columns of customers
# that have no transactions at all - confirm that is the intended treatment.
customer_features = (
    create_customer_features(customers, transactions)
    .pipe(lambda features: features.fillna(features.mean()))
)

# Standardise the feature columns; the fitted scaler is kept for later reuse
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features)

# Pairwise cosine similarity between all customers; rows and columns follow
# the order of customer_features.index
similarity_matrix = cosine_similarity(normalized_features)

# Function to get the top-N lookalikes of a customer
def get_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``customer_features`` (its index supplies the
    customer ids) and ``similarity_matrix`` (indexed by the positions of that
    index).

    Parameters
    ----------
    customer_id : label
        A value of ``customer_features.index``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer_id, similarity) tuples
        Best match first. The queried customer itself is never part of the
        result; fewer than ``top_n`` entries are returned when there are not
        enough other customers.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``customer_features.index``.
    """
    customer_index = customer_features.index.get_loc(customer_id)
    similarities = similarity_matrix[customer_index]

    # Rank every customer by descending similarity. The stable sort keeps
    # tied customers in index order, which makes the result deterministic.
    ranked = (-similarities).argsort(kind='stable')

    # Remove the queried customer by position rather than assuming it sorts
    # last: a customer with identical features (or one whose score exceeds
    # the self-similarity by rounding) would otherwise push the queried
    # customer into its own lookalike list.
    top_indices = ranked[ranked != customer_index][:top_n]

    top_similarities = similarities[top_indices]
    top_customers = customer_features.index[top_indices]
    return list(zip(top_customers, top_similarities))




Generating the file

In [5]:
# Generate lookalikes for first 20 customers
lookalike_results = {}
for customer_id in customers['CustomerID'][:20]:
    # Cast scores to built-in floats: each CSV cell is the text of a
    # (customer_id, score) tuple, and NumPy >= 2 would otherwise write the
    # score as "np.float64(0.93)" instead of "0.93".
    lookalikes = [
        (lookalike_id, float(score))
        for lookalike_id, score in get_lookalikes(customer_id)
    ]
    lookalike_results[customer_id] = lookalikes

# Create Lookalike.csv: one row per customer (id in the unnamed index column),
# one "(customer_id, score)" cell per lookalike
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index')
# Name the columns after the number of lookalikes actually returned so that
# fewer than three matches does not raise a length-mismatch error
lookalike_df.columns = [f'Lookalike{rank}' for rank in range(1, lookalike_df.shape[1] + 1)]
lookalike_df.to_csv('Lookalike.csv')

print("Lookalike model completed. Results saved to Lookalike.csv")

Lookalike model completed. Results saved to Lookalike.csv
