<a href="https://colab.research.google.com/github/AdityaMVerma/Zeotap/blob/main/ADITYA_VERMA_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# ---------------------------------------------------------------------------
# Lookalike model
#
# Builds one feature vector per customer (spend, transaction count, tenure,
# recency, one-hot region), measures cosine similarity between customers and
# writes the top lookalikes of the target customers to Lookalike.csv.
# ---------------------------------------------------------------------------

# Customers whose lookalikes are reported, and how many lookalikes each gets.
# Targets are selected by ID rather than by row position so that a customer
# without transactions cannot silently shift the selection.
TARGET_CUSTOMER_IDS = [f'C{n:04d}' for n in range(1, 21)]  # C0001 - C0020
TOP_N = 3

NUMERIC_FEATURES = ['TotalSpent', 'TransactionCount', 'DaysSinceSignup', 'DaysSinceLastPurchase']

# Load the datasets
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')  # loaded but not used by the features built in this cell
transactions_df = pd.read_csv('Transactions.csv')

# Step 1: Data Preprocessing
# Convert the date columns to datetime format
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])

# Single reference date for every "days since" feature. It is truncated to
# midnight so that re-running at another time only shifts each feature by a
# whole number of days, an offset the standardisation below removes.
reference_date = pd.Timestamp.today().normalize()

# Aggregate transaction data by CustomerID. Only transaction columns are
# aggregated, so no join with the customer table is needed at this point.
customer_transactions = transactions_df.groupby('CustomerID').agg(
    TotalSpent=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'nunique'),
    LastPurchase=('TransactionDate', 'max')
).reset_index()

# Feature Engineering for customer demographics
customers_df['DaysSinceSignup'] = (reference_date - customers_df['SignupDate']).dt.days

# Merge customer features with transaction features
data = pd.merge(
    customer_transactions,
    customers_df[['CustomerID', 'Region', 'DaysSinceSignup']],
    on='CustomerID',
    how='left'
)

# Calculate recency: number of days since last purchase
data['DaysSinceLastPurchase'] = (reference_date - data['LastPurchase']).dt.days

# One-hot encode the 'Region' feature (as floats, so the feature matrix has a single numeric dtype)
data = pd.concat([data, pd.get_dummies(data['Region'], prefix='Region', dtype=float)], axis=1)

# Standardize the numeric features for better comparison
scaler = StandardScaler()
data[NUMERIC_FEATURES] = scaler.fit_transform(data[NUMERIC_FEATURES])

# Step 2: Compute similarity between customers
customer_features = data.drop(columns=['CustomerID', 'Region', 'LastPurchase'])
similarity_matrix = cosine_similarity(customer_features)

# Step 3: Extract the TOP_N most similar customers for each target customer
customer_ids = data['CustomerID'].to_numpy()
position_of = {cust_id: pos for pos, cust_id in enumerate(customer_ids)}

lookalike_dict = {}
for cust_id in TARGET_CUSTOMER_IDS:
    pos = position_of.get(cust_id)
    if pos is None:
        # The customer has no rows in Transactions.csv, so there is no feature vector to compare.
        print(f"Customer {cust_id} has no transactions; skipping.")
        print()
        continue

    similarities = similarity_matrix[pos]

    # Rank everyone by descending similarity and drop the customer explicitly.
    # Relying on "the last sorted entry is me" breaks when another customer
    # ties at the maximum score or when rounding leaves the self-score just
    # below a neighbour's. The stable sort keeps tie order deterministic.
    ranked = np.argsort(-similarities, kind='stable')
    ranked = ranked[ranked != pos][:TOP_N]

    similar_customers = [(customer_ids[idx], similarities[idx]) for idx in ranked]
    lookalike_dict[cust_id] = similar_customers

    # Print the top lookalikes for the current customer
    print(f"Top 3 lookalikes for Customer {cust_id}:")
    for similar_cust, score in similar_customers:
        print(f"  Customer ID: {similar_cust}, Similarity Score: {score:.4f}")
    print()

# Step 4: Save results to Lookalike.csv (one row per target/lookalike pair)
lookalike_data = [
    [cust_id, similar_cust, score]
    for cust_id, similar_customers in lookalike_dict.items()
    for similar_cust, score in similar_customers
]

lookalike_df = pd.DataFrame(lookalike_data, columns=['cust_id', 'lookalike_cust_id', 'similarity_score'])

# Saving to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)


Top 3 lookalikes for Customer C0001:
  Customer ID: C0152, Similarity Score: 0.9875
  Customer ID: C0118, Similarity Score: 0.9305
  Customer ID: C0107, Similarity Score: 0.9161

Top 3 lookalikes for Customer C0002:
  Customer ID: C0134, Similarity Score: 0.9581
  Customer ID: C0159, Similarity Score: 0.9460
  Customer ID: C0005, Similarity Score: 0.9313

Top 3 lookalikes for Customer C0003:
  Customer ID: C0052, Similarity Score: 0.9940
  Customer ID: C0129, Similarity Score: 0.8751
  Customer ID: C0085, Similarity Score: 0.8549

Top 3 lookalikes for Customer C0004:
  Customer ID: C0165, Similarity Score: 0.9792
  Customer ID: C0113, Similarity Score: 0.9689
  Customer ID: C0188, Similarity Score: 0.9593

Top 3 lookalikes for Customer C0005:
  Customer ID: C0159, Similarity Score: 0.9629
  Customer ID: C0027, Similarity Score: 0.9545
  Customer ID: C0002, Similarity Score: 0.9313

Top 3 lookalikes for Customer C0006:
  Customer ID: C0187, Similarity Score: 0.8762
  Customer ID: C0085,

In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the source tables, parsing each date column as part of the load.
# errors='coerce' turns unparseable dates into NaT instead of raising.
customers_df = pd.read_csv('Customers.csv').assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate'], errors='coerce')
)
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv').assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'], errors='coerce')
)

# Precomputed lookalikes stored in Lookalike.csv
lookalike_df = pd.read_csv('Lookalike.csv')

# Function to compute similarity dynamically for a new customer
def compute_similarity_for_new_customer(customer_id, top_n=3):
    """Compute the most similar customers for ``customer_id`` on the fly.

    Features are rebuilt from the module-level ``transactions_df`` and
    ``customers_df`` frames: total spend, number of distinct transactions,
    days since signup, days since last purchase (all standardised) and a
    one-hot encoding of ``Region``. Similarity is the cosine similarity
    between the requested customer's feature vector and every other one.

    Args:
        customer_id: Value to look up in the ``CustomerID`` column.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples ordered from most
        to least similar, never containing ``customer_id`` itself. The list is
        empty when the customer has no rows in ``transactions_df``.
    """
    # Aggregate transaction data by CustomerID. Only transaction columns are
    # used, so the customer table does not have to be joined in first.
    customer_transactions = transactions_df.groupby('CustomerID').agg(
        TotalSpent=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'nunique'),
        LastPurchase=('TransactionDate', 'max')
    ).reset_index()

    # Bail out before the expensive work if there is nothing to compare.
    if customer_id not in customer_transactions['CustomerID'].values:
        return []

    # One reference date for both "days since" features, truncated to midnight
    # so a different run time only adds a whole-day offset, which the
    # standardisation below removes.
    reference_date = pd.Timestamp.today().normalize()

    # Derive tenure on a copy so the shared customers_df is not modified by a lookup.
    demographics = customers_df[['CustomerID', 'Region', 'SignupDate']].copy()
    demographics['DaysSinceSignup'] = (reference_date - demographics['SignupDate']).dt.days

    # Merge customer features with transaction features
    data = pd.merge(
        customer_transactions,
        demographics[['CustomerID', 'Region', 'DaysSinceSignup']],
        on='CustomerID',
        how='left'
    )

    # Calculate recency: number of days since last purchase
    data['DaysSinceLastPurchase'] = (reference_date - data['LastPurchase']).dt.days

    # One-hot encode the 'Region' feature (as floats, so every feature column is numeric)
    data = pd.concat([data, pd.get_dummies(data['Region'], prefix='Region', dtype=float)], axis=1)

    # Standardize the numeric features for better comparison
    numeric_features = ['TotalSpent', 'TransactionCount', 'DaysSinceSignup', 'DaysSinceLastPurchase']
    scaler = StandardScaler()
    data[numeric_features] = scaler.fit_transform(data[numeric_features])

    customer_features = data.drop(columns=['CustomerID', 'Region', 'LastPurchase'])

    # Compare only the requested customer's row against everyone instead of
    # building the full customer-by-customer matrix.
    is_target = (data['CustomerID'] == customer_id).to_numpy()
    scores = cosine_similarity(customer_features[is_target], customer_features)[0]

    # Drop the customer explicitly before ranking: assuming the self-score is
    # always the single largest value fails when another customer ties with it.
    ranked = (
        pd.Series(scores, index=data['CustomerID'].to_numpy())
        .drop(customer_id)
        .nlargest(top_n)
    )

    return list(ranked.items())

# Function to search lookalikes
def search_lookalikes(customer_id):
    """Print the top lookalikes for ``customer_id`` and redraw the search widgets.

    Lookup order:
      1. If the ID is not present in ``customers_df``, report it as not found.
      2. If ``lookalike_df`` (loaded from Lookalike.csv) has rows for the ID,
         print those precomputed results.
      3. Otherwise compute the lookalikes on the fly with
         ``compute_similarity_for_new_customer``.

    Args:
        customer_id: Customer ID as typed by the user; surrounding whitespace
            is ignored.
    """
    # Clear any previous output (deferred until the new output is ready)
    clear_output(wait=True)

    # Tolerate stray spaces around the typed or pasted ID.
    customer_id = str(customer_id).strip()

    if customer_id not in customers_df['CustomerID'].values:
        # Previously this case printed nothing, leaving the user without feedback.
        print("Customer ID not found.")
    else:
        lookalike_results = lookalike_df[lookalike_df['cust_id'] == customer_id]

        if not lookalike_results.empty:
            print(f"Top 3 lookalikes for Customer {customer_id} (Precomputed):\n")

            for lookalike_id, score in zip(
                lookalike_results['lookalike_cust_id'], lookalike_results['similarity_score']
            ):
                print(f"Customer ID: {lookalike_id}, Similarity Score: {score:.4f}")
        else:
            # If the customer is not in the precomputed file, compute the similarity dynamically
            similar_customers = compute_similarity_for_new_customer(customer_id)

            if similar_customers:
                print(f"Top 3 lookalikes for Customer {customer_id} (Dynamically Computed):\n")

                for similar_cust, score in similar_customers:
                    print(f"Customer ID: {similar_cust}, Similarity Score: {score:.4f}")
            else:
                # The customer exists but has no transactions to build features from.
                print(f"Customer {customer_id} has no transaction history; lookalikes cannot be computed.")

    # Redraw the controls in the same order as the initial display (text box, then button).
    display(customer_id_input, search_button)

# Create the input field for Customer ID
customer_id_input = widgets.Text(
    value='',
    placeholder='Enter Customer ID',
    description='Customer ID:',
    disabled=False
)

# Create the search button
search_button = widgets.Button(description="Search Lookalikes")

# Bind the button to the search function
search_button.on_click(lambda b: search_lookalikes(customer_id_input.value))

# Display the widgets
display(customer_id_input, search_button)


Top 3 lookalikes for Customer C0077 (Dynamically Computed):

Customer ID: C0032, Similarity Score: 0.9604
Customer ID: C0151, Similarity Score: 0.8958
Customer ID: C0095, Similarity Score: 0.8866


Button(description='Search Lookalikes', style=ButtonStyle())

Text(value='C0077', description='Customer ID:', placeholder='Enter Customer ID')