Lookalike Models Implementation

In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# 1. Data Loading
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# 2. Data Preprocessing
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Fill missing values with zeros for simplicity in this example
customers = customers.fillna({'Region': 'Unknown'})
transactions = transactions.fillna({'Quantity': 0, 'TotalValue': 0})

# 3. Feature Engineering: Create customer profiles based on transaction data
customer_features = transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_order_value=('TotalValue', 'mean')
).reset_index()

# Merge customer profile features with customer information
customer_profiles = pd.merge(customers, customer_features, on='CustomerID', how='left').fillna(0)

# 4. Feature Normalization
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_profiles[['total_spent', 'transaction_count', 'avg_order_value']])

# 5. Compute Cosine Similarity Matrix
similarity_matrix = cosine_similarity(normalized_features)

# 6. Generate Lookalike Recommendations
lookalike_results = {}
for idx, customer_id in enumerate(customer_profiles['CustomerID']):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: -x[1])  # Sort by score
    top_3 = [(customer_profiles['CustomerID'][i], score) for i, score in similarity_scores[1:4]]  # Top 3 excluding the customer itself
    lookalike_results[customer_id] = top_3

# 7. Create Lookalike.csv for the first 20 customers (C0001 to C0020)
lookalike_df = pd.DataFrame({
    'cust_id': list(lookalike_results.keys()),
    'lookalikes': [str(v) for v in lookalike_results.values()]
})

# Filter for the first 20 customers (C0001 to C0020)
lookalike_df = lookalike_df[lookalike_df['cust_id'].apply(lambda x: int(x[1:]) <= 20)]

# Save results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Displaying a sample of the result
lookalike_df.head(10)


Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0137', 0.999217832279607), ('C0152', 0.992..."
1,C0002,"[('C0029', 0.9996304690463752), ('C0199', 0.99..."
2,C0003,"[('C0005', 0.9999316372091099), ('C0178', 0.99..."
3,C0004,"[('C0067', 0.9998110253764196), ('C0021', 0.99..."
4,C0005,"[('C0003', 0.9999316372091099), ('C0073', 0.99..."
5,C0006,"[('C0079', 0.9999839458199934), ('C0117', 0.99..."
6,C0007,"[('C0085', 0.9998021851965131), ('C0140', 0.99..."
7,C0008,"[('C0084', 0.9955933187236518), ('C0194', 0.99..."
8,C0009,"[('C0077', 0.9998301326192339), ('C0032', 0.99..."
9,C0010,"[('C0029', 0.9997659902062711), ('C0025', 0.99..."


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Convert date columns to datetime format
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Handle missing data
customers.fillna({'Region': 'Unknown'}, inplace=True)
transactions.fillna({'Quantity': 0, 'TotalValue': 0}, inplace=True)

# Feature Engineering

# 1. Customer Profile: Create customer features from transactional data
customer_features = transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_order_value=('TotalValue', 'mean')
).reset_index()

# 2. Product Preferences: Aggregate product categories bought by the customer
product_preferences = transactions.merge(products[['ProductID', 'Category']], on='ProductID')
product_preferences = product_preferences.groupby(['CustomerID', 'Category']).agg(
    total_quantity=('Quantity', 'sum'),
    total_value=('TotalValue', 'sum')
).reset_index()

# Pivot the product preferences to get each customer’s preferences for each category
product_pivot = product_preferences.pivot_table(index='CustomerID', columns='Category', values='total_value', fill_value=0)

# Merge customer profile with product preferences
customer_profiles = pd.merge(customers, customer_features, on='CustomerID', how='left')
customer_profiles = pd.merge(customer_profiles, product_pivot, on='CustomerID', how='left').fillna(0)

# 3. Normalize features for similarity computation
features_to_normalize = ['total_spent', 'transaction_count', 'avg_order_value'] + list(product_pivot.columns)
scaler = StandardScaler()
customer_profiles[features_to_normalize] = scaler.fit_transform(customer_profiles[features_to_normalize])

# 4. Similarity Calculation (Cosine Similarity)
similarity_matrix = cosine_similarity(customer_profiles[features_to_normalize])

# 5. Generate Lookalike Recommendations for a given customer
def get_lookalikes(customer_id, top_n=3):
    # Get index of the given customer
    customer_idx = customer_profiles[customer_profiles['CustomerID'] == customer_id].index[0]
    
    # Get similarity scores for the customer
    similarity_scores = list(enumerate(similarity_matrix[customer_idx]))
    
    # Sort by similarity score (descending), and get the top N recommendations (excluding self)
    similarity_scores = sorted(similarity_scores, key=lambda x: -x[1])
    lookalikes = [(customer_profiles.iloc[i[0]]['CustomerID'], i[1]) for i in similarity_scores[1:top_n+1]]  # Exclude self
    
    return lookalikes

# Example: Get lookalikes for customer C0001
lookalikes_for_C0001 = get_lookalikes('C0001', top_n=3)
print(lookalikes_for_C0001)

# 6. Save lookalike recommendations for the first 20 customers
lookalike_results = {}
for customer_id in customer_profiles['CustomerID'][:20]:
    lookalike_results[customer_id] = get_lookalikes(customer_id)

# Save to CSV
lookalike_df = pd.DataFrame({
    'cust_id': list(lookalike_results.keys()),
    'lookalikes': [str(v) for v in lookalike_results.values()]
})
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display first few recommendations
lookalike_df.head()


[('C0069', 0.9684790049303688), ('C0091', 0.9260214176758534), ('C0072', 0.8757508889789443)]


Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0069', 0.9684790049303688), ('C0091', 0.92..."
1,C0002,"[('C0036', 0.8777774116934461), ('C0055', 0.86..."
2,C0003,"[('C0166', 0.927062271629985), ('C0007', 0.899..."
3,C0004,"[('C0075', 0.9806534502912463), ('C0065', 0.92..."
4,C0005,"[('C0197', 0.9786215349201892), ('C0166', 0.92..."


Code for Evaluating Model Accuracy and Logic

Data Preprocessing

In [3]:
# Merging customer and transaction data
transactions_with_customer = transactions.merge(customers, on='CustomerID', how='left')

# Feature engineering: Calculate total spend, transaction count, and average order value per customer
customer_features = transactions_with_customer.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_order_value=('TotalValue', 'mean')
).reset_index()

# Merge with customer demographic data
customer_features = customer_features.merge(customers[['CustomerID', 'Region']], on='CustomerID')

# Check for missing values after merging
print(customer_features.isnull().sum())


CustomerID           0
total_spent          0
transaction_count    0
avg_order_value      0
Region               0
dtype: int64


Lookalike Model

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Normalizing features for better similarity comparison
scaler = StandardScaler()
customer_features_normalized = scaler.fit_transform(customer_features[['total_spent', 'transaction_count', 'avg_order_value']])

# Compute the similarity matrix (Cosine Similarity)
similarity_matrix = cosine_similarity(customer_features_normalized)

# Function to get top N lookalikes for a customer
def get_top_lookalikes(customer_id, top_n=3):
    customer_idx = customer_features[customer_features['CustomerID'] == customer_id].index[0]
    similarity_scores = list(enumerate(similarity_matrix[customer_idx]))
    
    # Sort customers by similarity score (highest first) and return the top N
    similarity_scores = sorted(similarity_scores, key=lambda x: -x[1])
    top_lookalikes = [(customer_features.iloc[i[0]]['CustomerID'], i[1]) for i in similarity_scores[1:top_n+1]]
    
    return top_lookalikes

# Test with customer C0001
top_lookalikes_c0001 = get_top_lookalikes('C0001', top_n=3)
print("Top 3 lookalikes for customer C0001:", top_lookalikes_c0001)


Top 3 lookalikes for customer C0001: [('C0137', 0.9993600788417096), ('C0152', 0.9956575062125335), ('C0121', 0.9930123335059389)]


Model Evaluation

In [5]:
# Inspect features of recommended lookalikes
def inspect_lookalikes(customer_id, top_n=3):
    lookalikes = get_top_lookalikes(customer_id, top_n)
    
    # Get the original customer's features
    original_customer = customer_features[customer_features['CustomerID'] == customer_id]
    
    # Get the lookalikes' features
    lookalike_customers = customer_features[customer_features['CustomerID'].isin([x[0] for x in lookalikes])]
    
    print(f"\nOriginal Customer {customer_id} features:")
    print(original_customer[['total_spent', 'transaction_count', 'avg_order_value', 'Region']])
    
    print("\nRecommended Lookalikes:")
    print(lookalike_customers[['CustomerID', 'total_spent', 'transaction_count', 'avg_order_value', 'Region']])

# Test inspection for customer C0001
inspect_lookalikes('C0001', top_n=3)



Original Customer C0001 features:
   total_spent  transaction_count  avg_order_value         Region
0      3354.52                  5          670.904  South America

Recommended Lookalikes:
    CustomerID  total_spent  transaction_count  avg_order_value         Region
120      C0121       974.52                  4          243.630         Europe
136      C0137      3332.55                  5          666.510  South America
151      C0152      3385.86                  5          677.172  South America
