In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the three raw input tables from the working directory.
# The files are read in the order customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Step 2: Merge the datasets
# Attach customer attributes to every transaction. A left join keeps each
# transaction row even when its CustomerID has no match in `customers`.
transactions_customers = pd.merge(transactions, customers, how='left', on='CustomerID')

# Attach product attributes the same way, keyed on ProductID.
full_data = pd.merge(transactions_customers, products, how='left', on='ProductID')


In [4]:
# Step 3: Aggregate transaction data by customer
# One output row per CustomerID. Each keyword names an output column and
# pairs the source column with the aggregation applied to it.
customer_metrics = (
    full_data
    .groupby('CustomerID')
    .agg(
        TotalSpend=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        AvgSpend=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        TotalTransactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        ProductDiversity=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    )
    .reset_index()  # turn the CustomerID group key back into a regular column
)

In [5]:
# Step 4: Encode categorical features
# One-hot encode the 'Region' column.
# The encoder is left on its default (sparse) output and densified explicitly
# with .toarray(), so this cell does not depend on the `sparse` /
# `sparse_output` keyword, which was renamed between scikit-learn releases.
encoder = OneHotEncoder(handle_unknown='ignore')
region_encoded = encoder.fit_transform(customers[['Region']]).toarray()
region_encoded_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customers.index,  # keep rows aligned with `customers`
)

# Merge encoded regions with customer metrics.
# The join is done on CustomerID rather than by row position:
# `customer_metrics` is ordered by the groupby key and only contains customers
# that appear in the transactions, whereas the encoded rows follow the order
# (and row count) of `customers`. A positional concat would therefore attach
# regions to the wrong customers and append rows with a missing CustomerID
# whenever the two tables differ in order or length.
regions_by_customer = pd.concat([customers[['CustomerID']], region_encoded_df], axis=1)
customer_profile = customer_metrics.merge(regions_by_customer, on='CustomerID')



In [6]:
# Step 5: Normalize numerical features
# Only the aggregated spend/activity columns are rescaled; every other column
# of customer_profile is left as it is.
numerical_features = ['TotalSpend', 'AvgSpend', 'TotalTransactions', 'ProductDiversity']

# Fit the scaler on these columns, then overwrite them in place with the
# standardized values.
scaler = StandardScaler()
scaler.fit(customer_profile[numerical_features])
customer_profile[numerical_features] = scaler.transform(customer_profile[numerical_features])

In [11]:
# Step 6: Compute similarity matrix
# Build the feature matrix in three steps:
#   1. drop the identifier column so it is not treated as a feature,
#   2. replace any missing value with 0,
#   3. keep only numeric columns.
feature_matrix = (
    customer_profile
    .drop(columns=['CustomerID'])
    .fillna(0)
    .select_dtypes(include=[np.number])
)

# Pairwise cosine similarity between the rows of feature_matrix;
# row/column i corresponds to row i of customer_profile.
similarity_matrix = cosine_similarity(feature_matrix)


In [10]:
# Step 7: Find top 3 lookalike customers
N_TARGET_CUSTOMERS = 20  # only the first 20 rows of customer_profile are processed
TOP_K = 3                # number of lookalikes kept per customer

lookalikes = {}
customer_ids = customer_profile['CustomerID'].tolist()

for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Pair every other customer's row index with its similarity to customer i
    # (the customer itself is excluded), then order by similarity, highest first.
    similarities = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)

    # Scores are cast to built-in floats before rounding: the results are
    # later stringified, and NumPy scalars would otherwise be rendered as
    # "np.float64(...)" under NumPy 2's scalar repr.
    top_lookalikes = [
        (customer_ids[idx], round(float(score), 4))
        for idx, score in similarities[:TOP_K]
    ]
    lookalikes[customer_id] = top_lookalikes

In [12]:
# Step 8: Save results to CSV
# The output path is defined once and used for both the write and the
# confirmation message. Previously the file was written under a name
# ('Anrudh_...') that differed from the one reported ('Anirudh_...').
output_path = 'Anirudh_Mekala_Lookalike.csv'

# One row per processed customer; the list of (CustomerID, score) pairs is
# stored as its string representation in a single column.
lookalike_results = pd.DataFrame({
    'CustomerID': list(lookalikes.keys()),
    'Recommendations': [str(recs) for recs in lookalikes.values()]
})

lookalike_results.to_csv(output_path, index=False)

print(f"Lookalike results saved to {output_path}")

Lookalike results saved to Anirudh_Mekala_Lookalike.csv
