In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np


In [4]:

# Read the three source tables from CSV files in the working directory
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)



In [6]:
# Convert the date columns with pd.to_datetime before joining the tables
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Join transactions to customers, then to products. No `how` is given, so
# pandas' default inner join applies and unmatched rows are dropped.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID')



In [7]:


# Build one feature row per customer from the merged transaction data.
# Named aggregation produces the final column names directly.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalRevenue=('TotalValue', 'sum'),           # summed TotalValue
        TotalQuantity=('Quantity', 'sum'),            # summed Quantity
        UniqueProducts=('ProductID', 'nunique'),      # distinct ProductID count
        UniqueCategories=('Category', 'nunique'),     # distinct Category count
    )
    .reset_index()
)

# Attach each customer's Region and one-hot encode it (first level dropped)
region_lookup = customers[['CustomerID', 'Region']]
customer_profiles = pd.get_dummies(
    customer_features.merge(region_lookup, on='CustomerID'),
    columns=['Region'],
    drop_first=True,
)

# Standardize every column except the CustomerID key
scaler = StandardScaler()
feature_matrix = customer_profiles.drop(columns='CustomerID')
customer_profiles_normalized = scaler.fit_transform(feature_matrix)

# Pairwise cosine similarity between the standardized customer rows
similarity_matrix = cosine_similarity(customer_profiles_normalized)

# Label both axes of the similarity matrix with CustomerID
profile_ids = customer_profiles['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=profile_ids, columns=profile_ids)

# Function to get the most similar customers (top 3 by default)
def get_top_3_similar(customers_df, customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Similarity table. The column labelled ``customer_id`` is read; its
        index labels are the candidate customer IDs and its values are the
        similarity scores.
    customer_id : hashable
        Column label of the customer to find lookalikes for.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of tuple
        ``(candidate_id, score)`` pairs ordered by descending score, at most
        ``top_n`` long. ``customer_id`` itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``customers_df``.
    """
    scores = customers_df[customer_id]
    # Exclude the customer by label, not by rank: with tied scores or
    # floating-point rounding the self-similarity is not guaranteed to sort
    # first, so skipping "position 0" could return the customer as its own
    # lookalike and discard a real match.
    candidates = scores.drop(labels=customer_id, errors='ignore')
    # nlargest returns the values in descending order; ties keep index order.
    top_matches = candidates.nlargest(top_n)
    return list(top_matches.items())

# Generate lookalikes for the first 20 CustomerIDs present in customer_profiles
# NOTE(review): these are C0001-C0020 only if each of those customers appears
# in customer_profiles — TODO confirm against the data.
customer_ids = customer_profiles['CustomerID'][:20]

# Map each selected customer to its list of (LookalikeID, Score) pairs
lookalike_results = {
    customer_id: get_top_3_similar(similarity_df, customer_id)
    for customer_id in customer_ids
}

# Flatten the mapping into one row per (customer, lookalike) pair
lookalike_df = pd.DataFrame([
    {'CustomerID': customer, 'LookalikeID': lookalike_id, 'Score': score}
    for customer, lookalikes in lookalike_results.items()
    for lookalike_id, score in lookalikes
])



In [9]:
# Write the lookalike table to CSV (without the DataFrame index)
lookalike_df.to_csv('Likith_Adithya_Atmuri_Lookalike.csv', index=False)

# Summary metrics for the recommendations

# 1. Mean similarity score across all recommended lookalikes
average_similarity = lookalike_df['Score'].mean()

# 2. Coverage: percentage of the selected customers that received recommendations
customers_with_lookalikes = lookalike_df['CustomerID'].nunique()
coverage = (customers_with_lookalikes / len(customer_ids)) * 100

# Report both metrics
print(f"Average Similarity Score for Top 3 Recommendations: {average_similarity:.4f}")
print(f"Recommendation Coverage: {coverage:.2f}%")


Average Similarity Score for Top 3 Recommendations: 0.9634
Recommendation Coverage: 100.00%
