# **Data Science Assignment Zeotap: eCommerce Transactions Dataset**

**Elizabeth Mariya Jose**



## **Task 2: Lookalike Model**

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Read the Customers, Products and Transactions CSV files into DataFrames.
# The paths are read in the same order as they are listed below.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    )
)

In [None]:
# Build customer-level features from the transaction log, score every pair of
# customers with cosine similarity, and export the top lookalikes for
# customers C0001..C0020 to "Lookalike.csv".

TOP_N = 3  # number of lookalikes reported per customer
TARGET_IDS = {f'C{n:04d}' for n in range(1, 21)}  # C0001 .. C0020

# Preprocess the datasets: parse the date columns into datetimes
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Aggregate transaction data to create customer-level features
transaction_features = transactions.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    TotalQuantity=('Quantity', 'sum'),
    UniqueProducts=('ProductID', 'nunique'),
    AvgTransactionValue=('TotalValue', 'mean'),
    TotalTransactions=('TransactionID', 'count')
).reset_index()

# Merge customer features with the customer profile data.
# how='left' keeps customers that never appear in the transaction log.
customer_data = pd.merge(customers, transaction_features, on='CustomerID', how='left')

numerical_features = ['TotalSpend', 'TotalQuantity', 'UniqueProducts', 'AvgTransactionValue', 'TotalTransactions']

# Customers with no transactions come out of the left join as NaN in the
# aggregated columns; treat them as zero activity.
customer_data = customer_data.fillna({feature: 0 for feature in numerical_features})

# One-hot encode Region.
# NOTE(review): drop_first=True leaves one region encoded as all-zero dummies,
# so customers in that region contribute no shared-region signal to the cosine
# similarity below -- confirm this is intended.
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Standardise the numeric features so no single scale dominates the similarity
scaler = StandardScaler()
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])

# Calculate pairwise similarity using Cosine Similarity.
# Identifier/profile columns are removed; the remaining columns (scaled
# numerics plus region dummies) are cast to float so the matrix is homogeneous.
similarity_features = (
    customer_data
    .drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
    .astype(float)
)
similarity_matrix = cosine_similarity(similarity_features)

# Find the top TOP_N most similar customers for each customer.
customer_ids = customer_data['CustomerID'].tolist()
lookalikes = {}
for idx, row in enumerate(similarity_matrix):
    ranked = row.argsort()[::-1]  # indices ordered from most to least similar
    # Drop the customer's own index explicitly: with ties or floating-point
    # noise the self-similarity is not guaranteed to sort into first place.
    similar_indices = ranked[ranked != idx][:TOP_N]
    lookalikes[customer_ids[idx]] = [
        # float(...) keeps NumPy scalar reprs out of the serialised CSV cell
        (customer_ids[i], round(float(row[i]), 3)) for i in similar_indices
    ]

# Filter results for CustomerID C0001 to C0020 (by ID, not by row position,
# so the result does not depend on the ordering of the customers file)
lookalike_subset = {k: v for k, v in lookalikes.items() if k in TARGET_IDS}

# Prepare output for "Lookalike.csv": one record per customer, holding the
# list of (lookalike CustomerID, similarity score) pairs
output = [
    {'CustomerID': cust_id, 'SimilarCustomers': similar_list}
    for cust_id, similar_list in lookalike_subset.items()
]

# Convert to a DataFrame and save as CSV; explicit columns keep the header
# present even if no target customer was found
lookalike_df = pd.DataFrame(output, columns=['CustomerID', 'SimilarCustomers'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been successfully generated!")


Lookalike.csv has been successfully generated!
