In [1]:
# Import necessary libraries
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the two raw input tables (customer master data and transaction log)
# from the shared data directory, relative to this notebook's location.
customers, transactions = (
    pd.read_csv("../data/Customers.csv"),
    pd.read_csv("../data/Transactions.csv"),
)

In [3]:
# Attach customer attributes to every transaction row.
# Inner join on CustomerID: transactions without a matching customer record
# (and customers without any transaction) are not carried forward.
data = transactions.merge(customers, how="inner", on="CustomerID")

In [4]:
# Feature engineering: collapse the transaction-level rows into one row per
# customer, holding that customer's summed TotalValue and summed Quantity.
# as_index=False keeps CustomerID as a regular column.
aggregated_columns = ['TotalValue', 'Quantity']
customer_summary = data.groupby('CustomerID', as_index=False)[aggregated_columns].sum()


In [5]:
# Standardize the two aggregate features so neither dominates the similarity
# computation purely because of its scale.
feature_frame = customer_summary[['TotalValue', 'Quantity']]
scaler = StandardScaler().fit(feature_frame)
normalized_data = scaler.transform(feature_frame)

In [6]:
# Pairwise cosine similarity between every pair of customers' scaled feature
# rows; row/column i corresponds to row i of customer_summary.
similarity_matrix = cosine_similarity(X=normalized_data)

In [7]:
# Wrap the raw similarity matrix in a DataFrame labelled by CustomerID on both
# axes, so scores can be looked up as similarity_df.loc[customer_a, customer_b].
customer_ids = customer_summary['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)


In [8]:
# Generate lookalikes for the first 20 customers.
#
# Result: `lookalikes` maps each CustomerID to a list of up to three
# (other CustomerID, similarity rounded to 4 decimals) tuples, best match first.
#
# The customer's own entry is removed *by label* before ranking. Skipping the
# first sorted element instead is unreliable: other customers can tie with (or,
# through floating-point error, marginally exceed) the self-similarity score,
# in which case the customer would be reported as their own lookalike and a
# genuine neighbour would be dropped.
lookalikes = {}
for customer in similarity_df.index[:20]:  # For the first 20 customers
    similar_customers = (
        similarity_df.loc[customer]
        .drop(labels=customer)  # exclude self explicitly, not by position
        .nlargest(3)            # top 3; ties keep their original column order
    )
    lookalikes[customer] = [(cust_id, round(score, 4)) for cust_id, score in similar_customers.items()]


In [9]:
# Flatten the {customer -> [(similar customer, score), ...]} mapping into one
# record per (customer, similar customer) pair, then build the output table
# from the full record list in a single DataFrame construction.
lookalikes_list = [
    {
        "CustomerID": customer_id,
        "SimilarCustomerID": similar_customer,
        "SimilarityScore": score,
    }
    for customer_id, matches in lookalikes.items()
    for similar_customer, score in matches
]

lookalikes_df = pd.DataFrame(lookalikes_list)

In [10]:
# Save the lookalikes to a CSV file.
# The output directory is created first if it is missing, so the notebook
# still runs top to bottom on a fresh checkout where ../outputs does not exist.
output_path = Path("../outputs/LalithaPriyadarshini_Baswapatri_Lookalike.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
lookalikes_df.to_csv(output_path, index=False)

# Display the lookalikes for verification
print("Top 3 lookalikes for the first 20 customers:")
print(lookalikes_df)

Top 3 lookalikes for the first 20 customers:
   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0085           1.0000
1       C0001             C0042           0.9998
2       C0001             C0089           0.9998
3       C0002             C0157           1.0000
4       C0002             C0166           0.9999
5       C0002             C0029           0.9998
6       C0003             C0111           0.9940
7       C0003             C0160           0.9905
8       C0003             C0147           0.9876
9       C0004             C0162           1.0000
10      C0004             C0165           1.0000
11      C0004             C0090           0.9986
12      C0005             C0080           1.0000
13      C0005             C0167           1.0000
14      C0005             C0177           0.9999
15      C0006             C0079           1.0000
16      C0006             C0117           0.9957
17      C0006             C0196           0.9903
18      C0007           