In [13]:
import pandas as pd

# Load the customer and transaction data
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Convert `TransactionDate` to datetime
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [15]:
# Merge `customers` and `transactions` data on `CustomerID`
cust_transactions = pd.merge(customers, transactions, on='CustomerID')

In [17]:
# Calculate Recency, Frequency, and Monetary (RFM) features and other relevant metrics
rfm_features = cust_transactions.groupby('CustomerID').agg(
    N_Transactions=('TransactionID', 'nunique'),
    Total_Quantity=('Quantity', 'sum'),
    Total_Value=('TotalValue', 'sum'),
    Avg_Days_Between_Transactions=('TransactionDate', lambda x: (x.max() - x.min()).days / len(x) if len(x) > 1 else 0),
    N_Unique_Products=('ProductID', 'nunique')
).reset_index()

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity
similarity = cosine_similarity(rfm_features.drop(columns=['CustomerID']))

In [20]:
# Create a DataFrame to store the results
lookalike_df = pd.DataFrame(columns=['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3'])

# Get top 3 lookalikes for each customer
for i in range(len(rfm_features)):
    # Get similarity scores for the current customer
    similarity_scores = similarity[i]

    # Sort by similarity scores in descending order and get indices
    sorted_indices = similarity_scores.argsort()[::-1]

    # Get top 3 similar customer indices (excluding the customer itself)
    top_3_indices = [index for index in sorted_indices if index!= i][:3]

    # Get the corresponding CustomerIDs and similarity scores
    top_3_customers = rfm_features.iloc[top_3_indices]['CustomerID'].tolist()
    top_3_scores = similarity_scores[top_3_indices].tolist()

    # Append the results to the DataFrame
    lookalike_df.loc[i] = [rfm_features.iloc[i]['CustomerID'], *[item for sublist in [[cust, score] for cust, score in zip(top_3_customers, top_3_scores)] for item in sublist]]

In [23]:
# Print the first 20 rows
print(lookalike_df.head(20).to_markdown(index=False, numalign="left", stralign="left"))

| CustomerID   | Lookalike1   | Score1   | Lookalike2   | Score2   | Lookalike3   | Score3   |
|:-------------|:-------------|:---------|:-------------|:---------|:-------------|:---------|
| C0001        | C0011        | 1        | C0120        | 1        | C0048        | 1        |
| C0002        | C0062        | 0.999999 | C0010        | 0.999997 | C0005        | 0.999997 |
| C0003        | C0055        | 0.999999 | C0134        | 0.999999 | C0120        | 0.999999 |
| C0004        | C0073        | 1        | C0018        | 1        | C0145        | 1        |
| C0005        | C0062        | 0.999999 | C0002        | 0.999997 | C0115        | 0.999993 |
| C0006        | C0066        | 1        | C0044        | 1        | C0125        | 1        |
| C0007        | C0050        | 1        | C0052        | 0.999999 | C0035        | 0.999999 |
| C0008        | C0084        | 1        | C0124        | 0.999999 | C0199        | 0.999999 |
| C0009        | C0063        | 0.999945 | C0128  

In [25]:
# Save the lookalike data to a CSV file
lookalike_df[['CustomerID', 'Lookalike1', 'Score1']].to_csv("Lookalike.csv", index=False)