In [1]:
import pandas as pd

In [2]:
# Load the customer table from the CSV file in the working directory.
customers = pd.read_csv(filepath_or_buffer='Customers.csv')

In [10]:
# Preview the first five rows of the customer table.
customers.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [3]:
# Load the transaction table from the CSV file in the working directory.
transactions = pd.read_csv(filepath_or_buffer='Transactions.csv')

In [6]:
# Load the product table from the CSV file in the working directory.
products = pd.read_csv(filepath_or_buffer='Products.csv')

In [7]:
# Attach product details to each transaction; the left join keeps every
# transaction row even when its ProductID has no match in products.
df_merged = transactions.merge(products, how='left', on='ProductID')

In [8]:
# Build the transaction-level table from the raw frames in one pass:
# transactions -> product details (on ProductID) -> customer details (on CustomerID).
# Deriving it from the source tables, instead of merging df_merged into itself,
# keeps this cell safe to re-run: a second self-merge with customers would add
# suffixed duplicates (e.g. Region_x / Region_y) and break any later lookup of
# the plain 'Region' column.
df_merged = (
    transactions
    .merge(products, on='ProductID', how='left')
    .merge(customers, on='CustomerID', how='left')
)

In [11]:
# Aggregating transaction and product data per customer
def _most_common_category(categories):
    """Return the most frequent non-null value, or NaN when every value is missing."""
    modes = categories.mode()
    # mode() ignores NaN, so it is empty when a customer's categories are all
    # missing (possible after a left merge with no matching product row).
    # Indexing an empty result would raise, so fall back to NaN instead.
    return modes.iat[0] if not modes.empty else float('nan')


customer_data = df_merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',  # Total quantity bought
    'Region': 'first',  # First region value seen for the customer (assumes one region each)
    'SignupDate': 'first',  # First signup date value seen (the full date, not just the year)
    'Category': _most_common_category,  # Most common product category
}).reset_index()

In [13]:
# Standardize the two numeric features before computing similarities.
# The scaled values overwrite the TotalValue and Quantity columns of customer_data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(customer_data[['TotalValue', 'Quantity']])
customer_data[['TotalValue', 'Quantity']] = scaler.transform(customer_data[['TotalValue', 'Quantity']])

In [14]:
# Pairwise cosine similarity between customers, using the TotalValue and
# Quantity columns. Entry [i, j] compares the rows at positions i and j of
# customer_data.
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(X=customer_data[['TotalValue', 'Quantity']])


In [15]:
# Function to get top 3 similar customers based on cosine similarity
def get_similar_customers(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Reads the module-level ``customer_data`` frame and ``similarity_matrix``;
    row/column ``i`` of the matrix is taken to describe the row at position
    ``i`` of ``customer_data``.

    Args:
        customer_id: Value to look up in ``customer_data['CustomerID']``.
        top_n: Maximum number of lookalikes to return.

    Returns:
        ``None`` when ``customer_id`` is not present. Otherwise a tuple
        ``(similar_customers, scores)``: a DataFrame with the columns
        CustomerID, TotalValue, Quantity and Region (values as currently
        stored in ``customer_data``), ordered from most to least similar,
        and the matching array of similarity scores.
    """
    # Locate the customer by row POSITION, not index label, because both
    # similarity_matrix and .iloc below are positional.
    matches = customer_data['CustomerID'].eq(customer_id).to_numpy().nonzero()[0]
    if matches.size == 0:
        return None
    customer_pos = matches[0]

    similarity_scores = similarity_matrix[customer_pos]
    ranked = similarity_scores.argsort()[::-1]  # most similar first
    # Drop the customer explicitly. Skipping "the first entry" is not safe:
    # when another customer ties with the self-similarity score (or rounding
    # puts self slightly below it), self is not guaranteed to sort first.
    similar_indices = ranked[ranked != customer_pos][:top_n]

    similar_customers = customer_data.iloc[similar_indices]
    return similar_customers[['CustomerID', 'TotalValue', 'Quantity', 'Region']], similarity_scores[similar_indices]


In [17]:
# Creating Lookalike.csv for first 20 customers
# Collect one record per (customer, lookalike) pair.
lookalike_list = []
for customer_id in customer_data['CustomerID'][:20]:
    result = get_similar_customers(customer_id, top_n=3)
    # get_similar_customers returns None for an unknown ID, so check before
    # unpacking; unpacking None directly would raise TypeError.
    if result is None:
        continue
    similar_customers, similarity_scores = result
    for lookalike_id, score in zip(similar_customers['CustomerID'], similarity_scores):
        lookalike_list.append({
            'CustomerID': customer_id,
            'LookalikeCustomerID': lookalike_id,
            'SimilarityScore': score
        })

In [19]:
# Turn the collected lookalike records into a table and write it to
# Lookalike.csv without the row index.
lookalike_df = pd.DataFrame(data=lookalike_list)
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [21]:
# Print the top 3 lookalikes and their scores for the first 20 customers.
test_customers = customer_data['CustomerID'][:20]  # First 20 customers
for customer_id in test_customers:
    result = get_similar_customers(customer_id, top_n=3)
    # get_similar_customers returns None for an unknown ID, so handle that
    # before unpacking; unpacking None directly would raise TypeError and the
    # "not found" message would never be printed.
    if result is None:
        print(f"CustomerID {customer_id} not found.\n")
        continue
    similar_customers, similarity_scores = result
    print(f"CustomerID: {customer_id}")
    # Scores are aligned by position with the rows of similar_customers.
    for lookalike_id, score in zip(similar_customers['CustomerID'], similarity_scores):
        print(f"LookalikeCustomerID: {lookalike_id}, SimilarityScore: {score}")
    print("-" * 50)

CustomerID: C0001
LookalikeCustomerID: C0085, SimilarityScore: 0.9999990504724361
LookalikeCustomerID: C0042, SimilarityScore: 0.9998215747742084
LookalikeCustomerID: C0089, SimilarityScore: 0.9997850140987701
--------------------------------------------------
CustomerID: C0002
LookalikeCustomerID: C0157, SimilarityScore: 0.9999942410168485
LookalikeCustomerID: C0166, SimilarityScore: 0.999875010843091
LookalikeCustomerID: C0029, SimilarityScore: 0.9998254255985104
--------------------------------------------------
CustomerID: C0003
LookalikeCustomerID: C0111, SimilarityScore: 0.9940081095432594
LookalikeCustomerID: C0160, SimilarityScore: 0.9904545038572361
LookalikeCustomerID: C0147, SimilarityScore: 0.9876382719212549
--------------------------------------------------
CustomerID: C0004
LookalikeCustomerID: C0162, SimilarityScore: 0.9999999965087093
LookalikeCustomerID: C0165, SimilarityScore: 0.9999594720114721
LookalikeCustomerID: C0090, SimilarityScore: 0.9986409558134951
--------