In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 1: Load and merge the datasets
# The three source tables are read from CSV files in the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge datasets
# Customer attributes are joined onto each transaction first, then product
# attributes. Both joins use pandas' default inner join, so transactions
# without a matching customer or product are dropped. A 'Price' column exists
# on both sides of the second join, which is why the result carries the
# default-suffixed 'Price_x' (transactions) and 'Price_y' (products) columns.
merged = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)
print(merged.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [3]:
# Step 1: Aggregating Customer Features
# Collapse the transaction-level table to one row per CustomerID. Each keyword
# below names an output column and the (source column, reducer) that fills it;
# output names intentionally match the source names so later cells can keep
# referring to 'TotalValue', 'Quantity', 'Price_x' and 'ProductID'.
customer_features = (
    merged
    .groupby('CustomerID')
    .agg(
        Region=('Region', 'first'),           # first region seen for the customer
        TotalValue=('TotalValue', 'sum'),     # total revenue from the customer
        Quantity=('Quantity', 'sum'),         # total quantity purchased
        Price_x=('Price_x', 'mean'),          # mean transaction price (Price_x)
        ProductID=('ProductID', 'nunique'),   # number of distinct products bought
    )
    .reset_index()  # turn the CustomerID group key back into a regular column
)

In [4]:
# Step 2: Encoding Categorical Data (Region)
# Learn the set of region labels, then overwrite the 'Region' column in place
# with the integer code assigned to each label. The fitted encoder is kept in
# `le_region` so codes can be mapped back to labels with inverse_transform.
# NOTE(review): the encoded 'Region' column is not among the columns passed to
# the scaler in the standardization step, so it does not currently influence
# the similarity scores. Integer codes also impose an arbitrary ordering on a
# nominal variable; confirm whether Region should be used and, if so, how.
le_region = LabelEncoder()
le_region.fit(customer_features['Region'])
customer_features['Region'] = le_region.transform(customer_features['Region'])


In [5]:
# Step 3: Standardizing the Numeric Features
# Only these four aggregated columns feed the similarity computation; they are
# rescaled so that no single column dominates because of its units.
numeric_columns = ['TotalValue', 'Quantity', 'Price_x', 'ProductID']
numeric_block = customer_features[numeric_columns]

scaler = StandardScaler()
scaler.fit(numeric_block)
# Rows of `scaled_features` stay in the same order as `customer_features`.
scaled_features = scaler.transform(numeric_block)


In [6]:
# Step 4: Calculating Cosine Similarity
# Pairwise cosine similarity between every pair of rows in `scaled_features`.
# The result is square: entry [i, j] compares row i with row j, and the later
# lookalike step indexes it by row position in `customer_features`.
similarity_matrix = cosine_similarity(scaled_features)

In [7]:
# Step 5: Finding Top 3 Lookalikes for Each Customer (For the first 20 customers)
# Tunable knobs for this step.
NUM_TARGET_CUSTOMERS = 20  # how many leading rows of customer_features to score
TOP_N = 3                  # how many lookalikes to keep per customer

# Positional lookup from a similarity-matrix row/column index to a CustomerID.
customer_ids = customer_features['CustomerID'].tolist()

# Maps each target CustomerID to a list of (lookalike CustomerID, score) pairs,
# ordered from most to least similar.
lookalike_map = {}

for idx, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Drop the customer's own entry by index rather than assuming it sorts
    # first: another customer can tie at similarity 1.0 (or edge ahead through
    # floating-point rounding), in which case slicing off position 0 would
    # discard a real match and keep the customer as their own lookalike.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable descending sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Cast to a built-in float before rounding so the stored value (and its
    # text form when the list is written to CSV) is a plain number rather than
    # a NumPy scalar.
    lookalike_map[customer_id] = [
        (customer_ids[other_idx], round(float(score), 4))
        for other_idx, score in candidates[:TOP_N]
    ]


In [8]:
# Step 6: Generating Lookalike CSV
# Flatten the mapping into one record per target customer; the 'Lookalikes'
# cell holds that customer's whole list of (CustomerID, score) pairs.
lookalike_records = []
for cust_id, lookalikes in lookalike_map.items():
    lookalike_records.append({'CustomerID': cust_id, 'Lookalikes': lookalikes})
lookalike_df = pd.DataFrame(lookalike_records)

# Save to Lookalike.csv
# The row index is omitted; each list is written as its text representation.
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display Lookalike Map for First 20 Customers
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [(C0137, 0.9697), (C0103, 0.9658), (C0191, 0.9...
1       C0002  [(C0029, 0.9998), (C0077, 0.9939), (C0025, 0.9...
2       C0003  [(C0010, 0.9551), (C0111, 0.9327), (C0176, 0.9...
3       C0004  [(C0075, 0.9971), (C0068, 0.9852), (C0175, 0.9...
4       C0005  [(C0130, 0.998), (C0128, 0.9968), (C0020, 0.99...
5       C0006  [(C0196, 0.9952), (C0079, 0.9882), (C0168, 0.9...
6       C0007  [(C0125, 0.997), (C0085, 0.9967), (C0078, 0.98...
7       C0008  [(C0179, 0.9917), (C0090, 0.984), (C0084, 0.96...
8       C0009  [(C0192, 0.9984), (C0128, 0.9864), (C0061, 0.9...
9       C0010  [(C0142, 0.985), (C0121, 0.9744), (C0094, 0.96...
10      C0011  [(C0023, 0.9934), (C0100, 0.9668), (C0064, 0.9...
11      C0012  [(C0045, 0.9957), (C0143, 0.9943), (C0041, 0.9...
12      C0013  [(C0059, 0.998), (C0141, 0.997), (C0104, 0.9942)]
13      C0014  [(C0097, 0.998), (C0036, 0.9959), (C0063, 0.99...
14      C0015  [(C0058, 0