In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Loading datasets: read the customer, product and transaction CSV files
# (in that order) into three DataFrames.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/Customers.csv",
        "/content/Products.csv",
        "/content/Transactions.csv",
    )
)

In [6]:
# Build one unified table: every transaction row enriched with its customer
# and product attributes. Both joins are left joins, so each transaction is
# kept even when no matching customer or product row exists.
transactions_with_customers = transactions.merge(customers, on='CustomerID', how='left')
merged = transactions_with_customers.merge(products, on='ProductID', how='left')

In [8]:
# Feature Engineering
# If the merged table has no 'Price' column, derive a per-unit price from the
# transaction total and the quantity bought.
if 'Price' not in merged:  # `in` on a DataFrame tests the column labels
    merged['Price'] = merged['TotalValue'].div(merged['Quantity'])

In [9]:
# Aggregating transaction data per customer
def _most_common(values):
    """Return the most frequent non-null entry of ``values``, or None if there is none.

    ``Series.mode()`` ignores missing values, so its result is empty when every
    entry in the group is NaN (possible here because the product join is a left
    join). Indexing ``[0]`` on that empty result would raise, and checking
    ``len(values)`` does not help because a groupby group is never empty.
    When several values tie, the first entry of the mode result is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


customer_features = merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',  # Total quantity purchased
    'Price': 'mean',  # Average price of purchased items
    'Category': _most_common  # Most purchased category (None if all missing)
}).reset_index()

In [10]:
# One-hot encoding the 'Category' feature:
# converts categorical data (like product categories) into a numerical
# format that machine learning models can process effectively.
# Each resulting indicator column is named with the 'Category' prefix.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Category'],
    prefix='Category',
)

In [11]:
# Normalizing features: min-max scale every column except the identifier,
# so that each feature is fitted and transformed on the same data.
feature_table = customer_features.drop(columns=['CustomerID'])
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(feature_table)

In [12]:
# Calculating Similarities
# Pairwise cosine similarity between all rows of the scaled feature matrix;
# entry [i, j] compares customer row i with customer row j.
similarity_matrix = cosine_similarity(X=scaled_features)

In [14]:
# Generating Lookalike Recommendations
# For each of the first N_TARGET_CUSTOMERS rows of customer_features, rank all
# other customers by cosine similarity and keep the TOP_K most similar ones.
N_TARGET_CUSTOMERS = 20  # Limiting to first 20 customers (C0001-C0020)
TOP_K = 3  # Number of lookalikes kept per customer

customer_ids = customer_features['CustomerID'].values
lookalike_results = []
for i, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Pair each other customer's row index with its similarity to the current
    # customer; the customer's own row is skipped by position.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # Highest similarity first; the sort is stable, so tied scores keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results.append({
        "cust_id": cust_id,
        # float() turns the NumPy scalar into a plain Python float so that the
        # list's string form (as written to CSV) reads "1.0" and not
        # "np.float64(1.0)" on NumPy >= 2.
        "lookalikes": [
            (customer_ids[j], float(round(score, 2)))
            for j, score in candidates[:TOP_K]
        ],
    })

In [15]:
# Creating Lookalike.csv
# One row per target customer: its id and the list of (lookalike id, score) pairs.
lookalike_df = pd.DataFrame({
    column: [entry[column] for entry in lookalike_results]
    for column in ("cust_id", "lookalikes")
})
lookalike_df.to_csv("Lookalike.csv", index=False)

In [17]:
# Print up to the first 20 rows of the lookalike table as a quick sanity check
preview_rows = lookalike_df.head(20)
print(preview_rows)

   cust_id                                     lookalikes
0    C0001     [(C0154, 1.0), (C0026, 1.0), (C0069, 1.0)]
1    C0002     [(C0029, 1.0), (C0088, 1.0), (C0062, 1.0)]
2    C0003     [(C0038, 1.0), (C0160, 1.0), (C0189, 1.0)]
3    C0004     [(C0017, 1.0), (C0041, 1.0), (C0175, 1.0)]
4    C0005     [(C0192, 1.0), (C0186, 1.0), (C0140, 1.0)]
5    C0006     [(C0117, 1.0), (C0139, 1.0), (C0064, 1.0)]
6    C0007     [(C0146, 1.0), (C0115, 1.0), (C0050, 1.0)]
7    C0008     [(C0113, 1.0), (C0136, 1.0), (C0124, 1.0)]
8    C0009   [(C0150, 1.0), (C0198, 0.99), (C0061, 0.99)]
9    C0010   [(C0176, 1.0), (C0029, 0.99), (C0144, 0.99)]
10   C0011     [(C0139, 1.0), (C0064, 1.0), (C0024, 1.0)]
11   C0012     [(C0163, 1.0), (C0182, 1.0), (C0124, 1.0)]
12   C0013     [(C0145, 1.0), (C0099, 1.0), (C0200, 1.0)]
13   C0014   [(C0097, 1.0), (C0110, 0.99), (C0025, 0.98)]
14   C0015  [(C0123, 0.99), (C0131, 0.99), (C0085, 0.99)]
15   C0016   [(C0040, 1.0), (C0092, 0.99), (C0056, 0.99)]
16   C0017    