In [2]:
import pandas as pd

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Display basic information about each dataset
customers_info = customers.info()
products_info = products.info()
transactions_info = transactions.info()

# Display the first few rows of each dataset
customers_head = customers.head()
products_head = products.head()
transactions_head = transactions.head()

customers_info, customers_head, products_info, products_head, transactions_info, transactions_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  


(None,
   CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
 None,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
 None,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  20

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Merge datasets
merged = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Create customer profiles
customer_profiles = merged.groupby("CustomerID").agg({
    "TotalValue": "sum",  # Total spending
    "Quantity": "sum",    # Total quantity purchased
    "Category": lambda x: x.mode()[0],  # Most purchased category
    "Region": "first"     # Customer's region
}).reset_index()

# Convert categorical columns to numerical for similarity calculations
customer_profiles_encoded = pd.get_dummies(customer_profiles, columns=["Category", "Region"])

# Normalize numerical columns for consistent scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_cols = ["TotalValue", "Quantity"]
customer_profiles_encoded[numerical_cols] = scaler.fit_transform(customer_profiles_encoded[numerical_cols])

# Calculate similarity scores for the first 20 customers
subset_customers = customer_profiles_encoded.iloc[:20]
similarity_matrix = cosine_similarity(subset_customers.iloc[:, 1:], customer_profiles_encoded.iloc[:, 1:])

# Find top 3 lookalikes for each customer
lookalike_dict = {}
for idx, customer in enumerate(subset_customers["CustomerID"]):
    # Sort scores and get top 3 excluding self (index 0)
    top_indices = np.argsort(-similarity_matrix[idx, :])[1:4]
    lookalikes = [
        (customer_profiles_encoded.iloc[i]["CustomerID"], similarity_matrix[idx, i])
        for i in top_indices
    ]
    lookalike_dict[customer] = lookalikes

# Prepare Lookalike.csv
lookalike_output = pd.DataFrame([
    {"cust_id": customer, "lookalikes": lookalikes}
    for customer, lookalikes in lookalike_dict.items()
])

lookalike_path = "Lookalike.csv"
lookalike_output.to_csv(lookalike_path, index=False)

lookalike_path


'Lookalike.csv'