In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the three input tables from CSV files in the current working directory.
# The files are read in the order customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [3]:
# Build one row per transaction, enriched with customer and product columns.
# Both joins use pandas' default inner join, so a transaction whose CustomerID
# or ProductID has no match in the lookup table is dropped.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID")
merged_df = pd.merge(transactions_with_customers, products, on="ProductID")


In [4]:
# Collapse the transaction-level table into one profile row per customer.
# Named aggregation yields the final column names directly, so no separate
# rename step is needed.
customer_profiles = (
    merged_df.groupby("CustomerID")
    .agg(
        TotalSpending=("TotalValue", "sum"),            # total spending
        TransactionCount=("TransactionID", "count"),    # number of transactions
        # Price_x is the left-hand "Price" column from the merge above
        # (pandas' default suffix); averaged over the customer's purchases.
        AvgTransactionPrice=("Price_x", "mean"),
        Region=("Region", "first"),                     # assumes one region per customer
        # Most frequent category; mode() may return several values on a tie,
        # in which case the first one is taken.
        Category=("Category", lambda categories: categories.mode()[0]),
        TransactionDate=("TransactionDate", "max"),     # most recent transaction date
    )
    .reset_index()
)


In [5]:
# Add CustomerName for reference.
# Any CustomerName column left over from an earlier run of this cell is dropped
# first; without that, re-running the cell would merge onto a frame that already
# has the column and produce CustomerName_x / CustomerName_y duplicates.
customer_names = customers[["CustomerID", "CustomerName"]]
customer_profiles = (
    customer_profiles
    .drop(columns=["CustomerName"], errors="ignore")
    .merge(customer_names, on="CustomerID")
)

# Inspect customer profiles
print("Customer Profiles:")
print(customer_profiles.head())


Customer Profiles:
  CustomerID  TotalSpending  TransactionCount  AvgTransactionPrice  \
0      C0001        3354.52                 5           278.334000   
1      C0002        1862.74                 4           208.920000   
2      C0003        2725.38                 4           195.707500   
3      C0004        5354.88                 8           240.636250   
4      C0005        2034.24                 3           291.603333   

          Region     Category      TransactionDate        CustomerName  
0  South America  Electronics  2024-11-02 17:04:16    Lawrence Carroll  
1           Asia     Clothing  2024-12-03 01:41:41      Elizabeth Lutz  
2  South America   Home Decor  2024-08-24 18:54:04      Michael Rivera  
3  South America        Books  2024-12-23 14:13:52  Kathleen Rodriguez  
4           Asia  Electronics  2024-11-04 00:30:22         Laura Weber  


In [6]:
# One-hot encode the categorical features: Region and Category.
# sparse_output=False makes the encoder return a dense NumPy array.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_features = encoder.fit_transform(customer_profiles[["Region", "Category"]])

# Label the encoded columns (e.g. "Region_Asia") and align them with the
# profile rows by reusing the profile index.
categorical_feature_names = encoder.get_feature_names_out(["Region", "Category"])
categorical_features_df = pd.DataFrame(
    categorical_features,
    columns=categorical_feature_names,
    index=customer_profiles.index,
)

# Combine the encoded features back into the main dataset.
# Encoded columns added by a previous run of this cell are dropped first so that
# re-running the cell does not leave duplicate column labels in customer_profiles.
customer_profiles = pd.concat(
    [
        customer_profiles.drop(columns=list(categorical_feature_names), errors="ignore"),
        categorical_features_df,
    ],
    axis=1,
)


In [7]:
# Standardise the numeric profile columns so that no single column dominates
# the similarity computation purely because of its scale.
numeric_columns = ["TotalSpending", "TransactionCount", "AvgTransactionPrice"]
scaler = StandardScaler()
numerical_features = scaler.fit_transform(customer_profiles[numeric_columns])

# Final feature matrix: scaled numeric columns followed by the one-hot columns,
# one row per customer.
combined_features = np.concatenate([numerical_features, categorical_features], axis=1)


In [8]:
# Pairwise cosine similarity between every pair of customer feature vectors.
similarity_matrix = cosine_similarity(combined_features)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = customer_profiles["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


In [9]:
# Collect the top 3 most similar customers for each customer.
# Result: {customer_id: [(other_customer_id, score), ...]} ordered by descending score.
top_3_similar = {}
for customer_id in similarity_df.index:
    similar_customers = (
        similarity_df.loc[customer_id]
        # Remove the customer's own entry by label. Skipping "the first row after
        # sorting" is not reliable: another customer can tie at 1.0 (or exceed the
        # self-score through floating-point rounding), which would list the
        # customer as their own lookalike and drop a real neighbour.
        .drop(labels=customer_id)
        # Stable sort keeps tied scores in index order, so results are reproducible.
        .sort_values(ascending=False, kind="stable")
        .head(3)
    )
    # Store plain Python floats: NumPy scalars would be written to CSV as
    # "np.float64(...)" under NumPy >= 2 when the list is stringified.
    top_3_similar[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

In [17]:
# Build the lookalike table for the first 20 entries of top_3_similar
# (intended to be customers C0001 - C0020) and write it to
# FirstName_LastName_Lookalike.csv.
first_twenty_ids = list(top_3_similar.keys())[:20]
lookalike_data = []
for listed_id in first_twenty_ids:
    lookalike_data.append({"CustomerID": listed_id, "Lookalikes": top_3_similar[listed_id]})
lookalike_df = pd.DataFrame(lookalike_data)

# Save to CSV (without the DataFrame index column)
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)

# Print the recommendations for the first 5 customers as a quick sanity check
preview_items = list(top_3_similar.items())[:5]
for shown_id, matches in preview_items:
    print(f"Customer {shown_id}:")
    for match_id, match_score in matches:
        print(f"  Similar Customer {match_id} with Score {match_score:.2f}")


Customer C0001:
  Similar Customer C0181 with Score 0.94
  Similar Customer C0192 with Score 0.87
  Similar Customer C0190 with Score 0.85
Customer C0002:
  Similar Customer C0088 with Score 0.96
  Similar Customer C0134 with Score 0.91
  Similar Customer C0106 with Score 0.91
Customer C0003:
  Similar Customer C0025 with Score 0.96
  Similar Customer C0031 with Score 0.93
  Similar Customer C0052 with Score 0.92
Customer C0004:
  Similar Customer C0165 with Score 0.97
  Similar Customer C0153 with Score 0.90
  Similar Customer C0087 with Score 0.88
Customer C0005:
  Similar Customer C0140 with Score 0.98
  Similar Customer C0186 with Score 0.98
  Similar Customer C0007 with Score 0.90
