In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Step 1: Read the three source tables from disk
customers_df = pd.read_csv(r"C:\Users\akhil\OneDrive\Desktop\Customers.csv")
transactions_df = pd.read_csv(r"C:\Users\akhil\OneDrive\Desktop\Transactions.csv")
products_df = pd.read_csv(r"C:\Users\akhil\OneDrive\Desktop\Products.csv")

# Show which customer attributes are available before building features
print(customers_df.columns)

# Step 2: Attach product details to every transaction row (left join keeps
# transactions whose ProductID has no match in the product table)
merged_df = transactions_df.merge(products_df, on='ProductID', how='left')

# Step 3: Per-customer aggregates - total spend and number of distinct transactions
purchases_by_customer = merged_df.groupby('CustomerID')
customer_spending = purchases_by_customer['TotalValue'].sum().reset_index()
customer_frequency = purchases_by_customer['TransactionID'].nunique().reset_index()

# Step 4: Extend the customer table with both aggregates; the left joins keep
# every customer, including those that never appear in the transactions
customer_profile = (
    customers_df
    .merge(customer_spending, on='CustomerID', how='left')
    .merge(customer_frequency, on='CustomerID', how='left')
)

# Customers without any purchase come out of the joins as NaN; treat them as
# zero spend and zero transactions
customer_profile = customer_profile.fillna({'TotalValue': 0, 'TransactionID': 0})

# Step 5: Feature selection - print the resulting columns as a sanity check
print(customer_profile.columns)

# Only the two aggregates are used as features: 'TotalValue' (total spend) and
# 'TransactionID' (which at this point holds the distinct-transaction count)
feature_columns = ['TotalValue', 'TransactionID']
customer_features = customer_profile[feature_columns]

# Step 6: Standardize the features (zero mean, unit variance per column)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)

# Step 7: Pairwise cosine similarity; row/column i corresponds to row i of
# customer_profile
similarity_matrix = cosine_similarity(scaled_features)

# Step 8: Create a function to get the top N similar customers
def get_top_n_similar_customers(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``customer_profile`` DataFrame and the
    ``similarity_matrix`` built from it; row ``i`` of the matrix is taken to
    describe row ``i`` of ``customer_profile``.

    Args:
        customer_id: Value to look up in ``customer_profile['CustomerID']``.
        top_n: Maximum number of lookalikes to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples ordered from most
        to least similar. The queried customer is never part of the result.
        Fewer than ``top_n`` entries are returned when there are not enough
        other customers.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``customer_profile``.
    """
    # Locate the customer by *position* so the lookup stays aligned with the
    # similarity matrix even if the DataFrame index is not 0..n-1.
    id_matches = np.flatnonzero(
        (customer_profile['CustomerID'] == customer_id).to_numpy()
    )
    if id_matches.size == 0:
        raise IndexError(
            f"CustomerID {customer_id!r} not found in customer_profile"
        )
    customer_index = id_matches[0]

    if top_n <= 0:
        return []

    similarity_scores = similarity_matrix[customer_index]

    # Rank everybody from most to least similar. A stable sort keeps tied
    # scores in row order, which makes the output deterministic.
    ranked_indices = np.argsort(-similarity_scores, kind='stable')

    # Drop the customer explicitly by index. Relying on "self is always the
    # last element of argsort" breaks when other customers tie at 1.0 or when
    # floating-point rounding pushes the self-similarity slightly below 1.0.
    ranked_indices = ranked_indices[ranked_indices != customer_index][:top_n]

    all_customer_ids = customer_profile['CustomerID'].to_numpy()
    return [(all_customer_ids[i], similarity_scores[i]) for i in ranked_indices]

# Step 9: Build lookalike recommendations for the first 20 rows of
# customer_profile (C0001 to C0020 when the customer file is in ID order)
lookalike_map = defaultdict(list)
lookalike_map.update(
    (source_id, get_top_n_similar_customers(source_id))
    for source_id in customer_profile['CustomerID'].iloc[:20]
)

# Step 10: Flatten the map into one row per (customer, lookalike) pair
lookalike_data = [
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

# Step 11: Tabulate the pairs and write them out as Lookalike.csv
output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)
lookalike_df.to_csv(r"C:\Users\akhil\OneDrive\Desktop\Lookalike.csv", index=False)

# Preview the first rows of what was written
print(lookalike_df.head())


Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate', 'TotalValue',
       'TransactionID'],
      dtype='object')
  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0157         1.000000
1      C0001               C0152         1.000000
2      C0001               C0164         1.000000
3      C0002               C0029         0.999816
4      C0002               C0199         0.999488
