In [43]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from datetime import datetime

# Step 1: Load Data
# Each input table is read from a CSV file in the current working directory.

# Customer master data
customers_file = "Customers.csv"
customers = pd.read_csv(customers_file)

# Product catalogue
products_file = "Products.csv"
products = pd.read_csv(products_file)

# Individual purchase records
transactions_file = "Transactions.csv"
transactions = pd.read_csv(transactions_file)

# Step 2: Data Integration
# Attach product details (e.g. Category) to every transaction row. The left
# merge keeps transactions whose ProductID has no match in `products`; their
# product columns are NaN.
transactions = transactions.merge(products, on='ProductID', how='left')

# When both inputs carry a 'Price' column, pandas suffixes the transaction-side
# one as 'Price_x'. If only one of them has it, the column keeps its plain
# name, so fall back to 'Price' instead of failing with a KeyError.
price_col = 'Price_x' if 'Price_x' in transactions.columns else 'Price'


def _most_frequent(values):
    """Return the most frequent non-null value of *values*, or NaN if none.

    Ties are resolved by taking the first entry of ``Series.mode()``.
    """
    modes = values.mode()
    # Series.mode() drops NaN by default, so a group whose values are all NaN
    # yields an empty result; indexing that with [0] would raise.
    return modes.iloc[0] if not modes.empty else np.nan


# Aggregate transaction data at the customer level: one row per CustomerID
customer_transactions = transactions.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    AvgPurchasePrice=(price_col, 'mean'),
    PreferredCategory=('Category', _most_frequent),  # Most frequent category
).reset_index()

# Merge customer-level transaction data with customers. The left merge keeps
# customers without any transaction; their aggregate columns are NaN.
customers = customers.merge(customer_transactions, on='CustomerID', how='left')

# Step 3: Feature Engineering
# Encode categorical features as integer labels
le_region = LabelEncoder()
le_category = LabelEncoder()
customers['RegionEncoded'] = le_region.fit_transform(customers['Region'])

# Customers without transactions have no PreferredCategory (NaN from the left
# merge above). Fitting the encoder on a column that still contains NaN either
# fails or turns NaN into a regular class label (depending on the scikit-learn
# version), so a later fillna(-1) on the encoded column never takes effect.
# Encode only the known categories and mark the rest with -1 explicitly.
has_category = customers['PreferredCategory'].notna()
customers['PreferredCategoryEncoded'] = -1
if has_category.any():
    customers.loc[has_category, 'PreferredCategoryEncoded'] = le_category.fit_transform(
        customers.loc[has_category, 'PreferredCategory']
    )

# Convert 'SignupDate' to days since signup, measured against the time the
# script runs (so the raw value changes from day to day).
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
current_date = datetime.now()
customers['DaysSinceSignup'] = (current_date - customers['SignupDate']).dt.days

# Fill missing transaction values with 0 (for customers without transactions)
customers['TotalSpend'] = customers['TotalSpend'].fillna(0)
customers['AvgPurchasePrice'] = customers['AvgPurchasePrice'].fillna(0)

# Normalize numerical features to zero mean and unit variance
scaler = StandardScaler()
numerical_cols = ['TotalSpend', 'AvgPurchasePrice', 'DaysSinceSignup']
customers[numerical_cols] = scaler.fit_transform(customers[numerical_cols])

# Step 4: Similarity Calculation
# Columns that make up each customer's feature vector
feature_cols = [
    'RegionEncoded',
    'DaysSinceSignup',
    'TotalSpend',
    'AvgPurchasePrice',
    'PreferredCategoryEncoded',
]
features = customers.loc[:, feature_cols]

# Pairwise cosine similarity between all customer rows: entry [i, j] compares
# the feature vector of row i with that of row j.
similarity_matrix = cosine_similarity(features)

# Step 5: Identify Top 3 Lookalikes for First 20 Customers
# Maps each of the first 20 customer IDs to a list of
# (lookalike CustomerID, similarity score rounded to 3 decimals) tuples,
# ordered from most to least similar.
lookalike_map = {}

# Plain Python list of IDs: lookups by row position do not depend on the
# DataFrame's index labels, and the values are native Python scalars.
customer_ids = customers['CustomerID'].tolist()

# similarity_matrix rows follow the row order of `customers`, so the position
# of a customer within the first 20 rows is also its row in the matrix.
first_20_customers = customers['CustomerID'][:20]
for i, customer_id in enumerate(first_20_customers):
    # Drop the customer's own entry by position. Relying on it being ranked
    # first is unsafe: ties or floating-point rounding can rank another
    # customer ahead of it, which would list the customer as its own lookalike.
    # float() converts NumPy scalars so the CSV shows plain numbers rather
    # than reprs such as "np.float64(0.998)".
    scores = [
        (j, float(score))
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    # sorted() is stable, so equal scores keep their original row order
    top_matches = sorted(scores, key=lambda x: x[1], reverse=True)[:3]
    # Map customer ID to top matches and scores
    lookalike_map[customer_id] = [
        (customer_ids[j], round(score, 3)) for j, score in top_matches
    ]

# Step 6: Create Lookalike.csv
# One record per customer: the ID and its list of (lookalike ID, score) pairs.
lookalike_rows = [
    {'CustomerID': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
]
output = pd.DataFrame(lookalike_rows)

# Write without the DataFrame index so the file has only the two columns.
output.to_csv('Aman_Biswakarma_Lookalike.csv', index=False)

print("Aman_Biswakarma_Lookalike.csv created successfully!")


Aman_Biswakarma_Lookalike.csv created successfully!
