In [1]:
# Import necessary libraries
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three raw CSV inputs (customer profiles, product catalogue, transactions)
customers, products, transactions = (
    pd.read_csv(path)
    for path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Enrich every transaction row with its customer and product attributes.
# Left joins keep all transactions even when a lookup key has no match.
merged_df = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

## Feature Engineering

In [4]:
# Aggregating transaction-level data for each customer
def _most_frequent(values):
    """Return the most frequent non-null value of a Series.

    Ties resolve to the first entry returned by ``Series.mode()``.
    When the group has no non-null value at all (``mode()`` is empty),
    the placeholder label "Unknown" is returned instead of raising, so the
    customer still gets a category label for the later one-hot encoding.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else "Unknown"


# Named aggregation binds each output column to its source column and
# function explicitly, so no positional renaming of a MultiIndex is needed.
customer_features = (
    merged_df.groupby("CustomerID")
    .agg(
        TotalSpending=("TotalValue", "sum"),                # Total spending
        AvgSpending=("TotalValue", "mean"),                 # Average spending per transaction
        TotalQuantity=("Quantity", "sum"),                  # Total quantity purchased
        MostPurchasedCategory=("Category", _most_frequent), # Most purchased category
        Region=("Region", "first"),                         # Region from customer profile
        SignupDate=("SignupDate", "first"),                 # Signup date from customer profile
    )
    .reset_index()
)

In [5]:
# Turn the signup date into a numeric "age of account" feature.
# The reference point is the wall-clock time at execution, so the absolute
# day counts depend on when this cell is run.
signup_dates = pd.to_datetime(customer_features["SignupDate"])
customer_features["SignupDate"] = signup_dates
reference_time = pd.Timestamp.now()
customer_features["DaysSinceSignup"] = (reference_time - signup_dates).dt.days

In [6]:
# Drop original signup date (DaysSinceSignup now carries that information).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
customer_features = customer_features.drop(columns=["SignupDate"], errors="ignore")

In [7]:
# Expand the categorical columns into 0/1 indicator columns
categorical_columns = ["Region", "MostPurchasedCategory"]
encoder = OneHotEncoder()
encoded_sparse = encoder.fit_transform(customer_features[categorical_columns])
# Convert the encoder output to a dense array so it can be stacked with the numeric features
encoded_features = encoded_sparse.toarray()

In [8]:
# Select the numeric columns and rescale each one with min-max scaling
numeric_columns = ["TotalSpending", "AvgSpending", "TotalQuantity", "DaysSinceSignup"]
numerical_features = customer_features[numeric_columns]
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(numerical_features)

In [9]:
# Final feature matrix: scaled numeric columns followed by the one-hot columns,
# joined side by side (one row per customer)
feature_matrix = np.concatenate([scaled_features, encoded_features], axis=1)

## Similarity Calculation

In [10]:
# Compute pairwise cosine similarity between all rows of feature_matrix.
# Row/column i of the result corresponds to row i of feature_matrix, which
# follows the row order of customer_features.
similarity_matrix = cosine_similarity(feature_matrix)

## Generating Lookalike Recommendations

In [11]:
# Build a list with the top lookalikes (most similar other customers) per customer
TOP_N_LOOKALIKES = 3  # how many similar customers to keep for each customer

# Materialise the IDs once; their order matches the rows of similarity_matrix
customer_ids = customer_features["CustomerID"].tolist()
lookalike_results = []

for idx, customer_id in enumerate(customer_ids):
    # Get similarity scores for the current customer
    similarities = similarity_matrix[idx]
    # Pair every *other* customer with its score. float() turns the NumPy
    # scalar into a plain Python float so the score lists serialise as plain
    # numbers (NumPy >= 2 renders scalars inside a list as "np.float64(...)").
    candidates = [
        (other_id, float(score))
        for other_id, score in zip(customer_ids, similarities)
        if other_id != customer_id
    ]
    # Highest similarity first; the sort is stable, so ties keep row order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Select the top N similar customers
    top_matches = candidates[:TOP_N_LOOKALIKES]
    lookalike_results.append({
        "CustomerID": customer_id,
        "Lookalikes": [
            {"CustomerID": other_id, "SimilarityScore": score}
            for other_id, score in top_matches
        ]
    })

In [12]:
# Flatten the lookalike results into one row per customer:
# the customer's ID, the list of lookalike IDs, and the matching list of scores
lookalike_rows = []
for result in lookalike_results:
    matches = result["Lookalikes"]
    lookalike_rows.append({
        "CustomerID": result["CustomerID"],
        "LookalikeIDs": [match["CustomerID"] for match in matches],
        "SimilarityScores": [match["SimilarityScore"] for match in matches],
    })
lookalike_df = pd.DataFrame(lookalike_rows)

# Write the table to disk without the DataFrame index
lookalike_df.to_csv("Ajay_K_P_Lookalike.csv", index=False)
