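# Task 2: Customer Lookalike Model

Builds one feature vector per customer from the transaction, customer, and product tables, ranks customers by cosine similarity, and writes the top three lookalikes for customers C0001–C0020 to `Lookalike.csv`.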


In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the three input tables; paths are relative to the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [5]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins are inner joins (the merge default), so a transaction without a
# matching customer or product row is dropped.
transactions_with_customers = transactions.merge(customers, how="inner", on="CustomerID")
data = transactions_with_customers.merge(products, how="inner", on="ProductID")

In [6]:
# Feature engineering: collapse the merged rows to one row per customer.
def _most_frequent(values):
    """Return the most frequent value; on a tie, the first entry of mode() is used."""
    return values.mode()[0]


aggregations = {
    "TotalValue": "sum",         # total spending
    "Quantity": "sum",           # total quantity purchased
    "Category": _most_frequent,  # most frequently purchased category
    "Region": "first",           # customer region
}
per_customer = data.groupby("CustomerID").agg(aggregations)
# Turn the CustomerID group key back into a regular column (0..n-1 index).
customer_features = per_customer.reset_index()

In [7]:
# One-hot encode categorical features (Region, Category).
# dtype=float is passed explicitly because the default indicator dtype differs
# between pandas versions (bool since pandas 2.0, uint8 before). Float
# indicators keep the feature frame homogeneous and numeric, so the similarity
# computation receives the same 0.0/1.0 values on every version.
customer_features = pd.get_dummies(
    customer_features,
    columns=["Region", "Category"],
    dtype=float,
)

In [8]:
# Standardize the numeric spend/volume columns (StandardScaler defaults).
numeric_cols = ["TotalValue", "Quantity"]
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(customer_features[numeric_cols])
# Whole-column assignment replaces the original columns with the scaled values.
customer_features[numeric_cols] = scaled_numeric


In [9]:
# Pairwise cosine similarity between customers.
# The identifier column is not a feature, so it is removed first; row i of the
# resulting matrix corresponds to row i of customer_features.
feature_matrix = customer_features.drop(columns=["CustomerID"])
similarity_matrix = cosine_similarity(feature_matrix)

In [10]:
# Generate top 3 lookalikes for each customer.
# lookalikes maps CustomerID -> list of (other CustomerID, similarity score),
# ordered from most to least similar.
customer_ids = customer_features["CustomerID"].tolist()  # position i <-> matrix row i
lookalikes = {}
for i, customer_id in enumerate(customer_ids):
    # Drop the customer's own entry by position instead of assuming it sorts
    # first: another customer can tie with (or, through rounding, edge past)
    # the self-similarity, in which case slicing off the first element would
    # discard a real match and keep the customer as their own lookalike.
    # Scores are converted to plain floats so that later string formatting does
    # not depend on how NumPy scalars are rendered.
    candidates = [
        (j, float(score))
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    # Stable sort, highest similarity first; ties keep row order.
    candidates.sort(key=lambda pair: -pair[1])
    lookalikes[customer_id] = [
        (customer_ids[j], score) for j, score in candidates[:3]
    ]

In [11]:
# Create Lookalike.csv for customers C0001-C0020.
# The target IDs are built explicitly so the selection does not depend on the
# row order of Customers.csv.
target_ids = ["C{:04d}".format(n) for n in range(1, 21)]

# A target customer has no entry in `lookalikes` when no feature row was
# produced for them; report that instead of silently writing fewer rows.
missing_ids = [cid for cid in target_ids if cid not in lookalikes]
if missing_ids:
    print("No lookalikes available for: {}".format(", ".join(missing_ids)))

lookalike_subset = {cid: lookalikes[cid] for cid in target_ids if cid in lookalikes}
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_subset.keys()),
    # Each cell holds the string form of the [(CustomerID, score), ...] list.
    "Lookalikes": [str(matches) for matches in lookalike_subset.values()],
})
lookalike_df.to_csv("Lookalike.csv", index=False)

In [12]:
# Status message only; nothing here checks that the CSV was actually written.
print("Lookalike.csv has been generated.")

Lookalike.csv has been generated.
