In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables from disk
customers = pd.read_csv(r"C:\Users\S.Bharathi\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\S.Bharathi\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\S.Bharathi\Downloads\Transactions.csv")

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so transactions without a
# matching customer or product row are dropped.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

# Step 1: One row of purchase-behaviour features per customer.
# NOTE(review): 'Price_x' is the name pandas gives the left-hand column when
# both merged frames share a 'Price' column -- presumably Transactions and
# Products both carry one; confirm against the CSV headers.
customer_features = data.groupby('CustomerID').agg({
    'Price_x': 'mean',       # mean price over the customer's transactions
    'Quantity': 'sum',       # total units bought
    'TotalValue': 'sum',     # total amount spent
})
customer_features = customer_features.reset_index()

# Step 2: Standardise the numeric columns (everything except the ID) so that
# no single feature dominates the similarity purely because of its scale.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(
    customer_features.drop(columns='CustomerID')
)

# Step 3: Pairwise cosine similarity between all customers
similarity_matrix = cosine_similarity(normalized_features)

# Step 4: Label rows and columns with customer IDs so scores can be looked up
# by ID instead of by position.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

# Step 5: Get top 3 similar customers for the first 20 customers
def get_top_3_similar(customers, similarity_matrix):
    """Return the three most similar customers for each requested customer.

    Args:
        customers: Iterable of customer IDs. Each ID must be a row label of
            ``similarity_matrix``.
        similarity_matrix: pandas DataFrame of pairwise similarity scores
            whose rows and columns are labelled with customer IDs.

    Returns:
        A dict mapping each requested customer ID to a pandas Series of at
        most three similarity scores, indexed by the similar customer's ID
        and ordered from most to least similar.

    Raises:
        KeyError: If a requested customer ID is not a row label of
            ``similarity_matrix``.
    """
    recommendations = {}
    for customer in customers:
        # Remove the customer's own entry by label. Skipping "the first
        # element after sorting" is wrong when another customer ties with
        # the self-similarity score: the tie can sort first and the
        # customer would then be listed as their own lookalike.
        scores = similarity_matrix.loc[customer].drop(
            labels=customer, errors="ignore"
        )
        # A stable sort keeps tied scores in column order, so the output is
        # reproducible from run to run.
        ranked = scores.sort_values(ascending=False, kind="mergesort")
        recommendations[customer] = ranked.head(3)
    return recommendations

# Report on the first 20 rows of customer_features.
# NOTE(review): these are expected to be C0001 - C0020, which only holds if
# every one of those customers survived the merge -- confirm.
first_20_customers = customer_features['CustomerID'].head(20)
top_3_lookalikes = get_top_3_similar(first_20_customers, similarity_df)

# Step 6: Nested view of the result, one record per customer:
# {"cust_id": ..., "similar_customers": [{"cust_id": ..., "score": ...}, ...]}
lookalike_results = [
    {
        "cust_id": customer,
        "similar_customers": [
            {"cust_id": cust_id, "score": score}
            for cust_id, score in similar_customers.items()
        ],
    }
    for customer, similar_customers in top_3_lookalikes.items()
]

# Flatten the nested records into (customer, lookalike, score) rows
output_rows = [
    [record["cust_id"], match["cust_id"], match["score"]]
    for record in lookalike_results
    for match in record["similar_customers"]
]

# Write the long-format table to disk without the positional index
lookalike_df = pd.DataFrame(
    output_rows,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv(r"C:\Users\S.Bharathi\Downloads\Lookalike.csv", index=False)


# Show the first few rows as a quick sanity check
print(lookalike_df.head(10))


  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0103         0.997573
1      C0001             C0092         0.996879
2      C0001             C0135         0.992736
3      C0002             C0029         0.999854
4      C0002             C0077         0.996104
5      C0002             C0157         0.995478
6      C0003             C0111         0.998487
7      C0003             C0190         0.996656
8      C0003             C0038         0.990133
9      C0004             C0165         0.998390
