In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score

In [7]:
# Task 1: Load and Explore Data
# Read the three raw tables from CSV files in the current working directory.
transactions = pd.read_csv("Transactions.csv")
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")

# Attach the customer row, then the product row, to every transaction.
# Both joins use pandas' default (inner) join, so a transaction is kept only
# when its CustomerID and ProductID are both present in the lookup tables.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID")
merged = pd.merge(transactions_with_customers, products, on="ProductID")

# Feature engineering: collapse the merged transactions to one row per customer.
# 'Price_x' is the suffixed price column produced by the merge above
# (presumably the transaction-side price, since '_x' is pandas' default suffix
# for the left frame — TODO confirm against the CSV schemas).
spend_by_customer = merged.groupby('CustomerID', as_index=False).agg(
    TotalValue=('TotalValue', 'sum'),   # total spending by the customer
    Quantity=('Quantity', 'sum'),       # total quantity purchased
    Price_x=('Price_x', 'mean'),        # mean of Price_x over the customer's rows
)

# Standardize the three numeric features (zero mean, unit variance per column).
# The scaler is fitted while the price column is still called 'Price_x'.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(spend_by_customer[['TotalValue', 'Quantity', 'Price_x']])

# Give the price feature a descriptive name for everything downstream.
customer_features = spend_by_customer.rename(columns={'Price_x': 'AvgTransactionPrice'})

# Pairwise cosine similarity between the customers' standardized feature rows.
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so a score can be looked up as
# similarity_df[customer_a][customer_b].
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Find the top 3 lookalikes for the first 20 customers in customer_features.
# NOTE(review): the stated intent is customers C0001-C0020; the positional
# slice below matches that only if each of those IDs is present in
# customer_features, in that order — confirm against the data.
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

lookalikes = {}
for customer in customer_features['CustomerID'].iloc[:N_TARGET_CUSTOMERS]:
    # Exclude the customer's own entry by label instead of assuming it sorts
    # first: ties, floating-point rounding, or an all-zero scaled vector
    # (self-similarity 0) can put another customer ahead of the self-match.
    candidate_scores = similarity_df[customer].drop(labels=customer)
    # nlargest returns the highest scores in descending order.
    similar_customers = candidate_scores.nlargest(N_LOOKALIKES)
    # Store plain Python floats so the tuples written to the CSV read as
    # ('C0107', 0.98...) rather than ('C0107', np.float64(0.98...)) on NumPy >= 2.
    lookalikes[customer] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# One row per target customer; each cell holds a (CustomerID, similarity) tuple.
lookalike_df = pd.DataFrame.from_dict(lookalikes, orient='index', columns=['Top1', 'Top2', 'Top3'])
lookalike_df.to_csv("Gokul_Manoj_Lookalike.csv")
