In [None]:
# Task 3: Customer Segmentation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Create Customer Profiles
# Collapse the transaction-level rows of merged_df into one row per customer.
# Named aggregation produces flat, self-describing column names directly, so
# no positional renaming of a MultiIndex header is needed afterwards.
customer_profiles = merged_df.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),                    # Total spending
    AvgTransactionValue=('TotalValue', 'mean'),             # Average transaction value
    TotalQuantity=('Quantity', 'sum'),                      # Total quantity purchased
    UniqueProducts=('ProductID', 'nunique'),                # Number of unique products purchased
    PreferredCategory=('Category', lambda x: x.mode()[0]),  # Most common product category
).reset_index()

# One-hot encode the PreferredCategory for similarity calculation.
# Every category keeps its own indicator column (drop_first=False): dropping
# one would encode that category as an all-zero vector, so it would contribute
# nothing to cosine similarity while all other categories do.
# dtype=float keeps the feature matrix purely numeric (newer pandas versions
# emit bool dummies by default).
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=['PreferredCategory'],
    drop_first=False,
    dtype=float,
)

# Standardize the numerical features so that no single feature dominates the
# similarity purely because of its scale.
scaler = StandardScaler()
numerical_features = ['TotalSpending', 'AvgTransactionValue', 'TotalQuantity', 'UniqueProducts']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

# Step 2: Calculate Similarity
# Compute pairwise cosine similarity for all customers; the resulting square
# frame is labelled by CustomerID on both axes.
feature_matrix = customer_profiles.drop(columns=['CustomerID'])
customer_similarity = cosine_similarity(feature_matrix)
customer_similarity_df = pd.DataFrame(customer_similarity,
                                      index=customer_profiles['CustomerID'],
                                      columns=customer_profiles['CustomerID'])

# Step 3: Generate Top 3 Recommendations for Customers C0001-C0020
# Select the targets by ID rather than by position: taking the first 20 rows
# would silently pick up other customers if any of C0001-C0020 has no
# transactions (and therefore no profile row).
target_ids = ['C{:04d}'.format(i) for i in range(1, 21)]
target_customers = customer_profiles.loc[
    customer_profiles['CustomerID'].isin(target_ids), 'CustomerID'
]

lookalikes = {}
for cust_id in target_customers:
    # Similarity of every other customer to the target (the target itself is
    # removed so it cannot be its own best match), keeping the 3 highest.
    top_3 = customer_similarity_df[cust_id].drop(cust_id).nlargest(3)

    # Store [CustomerID, SimilarityScore] pairs; float() keeps the scores as
    # plain Python floats so they are written to the CSV as bare numbers.
    lookalikes[cust_id] = [[other_id, float(score)] for other_id, score in top_3.items()]

# Step 4: Save Lookalikes to CSV
# One row per target customer; each Lookalike column holds an [id, score] pair.
lookalikes_df = pd.DataFrame.from_dict(lookalikes, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])
lookalikes_df.to_csv("/mnt/data/Lookalike.csv", index_label="CustomerID")

lookalikes_df.head()
