In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
transactions = pd.read_csv("transactions.csv")
products = pd.read_csv("products.csv")
customers = pd.read_csv("customers.csv")

# Step 1: Debug - show which CustomerIDs each dataset contains
print("CustomerIDs in transactions:", transactions["CustomerID"].unique())
print("CustomerIDs in customers:", customers["CustomerID"].unique())

# Step 2: Attach product attributes to every transaction.
# customers.csv is not merged in: only its key column was ever selected, so a left join
# added no information and would duplicate transaction rows (inflating the sums below)
# if a CustomerID appeared more than once in customers.csv.
df = transactions.merge(products, on="ProductID", how="left")

# Debugging: Check merged columns
print("Columns in merged df:", df.columns)

# Step 3: Keep a single 'Price' column, taken from products.csv.
# pandas only adds the _x/_y suffixes when both inputs carry a 'Price' column.
if "Price_y" in df.columns:
    df["Price"] = df["Price_y"]  # Use 'Price_y' from products.csv
    df = df.drop(columns=["Price_x", "Price_y"])  # Drop extra Price columns

# Debugging: transactions whose CustomerID is null or absent from customers.csv.
# A left join keeps the left-hand key as-is, so an isnull() test on the merged key
# cannot reveal unknown customers; compare against the customers table explicitly.
unknown_customer = df["CustomerID"].isna() | ~df["CustomerID"].isin(customers["CustomerID"])
missing_customers = df[unknown_customer]
print("Missing CustomerIDs:", missing_customers)


def _favorite_category(categories):
    """Return the most frequent category of a group, or "Unknown" if mode() is empty."""
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else "Unknown"


# Step 4: Aggregate Customer Features (one row per CustomerID)
customer_features = df.groupby("CustomerID").agg(
    Total_Spending=("TotalValue", "sum"),
    Total_Quantity=("Quantity", "sum"),
    Avg_Price=("Price", "mean"),
    Favorite_Category=("Category", _favorite_category),
).reset_index()

# One-hot encode 'Favorite_Category'
customer_features = pd.get_dummies(customer_features, columns=["Favorite_Category"], drop_first=True)

# Step 5: Normalize Data
# Take column names and index from the frame that is actually scaled instead of assuming
# 'CustomerID' is the first column and that both frames share a default index.
feature_frame = customer_features.drop(columns=["CustomerID"])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)
customer_features_scaled = pd.DataFrame(
    scaled_features, columns=feature_frame.columns, index=customer_features.index
)
customer_features_scaled["CustomerID"] = customer_features["CustomerID"]

# Step 6: Compute Similarity for Lookalike Model
# Rows and columns of similarity_df are both labelled by CustomerID.
similarity_matrix = cosine_similarity(customer_features_scaled.drop(columns=["CustomerID"]))
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)
# Step 7: Customer Segmentation (Example: Finding Most Similar Customers)
def find_similar_customers(customer_id, top_n=5):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label to look up in the module-level ``similarity_df``.
        top_n: Maximum number of lookalikes to return; values below 1 give an
            empty result.

    Returns:
        A Series of similarity scores indexed by the other customers' labels,
        sorted from most to least similar, or a "not found" message string
        when ``customer_id`` is not in ``similarity_df.index``.
    """
    if customer_id not in similarity_df.index:
        return f"CustomerID {customer_id} not found."
    # Remove the queried customer by label, not by position: when another customer
    # ties with it (or rounding puts its self-similarity below a neighbour's), the
    # first sorted entry is not guaranteed to be the customer itself.
    scores = similarity_df[customer_id].drop(labels=customer_id)
    # A stable sort keeps tied customers in their original order, so results are
    # reproducible from run to run.
    ranked = scores.sort_values(ascending=False, kind="stable")
    return ranked.iloc[:max(top_n, 0)]

# Example Usage: Find top 5 similar customers to a given ID
# CustomerIDs in this dataset are strings such as "C0001" (see the Step 1 debug output),
# so an integer placeholder can never match and would only print "not found".
customer_id_example = "C0001"  # Change this to any CustomerID present in the data
print(find_similar_customers(customer_id_example))

# Step 8: Save Processed Data
customer_features.to_csv("customer_features.csv", index=False)
customer_features_scaled.to_csv("customer_features_scaled.csv", index=False)
# The index of similarity_df holds the CustomerIDs, so it is written out as well.
similarity_df.to_csv("customer_similarity.csv")


CustomerIDs in transactions: ['C0199' 'C0146' 'C0127' 'C0087' 'C0070' 'C0188' 'C0195' 'C0008' 'C0157'
 'C0130' 'C0051' 'C0075' 'C0155' 'C0092' 'C0088' 'C0109' 'C0041' 'C0101'
 'C0154' 'C0200' 'C0049' 'C0103' 'C0028' 'C0183' 'C0190' 'C0055' 'C0148'
 'C0035' 'C0120' 'C0017' 'C0145' 'C0004' 'C0010' 'C0065' 'C0132' 'C0068'
 'C0123' 'C0018' 'C0064' 'C0121' 'C0161' 'C0173' 'C0150' 'C0176' 'C0167'
 'C0184' 'C0034' 'C0053' 'C0102' 'C0134' 'C0086' 'C0036' 'C0056' 'C0076'
 'C0194' 'C0124' 'C0024' 'C0139' 'C0133' 'C0012' 'C0128' 'C0158' 'C0164'
 'C0067' 'C0142' 'C0032' 'C0063' 'C0113' 'C0136' 'C0172' 'C0125' 'C0002'
 'C0122' 'C0071' 'C0162' 'C0141' 'C0156' 'C0191' 'C0147' 'C0138' 'C0153'
 'C0043' 'C0135' 'C0091' 'C0168' 'C0069' 'C0066' 'C0186' 'C0023' 'C0105'
 'C0072' 'C0093' 'C0009' 'C0083' 'C0001' 'C0039' 'C0165' 'C0143' 'C0038'
 'C0019' 'C0073' 'C0047' 'C0171' 'C0181' 'C0108' 'C0090' 'C0126' 'C0106'
 'C0166' 'C0192' 'C0074' 'C0048' 'C0006' 'C0119' 'C0193' 'C0027' 'C0079'
 'C0082' 'C0025' 'C011