In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables from /content, in the order
# customers, transactions, products.
customers, transactions, products = map(
    pd.read_csv,
    (
        '/content/Customers.csv',
        '/content/Transactions.csv',
        '/content/Products.csv',
    ),
)


# Show each input table's columns so the merge keys can be checked by eye.
for label, frame in (
    ("Customers Columns:", customers),
    ("Transactions Columns:", transactions),
    ("Products Columns:", products),
):
    print(label, frame.columns)

# Attach customer attributes to every transaction, then product attributes.
# Both merges rely on pandas' default join type.
with_customers = transactions.merge(customers, on="CustomerID")
merged = with_customers.merge(products, on="ProductID")

# Show the merged table's columns and first rows to confirm its structure.
print("Merged Columns:", merged.columns)
print(merged.head())


amount_col = 'TotalValue'


def _top_category(categories):
    """Return the most frequent value in *categories*, or 'Unknown' if none.

    ``Series.mode`` is evaluated once per group (the previous inline lambda
    evaluated it twice). When several values tie, the first entry of the
    mode result is used; an empty mode result yields 'Unknown'.
    """
    modes = categories.mode()
    return 'Unknown' if modes.empty else modes.iloc[0]


# Aggregate transaction history per customer: total and mean of the amount
# column, number of transactions, and the most frequently bought category.
customer_features = (
    merged.groupby('CustomerID')
    .agg(
        total_spent=(amount_col, 'sum'),
        avg_transaction_value=(amount_col, 'mean'),
        total_transactions=('TransactionID', 'count'),
        most_purchased_category=('Category', _top_category),
    )
    .reset_index()
)

# One-hot encode the favourite category, keeping an indicator column for
# EVERY category. drop_first=True is meant for regression-style models; for
# a similarity measure it encodes the dropped category as all zeros, so a
# mismatch involving that category differs in one column while any other
# mismatch differs in two, which skews the similarity scores.
# dtype=float keeps the indicators numeric regardless of pandas version.
customer_features = pd.get_dummies(
    customer_features,
    columns=['most_purchased_category'],
    drop_first=False,
    dtype=float,
)

# Standardize every feature column (everything except the CustomerID key)
# so that no single feature dominates because of its scale.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(
    customer_features.drop(columns=['CustomerID'])
)

# Pairwise cosine similarity between customers; row and column order follow
# the row order of customer_features.
similarity_matrix = cosine_similarity(normalized_features)

# Generate Lookalike Recommendations
customer_ids = customer_features['CustomerID'].tolist()
recommendations = {}

# Select the target customers by ID rather than by position: a customer with
# no transactions has no row in customer_features, so slicing the first 20
# rows could silently include customers beyond C0020.
target_ids = {f"C{number:04d}" for number in range(1, 21)}  # C0001 - C0020
top_n = 3  # number of lookalikes kept per target customer

for idx, cust_id in enumerate(customer_ids):
    if cust_id not in target_ids:
        continue

    scores = similarity_matrix[idx]
    # Every other customer paired with its similarity to the target.
    candidates = [
        (other_id, scores[i])
        for i, other_id in enumerate(customer_ids)
        if i != idx
    ]
    # The sort is stable, so equally similar customers keep customer_ids order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    recommendations[cust_id] = candidates[:top_n]

# Create Lookalike.csv
# Each row holds a target customer and the string form of its list of
# (customer_id, score) pairs. Scores are converted to built-in float first:
# under NumPy >= 2 the repr of a NumPy scalar is "np.float64(...)", which
# would otherwise end up verbatim in the CSV.
lookalike_data = [
    {
        "cust_id": cust,
        "similar_customers": str(
            [(other_id, float(score)) for other_id, score in similar_customers]
        ),
    }
    for cust, similar_customers in recommendations.items()
]
# Explicit columns keep the CSV header even when there are no recommendations.
lookalike_df = pd.DataFrame(lookalike_data, columns=["cust_id", "similar_customers"])
lookalike_df.to_csv('/content/Lookalike.csv', index=False)

print("Lookalike.csv has been created successfully.")


Customers Columns: Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Transactions Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Products Columns: Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')
Merged Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10     