In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors

# ======================================================
# 1. Load Data with Column Conflict Resolution
# ======================================================
# Read the three source tables.
# Products' "Price" is renamed to "ProductPrice" right after loading
# (presumably so it cannot clash with a same-named column in another table
# once everything is merged -- confirm against the CSV schemas).
customers = pd.read_csv("Customers.csv", parse_dates=["SignupDate"])
products = pd.read_csv("Products.csv")
products = products.rename(columns={"Price": "ProductPrice"})
transactions = pd.read_csv(
    "Transactions.csv",
    dtype={'CustomerID': 'string'},
    parse_dates=["TransactionDate"],
)

# Attach product attributes, then customer attributes, to every transaction.
# Both joins use pandas' default (inner) join, so transactions without a
# matching product or customer are dropped.
merged_data = pd.merge(transactions, products, on="ProductID")
merged_data = pd.merge(merged_data, customers, on="CustomerID")

# ======================================================
# 2. Feature Engineering (Enhanced)
# ======================================================
# Reference point for the recency feature: "now" at the time the script runs.
snapshot_date = pd.Timestamp.now()


def _days_since_latest(dates):
    """Return whole days between snapshot_date and the latest entry of *dates*."""
    return (snapshot_date - dates.max()).days


def _most_frequent(values, default='Unknown'):
    """Return the most frequent non-null entry of *values*, or *default*.

    value_counts() ignores missing values, so a group made up only of NaN
    yields an empty result even though the group itself is not empty; the
    emptiness check is therefore done on the counts, not on the input.
    """
    counts = values.value_counts()
    return counts.index[0] if not counts.empty else default


# One row per customer. Named aggregation assigns the final column names
# directly, so no positional renaming of a MultiIndex is needed afterwards.
customer_features = merged_data.groupby('CustomerID').agg(
    TotalSpent=('TotalValue', 'sum'),
    AvgSpent=('TotalValue', 'mean'),
    Frequency=('TransactionID', 'count'),
    Recency=('TransactionDate', _days_since_latest),
    Region=('Region', 'first'),
    FavoriteCategory=('Category', _most_frequent),
).reset_index()

customer_features.to_csv("customer_features.csv", index=False)

# ======================================================
# 3. Data Preprocessing Pipeline
# ======================================================
# Column groups handled by the preprocessing pipeline.
cat_columns = ['Region', 'FavoriteCategory']
num_features = ['TotalSpent', 'AvgSpent', 'Frequency', 'Recency']

# One-hot encode the categorical columns and standardize the numeric ones.
# sparse_threshold=0 makes the combined output always dense: the one-hot part
# is sparse, and the pd.DataFrame(...) call below expects a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns),
        ('num', StandardScaler(), num_features),
    ],
    sparse_threshold=0,
)

processed_features = preprocessor.fit_transform(
    customer_features.drop('CustomerID', axis=1)
)

cat_transformer = preprocessor.named_transformers_['cat']

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Prefer the new
# method and fall back to the old one only on releases that lack it.
if hasattr(cat_transformer, 'get_feature_names_out'):
    cat_features = cat_transformer.get_feature_names_out(cat_columns)
else:
    cat_features = cat_transformer.get_feature_names(cat_columns)

# Output columns follow the transformer order: one-hot columns, then numeric.
all_columns = np.concatenate([cat_features, num_features])

# Persist the model-ready feature matrix with readable column names.
processed_df = pd.DataFrame(processed_features, columns=all_columns)
processed_df.to_csv('processed_features.csv', index=False)

# ======================================================
# 4. Similarity Model (Optimized)
# ======================================================
# Build the cosine-distance neighbor index over the processed feature rows.
# Four neighbors are requested per query; fit() returns the estimator itself,
# so construction and fitting are chained.
model = NearestNeighbors(metric='cosine', n_neighbors=4).fit(processed_features)

# ======================================================
# 5. Generate Recommendations (With Error Handling)
# ======================================================
# Maps each target CustomerID to a list of (lookalike CustomerID, similarity)
# pairs, most similar first, at most three per customer.
lookalikes = {}
# Target customers "C0001" .. "C0020".
target_customers = [f"C{i:04d}" for i in range(1, 21)]

# Row position of every customer. processed_features was produced from
# customer_features, so both share the same row order. Built once instead of
# scanning the CustomerID column for every target.
customer_ids = customer_features['CustomerID'].tolist()
position_of = {cid: pos for pos, cid in enumerate(customer_ids)}

for cust_id in target_customers:
    pos = position_of.get(cust_id)
    if pos is None:
        print(f"Customer {cust_id} not found - skipping")
        continue

    # Best effort: a failed lookup is reported and the customer is left out
    # of lookalikes rather than aborting the whole run.
    try:
        distances, indices = model.kneighbors(processed_features[pos:pos + 1])
    except Exception as e:
        print(f"Error processing {cust_id}: {str(e)}")
        continue

    # Exclude the query customer by row position rather than assuming it is
    # the first neighbor returned: when another customer has an identical
    # feature vector the distances tie and the query may appear anywhere.
    candidates = [
        (customer_ids[neighbor_pos], round(float(1 - distance), 2))
        for neighbor_pos, distance in zip(indices[0], distances[0])
        if neighbor_pos != pos
    ]

    # Cosine similarity = 1 - cosine distance; keep the three closest.
    lookalikes[cust_id] = candidates[:3]

# ======================================================
# 6. Save Results (Required Format)
# ======================================================
# Flatten each target customer's lookalike list into one fixed-width row:
# the customer's own ID followed by up to three (ID, score) pairs.
result_columns = [
    'CustomerID',
    'Lookalike1', 'Score1',
    'Lookalike2', 'Score2',
    'Lookalike3', 'Score3',
]
row_width = len(result_columns)  # 1 ID + 3 * (ID + score) = 7

output_rows = []
for cust_id in target_customers:
    row = [cust_id]
    # Customers without an entry in lookalikes contribute no pairs at all.
    for rec in lookalikes.get(cust_id, ()):
        row.extend(rec)
    # Pad with None so rows with fewer than three pairs still fill every column.
    row.extend([None] * (row_width - len(row)))
    output_rows.append(row)

result_df = pd.DataFrame(output_rows, columns=result_columns)

result_df.to_csv("Lookalike.csv", index=False)
