In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer

# Load the datasets

In [2]:
# Read the three raw tables (transactions, product catalogue, customer
# profiles) from CSV files in the working directory, in that order.
transactions_df, products_df, customers_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Transactions.csv', 'Products.csv', 'Customers.csv')
)

# Convert TransactionDate and SignupDate to datetime objects

In [3]:
# Parse the date columns in place so they become datetime64 values instead of
# the strings read from the CSV files.
for frame, date_column in (
    (transactions_df, 'TransactionDate'),
    (customers_df, 'SignupDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Feature Engineering

In [12]:
# Attach product attributes and then customer attributes to every transaction.
# Both joins use pandas' default inner join on the shared key column.
merged_data = (
    transactions_df
    .merge(products_df, on='ProductID')
    .merge(customers_df, on='CustomerID')
)

In [14]:
# Take the signup year from each merged row's own SignupDate. Assigning a
# Series built from customers_df instead would be aligned on row index labels,
# which are unrelated to merged_data's rows: years would land on the wrong
# customers and rows without a matching label would become NaN.
merged_data['SignupYear'] = pd.to_datetime(merged_data['SignupDate']).dt.year

# Collapse the transaction-level rows to one row per customer: total spend,
# total quantity, plus the signup year and region taken from the customer's
# first row.
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'SignupYear': 'first',
    'Region': 'first'
})

# One-Hot Encoding for Region

In [15]:
# Replace the categorical Region column with one indicator column per region.
# Not idempotent: once Region has been expanded, re-running this cell fails
# because the column no longer exists.
categorical_columns = ['Region']
customer_features = pd.get_dummies(customer_features, columns=categorical_columns)

# Normalize Features

In [25]:
import numpy as np

# Standardize first. Every step below reads `scaled_features`, so the scaler
# must run before the NaN check and the clean-up; running it last fails on a
# fresh kernel (NameError) and would overwrite the cleaned array.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)

# Diagnostics: report whether any NaN or infinite values survived scaling.
# StandardScaler disregards NaNs when fitting and keeps them when transforming.
print(np.isnan(scaled_features).any())  # Check for NaNs
print(np.isinf(scaled_features).any())

# Replace NaN with 0.0 (and +/-inf with large finite values) so that
# cosine_similarity accepts the matrix. In standardized space 0.0 is the
# column mean, so this acts as mean imputation; a separate mean imputer run
# after this step would have nothing left to fill and is therefore omitted.
scaled_features = np.nan_to_num(scaled_features)

# Similarity Matrix

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of customer feature rows;
# entry [i, j] compares row i with row j of scaled_features.
similarity_matrix = cosine_similarity(X=scaled_features)

# Function to get the top 3 lookalikes for a customer

In [29]:
def get_lookalikes(customer_id, similarity_matrix, top_n=3, customer_ids=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Identifier of the customer to find lookalikes for.
    similarity_matrix : array-like of shape (n_customers, n_customers)
        Pairwise similarity scores; row/column ``i`` belongs to the ``i``-th
        customer identifier.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield ``[]``.
    customer_ids : sequence, optional
        Customer identifiers in the same order as the matrix rows. Defaults to
        the index of the notebook-level ``customer_features`` frame.

    Returns
    -------
    list of (customer_id, score) tuples
        Ordered from most to least similar; never contains ``customer_id``.

    Raises
    ------
    KeyError
        If ``customer_id`` is not among the known identifiers.
    """
    index = customer_features.index if customer_ids is None else pd.Index(customer_ids)
    customer_index = index.get_loc(customer_id)
    similarities = similarity_matrix[customer_index]

    # Rank every customer from most to least similar, then drop the query
    # customer by position. Merely skipping the top-ranked entry is not safe:
    # with ties (e.g. another customer with identical features) or an all-zero
    # feature row the query customer is not guaranteed to be ranked first.
    ranked = np.argsort(similarities)[::-1]
    ranked = ranked[ranked != customer_index]
    top_indices = ranked[:max(top_n, 0)]

    top_customers = index[top_indices]
    top_scores = similarities[top_indices]
    return list(zip(top_customers, top_scores))

# Generate Lookalike.csv

In [30]:
# Build {customer_id: [(lookalike_id, score), ...]} for the first 20 customers
# in the feature table's index order.
target_ids = customer_features.index[:20]
lookalike_map = {}
for customer_id in target_ids:
    matches = get_lookalikes(customer_id, similarity_matrix)
    lookalike_map[customer_id] = matches

# Save to CSV

In [32]:
import csv

# Write one row per (customer, lookalike) pair. The file is opened with
# newline='' as the csv module requires; without it the writer's own "\r\n"
# line endings get translated again on Windows, leaving a blank line after
# every record.
with open('Lookalike.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['CustomerID', 'LookalikeID', 'SimilarityScore'])
    for cust_id, lookalikes in lookalike_map.items():
        for lookalike_id, score in lookalikes:
            writer.writerow([cust_id, lookalike_id, score])