In [44]:
# FirstName_LastName_Lookalike.ipynb

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from datetime import date

In [45]:
# Read the three raw CSV exports from the working directory
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Customers: label missing regions, parse signup dates, keep one row per CustomerID
customers_df = (
    customers_df
    .fillna({'Region': 'Unknown'})
    .assign(SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']))
    .drop_duplicates(subset='CustomerID')
)

# Products: discard rows with any missing field, keep one row per ProductID, force a float price
products_df = (
    products_df
    .dropna()
    .drop_duplicates(subset='ProductID')
    .astype({'Price': float})
)

# Transactions: discard rows with any missing field, parse dates,
# keep one row per TransactionID, then pin the numeric dtypes
transactions_df = (
    transactions_df
    .dropna()
    .assign(TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate']))
    .drop_duplicates(subset='TransactionID')
    .astype({'Quantity': int, 'TotalValue': float})
)

# Cross-table consistency: keep only transactions whose customer and product
# both survived the cleaning above
valid_customers = set(customers_df['CustomerID'])
valid_products = set(products_df['ProductID'])
has_known_customer = transactions_df['CustomerID'].isin(valid_customers)
has_known_product = transactions_df['ProductID'].isin(valid_products)
transactions_df = transactions_df[has_known_customer & has_known_product]

# One shape per line: customers, products, transactions
print(customers_df.shape, products_df.shape, transactions_df.shape, sep='\n')


FileNotFoundError: [Errno 2] No such file or directory: 'Customers.csv'

In [38]:
# Feature engineering
def engineer_features(customers, transactions, products, reference_date=None):
    """Build one row of model features per customer.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain ``CustomerID``, ``CustomerName``, ``Region`` and a
        datetime ``SignupDate`` column.
    transactions : pandas.DataFrame
        Must contain ``TransactionID``, ``CustomerID``, ``TotalValue``,
        ``Quantity`` and a datetime ``TransactionDate`` column.
    products : object
        Not used by this function; kept so existing callers keep working.
    reference_date : date-like, optional
        The "now" that ``recency`` and ``days_since_signup`` are measured
        against. Defaults to today's date (the previous behaviour). Pass a
        fixed date to make the features reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``customers`` with the columns ``CustomerID``,
        ``transaction_count``, ``total_spend``, ``total_quantity``,
        ``recency``, ``frequency``, ``avg_spend``, ``customer_lifetime``,
        one ``region_<name>`` indicator per distinct region, and
        ``days_since_signup``. Remaining missing values are filled with 0.
    """
    if reference_date is None:
        current_date = pd.Timestamp(date.today())
    else:
        current_date = pd.Timestamp(reference_date)

    # Per-customer purchase summary; the index of the result is CustomerID.
    agg_transactions = transactions.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'TransactionDate': ['min', 'max']
    })
    # Flatten the two-level column index produced by the mixed aggregation above.
    agg_transactions.columns = ['transaction_count', 'total_spend', 'total_quantity', 'first_purchase', 'last_purchase']

    agg_transactions['recency'] = (current_date - agg_transactions['last_purchase']).dt.days
    # NOTE: `frequency` is an exact copy of `transaction_count`. It is kept so the
    # output schema stays unchanged, but it doubles that signal's weight in any
    # distance or similarity computed over these columns.
    agg_transactions['frequency'] = agg_transactions['transaction_count']
    agg_transactions['avg_spend'] = agg_transactions['total_spend'] / agg_transactions['transaction_count']
    agg_transactions['customer_lifetime'] = (agg_transactions['last_purchase'] - agg_transactions['first_purchase']).dt.days

    # Left join keeps customers that never purchased; their aggregates are NaN for now.
    customer_features = customers.merge(agg_transactions, left_on='CustomerID', right_index=True, how='left')
    customer_features = pd.get_dummies(customer_features, columns=['Region'], prefix='region')
    customer_features['days_since_signup'] = (current_date - customer_features['SignupDate']).dt.days

    # A customer without any transaction has no last purchase. Leaving their recency
    # to the blanket fillna(0) below would rank them as the MOST recently active
    # customers, so treat their whole tenure as the inactivity period instead.
    customer_features['recency'] = customer_features['recency'].fillna(
        customer_features['days_since_signup']
    )

    customer_features = customer_features.drop(['CustomerName', 'SignupDate', 'first_purchase', 'last_purchase'], axis=1)
    # Counts, spend and lifetime of customers without purchases are genuinely zero.
    customer_features = customer_features.fillna(0)

    return customer_features

In [39]:

# Build the per-customer feature table from the cleaned inputs and preview it
features = engineer_features(
    customers=customers_df,
    transactions=transactions_df,
    products=products_df,
)
features.head()

Unnamed: 0,CustomerID,transaction_count,total_spend,total_quantity,recency,frequency,avg_spend,customer_lifetime,region_Asia,region_Europe,region_North America,region_South America,days_since_signup
0,C0001,5.0,3354.52,12.0,85.0,5.0,670.904,288.0,False,False,False,True,932
1,C0002,4.0,1862.74,10.0,54.0,4.0,465.685,278.0,True,False,False,False,1079
2,C0003,4.0,2725.38,14.0,155.0,4.0,681.345,188.0,False,False,False,True,326
3,C0004,8.0,5354.88,23.0,34.0,8.0,669.36,299.0,False,False,False,True,841
4,C0005,3.0,2034.24,7.0,83.0,3.0,678.08,233.0,True,False,False,False,896


In [40]:
# Standardise every feature column; CustomerID is an identifier, not a feature,
# so it is left out of the matrix handed to the scaler
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features.drop(columns='CustomerID'))

# Sanity check: a blank line, then the row count and the column count of the scaled matrix
print()
print(normalized_features.shape[0])
print(normalized_features.shape[1])


200
12


In [41]:
# CustomerID <-> row-position lookups. Positions follow the row order of `features`,
# which is the row order the scaled matrix was built from.
customer_id_to_index = dict(zip(features['CustomerID'], range(len(features))))
index_to_customer_id = {position: cid for cid, position in customer_id_to_index.items()}

# Pairwise cosine similarity between every pair of rows of the scaled feature matrix
similarity_matrix = cosine_similarity(normalized_features)

In [42]:
def get_top_3_lookalikes(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Reads the module-level ``customer_id_to_index``, ``index_to_customer_id``
    and ``similarity_matrix``.

    Parameters
    ----------
    customer_id : hashable
        A key of ``customer_id_to_index``.
    top_n : int, optional
        Number of lookalikes to return (default 3). Fewer are returned when
        there are not enough other customers.

    Returns
    -------
    list of (customer_id, score) tuples
        Ordered from most to least similar; ``score`` is a plain ``float``
        rounded to 4 decimals.

    Raises
    ------
    KeyError
        If ``customer_id`` is not in ``customer_id_to_index``.
    """
    idx = customer_id_to_index[customer_id]
    similarities = np.asarray(similarity_matrix[idx], dtype=float)

    # Rank from most to least similar. The stable sort makes ties resolve by
    # row position, so the result is deterministic.
    ranked = np.argsort(-similarities, kind='stable')

    # Drop the customer's own row explicitly. Relying on "self is always the last
    # element of an ascending sort" breaks on ties at the top score (for example a
    # customer with an identical feature vector), which would return the customer
    # itself and silently discard a genuine lookalike.
    neighbours = [int(i) for i in ranked if i != idx][:top_n]

    # float(...) turns NumPy scalars into plain Python floats so the tuples print
    # and serialise the same way regardless of the NumPy version.
    return [(index_to_customer_id[i], round(float(similarities[i]), 4)) for i in neighbours]

# Look up the lookalikes of customers C0001 through C0020, keyed by customer ID
lookalike_results = {
    cust_key: get_top_3_lookalikes(cust_key)
    for cust_key in (f'C{number:04d}' for number in range(1, 21))
}

print(lookalike_results)

{'C0001': [('C0152', 0.9834), ('C0107', 0.9289), ('C0174', 0.9257)], 'C0002': [('C0159', 0.9112), ('C0134', 0.9053), ('C0005', 0.8881)], 'C0003': [('C0052', 0.9249), ('C0129', 0.8505), ('C0085', 0.7609)], 'C0004': [('C0113', 0.9818), ('C0165', 0.9771), ('C0099', 0.9547)], 'C0005': [('C0159', 0.9766), ('C0027', 0.9241), ('C0007', 0.8937)], 'C0006': [('C0187', 0.9049), ('C0158', 0.8888), ('C0168', 0.8858)], 'C0007': [('C0005', 0.8937), ('C0027', 0.8719), ('C0040', 0.86)], 'C0008': [('C0098', 0.9006), ('C0156', 0.8979), ('C0065', 0.8616)], 'C0009': [('C0121', 0.9526), ('C0063', 0.8505), ('C0060', 0.8468)], 'C0010': [('C0086', 0.9003), ('C0019', 0.8764), ('C0121', 0.8554)], 'C0011': [('C0174', 0.937), ('C0133', 0.924), ('C0001', 0.9035)], 'C0012': [('C0013', 0.9416), ('C0195', 0.9269), ('C0039', 0.9139)], 'C0013': [('C0087', 0.9463), ('C0012', 0.9416), ('C0039', 0.8891)], 'C0014': [('C0058', 0.8551), ('C0073', 0.8542), ('C0097', 0.8349)], 'C0015': [('C0036', 0.9008), ('C0042', 0.8689), ('C

In [43]:
# Persist the results: dictionary keys become the row index, and each entry of a
# customer's result list becomes one column
lookalike_table = pd.DataFrame.from_dict(lookalike_results, orient='index')
lookalike_table.to_csv('ANIL_JHA_Lookalike.csv')