In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [3]:
# Load Data
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [5]:
# Display first few rows of each dataset
print("Customers Data:")
print(customers.head())
print("\nProducts Data:")
print(products.head())
print("\nTransactions Data:")
print(transactions.head())

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [7]:
# Convert date columns to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [11]:
# Aggregate Transaction Data
customer_summary = transactions.groupby('CustomerID').agg(
    total_purchases=('TransactionID', 'count'),
    total_spending=('TotalValue', 'sum'),
    avg_order_value=('TotalValue', 'mean'),
    product_diversity=('ProductID', pd.Series.nunique),
    last_purchase=('TransactionDate', 'max')
).reset_index()


Unnamed: 0,CustomerID,total_purchases,total_spending,avg_order_value,product_diversity,last_purchase
0,C0001,5,3354.52,670.904000,5,2024-11-02 17:04:16
1,C0002,4,1862.74,465.685000,4,2024-12-03 01:41:41
2,C0003,4,2725.38,681.345000,4,2024-08-24 18:54:04
3,C0004,8,5354.88,669.360000,8,2024-12-23 14:13:52
4,C0005,3,2034.24,678.080000,3,2024-11-04 00:30:22
...,...,...,...,...,...,...
194,C0196,4,4982.88,1245.720000,3,2024-12-15 03:43:35
195,C0197,3,1928.65,642.883333,3,2024-12-27 18:20:31
196,C0198,2,931.83,465.915000,2,2024-10-04 18:31:12
197,C0199,4,1979.28,494.820000,4,2024-10-26 00:01:58


In [13]:
# Merge with Customers Data
customer_summary = customer_summary.merge(customers, on='CustomerID', how='left')
customer_summary['signup_age'] = (pd.to_datetime('today') - customer_summary['SignupDate']).dt.days
customer_summary['recency'] = (pd.to_datetime('today') - customer_summary['last_purchase']).dt.days
customer_summary.drop(columns=['SignupDate', 'last_purchase'], inplace=True)
customer_summary

Unnamed: 0,CustomerID,total_purchases,total_spending,avg_order_value,product_diversity,CustomerName,Region,signup_age,recency
0,C0001,5,3354.52,670.904000,5,Lawrence Carroll,South America,934,87
1,C0002,4,1862.74,465.685000,4,Elizabeth Lutz,Asia,1081,57
2,C0003,4,2725.38,681.345000,4,Michael Rivera,South America,328,157
3,C0004,8,5354.88,669.360000,8,Kathleen Rodriguez,South America,843,37
4,C0005,3,2034.24,678.080000,3,Laura Weber,Asia,898,86
...,...,...,...,...,...,...,...,...,...
194,C0196,4,4982.88,1245.720000,3,Laura Watts,Europe,967,45
195,C0197,3,1928.65,642.883333,3,Christina Harvey,Europe,680,32
196,C0198,2,931.83,465.915000,2,Rebecca Ray,Europe,1067,116
197,C0199,4,1979.28,494.820000,4,Andrea Jenkins,Europe,788,95


In [37]:
# Ensure Customer IDs are of the same type
customers['CustomerID'] = customers['CustomerID'].astype(str)
customer_summary['CustomerID'] = customer_summary['CustomerID'].astype(str)

In [39]:
# Normalize Features
features = ['total_purchases', 'total_spending', 'avg_order_value', 'product_diversity', 'signup_age', 'recency']
scaler = MinMaxScaler()

In [41]:
# Handle missing values before normalization
customer_summary.dropna(subset=features, inplace=True)
customer_summary[features] = customer_summary[features].replace([np.inf, -np.inf], np.nan).fillna(0)
customer_summary[features] = scaler.fit_transform(customer_summary[features])

In [43]:
# Check if customer_summary has valid data
print(customer_summary.head())

  CustomerID  total_purchases  total_spending  avg_order_value  \
0      C0001              0.4        0.308942         0.474336   
1      C0002              0.3        0.168095         0.308940   
2      C0003              0.3        0.249541         0.482751   
3      C0004              0.7        0.497806         0.473092   
4      C0005              0.2        0.184287         0.480120   

   product_diversity        CustomerName         Region  signup_age   recency  
0           0.444444    Lawrence Carroll  South America    0.842204  0.152778  
1           0.333333      Elizabeth Lutz           Asia    0.979458  0.069444  
2           0.333333      Michael Rivera  South America    0.276377  0.347222  
3           0.777778  Kathleen Rodriguez  South America    0.757236  0.013889  
4           0.222222         Laura Weber           Asia    0.808590  0.150000  


In [45]:
# Build Lookalike Model using Nearest Neighbors
if not customer_summary.empty:
    knn = NearestNeighbors(n_neighbors=min(4, len(customer_summary)), metric='cosine')
    knn.fit(customer_summary[features])
else:
    print("Error: customer_summary is empty after preprocessing.")

In [47]:
# Find Lookalikes for First 20 Customers
lookalike_data = {}
for cust_id in customers['CustomerID'][:min(20, len(customers))]:
    if cust_id in customer_summary['CustomerID'].values:
        cust_index = customer_summary[customer_summary['CustomerID'] == cust_id].index[0]
        distances, indices = knn.kneighbors(customer_summary.loc[[cust_index], features])
        similar_customers = customer_summary.iloc[indices[0][1:]]  # Exclude itself
        scores = 1 - distances[0][1:]  # Convert distance to similarity
        lookalike_data[cust_id] = list(zip(similar_customers['CustomerID'], scores))

In [49]:
# Check if lookalike_data is populated
print("Lookalike Data Size:", len(lookalike_data))

Lookalike Data Size: 20


In [51]:
# Save to CSV only if data exists
if lookalike_data:
    lookalike_df = pd.DataFrame([(k, v[0], v[1]) for k, vals in lookalike_data.items() for v in vals], 
                                 columns=['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score'])
    lookalike_df.to_csv('Lookalike.csv', index=False)
    print("Lookalike Model Completed. Lookalike.csv generated successfully.")
else:
    print("Error: No lookalike data generated. Check input datasets.")

Lookalike Model Completed. Lookalike.csv generated successfully.
