In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("dataset/dataset_merged.csv", parse_dates=['TransactionDate', 'SignupDate'])

In [3]:
customer_category_spend = df.groupby(['CustomerID', 'Category'])['TotalValue'].sum().unstack(fill_value=0)

In [4]:
customer_category_spend

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,114.60,0.00,2827.30,412.62
C0002,0.00,1025.46,0.00,837.28
C0003,0.00,122.36,1385.20,1217.82
C0004,1888.48,0.00,1355.74,2110.66
C0005,0.00,0.00,1180.38,853.86
...,...,...,...,...
C0196,1310.67,1585.36,0.00,2086.85
C0197,0.00,0.00,914.92,1013.73
C0198,0.00,904.84,26.99,0.00
C0199,0.00,0.00,594.38,1384.90


In [5]:
region_dummies = pd.get_dummies(df.drop_duplicates(subset=['CustomerID'])[['CustomerID', 'Region']], 
                                columns=['Region']).set_index('CustomerID')
region_dummies

Unnamed: 0_level_0,Region_Asia,Region_Europe,Region_North America,Region_South America
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0199,False,True,False,False
C0146,True,False,False,False
C0127,False,True,False,False
C0087,False,False,False,True
C0070,False,True,False,False
...,...,...,...,...
C0058,False,False,True,False
C0095,False,False,False,True
C0151,False,False,False,True
C0078,True,False,False,False


In [6]:
features = customer_category_spend.join(region_dummies, how='left')

In [7]:
features

Unnamed: 0_level_0,Books,Clothing,Electronics,Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C0001,114.60,0.00,2827.30,412.62,False,False,False,True
C0002,0.00,1025.46,0.00,837.28,True,False,False,False
C0003,0.00,122.36,1385.20,1217.82,False,False,False,True
C0004,1888.48,0.00,1355.74,2110.66,False,False,False,True
C0005,0.00,0.00,1180.38,853.86,True,False,False,False
...,...,...,...,...,...,...,...,...
C0196,1310.67,1585.36,0.00,2086.85,False,True,False,False
C0197,0.00,0.00,914.92,1013.73,False,True,False,False
C0198,0.00,904.84,26.99,0.00,False,True,False,False
C0199,0.00,0.00,594.38,1384.90,False,True,False,False


In [8]:
features.isna().sum()

Books                   0
Clothing                0
Electronics             0
Home Decor              0
Region_Asia             0
Region_Europe           0
Region_North America    0
Region_South America    0
dtype: int64

In [9]:
feature_matrix = features.values

In [10]:
feature_matrix

array([[114.6, 0.0, 2827.3, ..., False, False, True],
       [0.0, 1025.46, 0.0, ..., False, False, False],
       [0.0, 122.36, 1385.2, ..., False, False, True],
       ...,
       [0.0, 904.84, 26.99, ..., True, False, False],
       [0.0, 0.0, 594.38, ..., True, False, False],
       [1665.6, 2225.46, 239.7, ..., False, False, False]],
      shape=(199, 8), dtype=object)

In [11]:
def dot_product(v1, v2):
    dp = 0
    for x, y in zip(v1, v2):
        dp += x * y
    return dp

def vector_norm(v):
    total = 0
    for x in v:
        total += x**2
    return total**0.5

def cosine_similarity(v1, v2):
    dp = dot_product(v1, v2)
    norm_v1 = vector_norm(v1)
    norm_v2 = vector_norm(v2)
    
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0
    
    return dp / (norm_v1 * norm_v2)

In [12]:
num_customers = feature_matrix.shape[0]
similarity_matrix = np.zeros((num_customers, num_customers))

In [13]:
for i in range(num_customers):
    for j in range(num_customers):
        if i == j:
            similarity_matrix[i][j] = 1.0  # 1 because with itself
        else:
            similarity_matrix[i][j] = cosine_similarity(feature_matrix[i], feature_matrix[j])

In [14]:
customer_ids = features.index.tolist()

In [15]:
lookalike_map = {}

for cust_id in ['C{:04d}'.format(i) for i in range(1, 21)]:  # "C0001" to "C0020"
    if cust_id not in customer_ids:
        lookalike_map[cust_id] = []
        continue

    idx = customer_ids.index(cust_id)
    
    # Extract their similarity scores
    scores = similarity_matrix[idx]
    sorted_scores = sorted(
        [(customer_ids[i], s) for i, s in enumerate(scores) if i != idx],
        key=lambda x: x[1],
        reverse=True
    )
    top_3 = sorted_scores[:3]
    lookalike_map[cust_id] = top_3

In [16]:
lookalike_map

{'C0001': [('C0140', np.float64(0.9969870705155506)),
  ('C0091', np.float64(0.9936719959001256)),
  ('C0069', np.float64(0.9913898123170012))],
 'C0002': [('C0134', np.float64(0.9988524848995728)),
  ('C0143', np.float64(0.9984281894152102)),
  ('C0159', np.float64(0.9905512052800601))],
 'C0003': [('C0007', np.float64(0.997757651887942)),
  ('C0163', np.float64(0.9976789604264621)),
  ('C0005', np.float64(0.993312702863571))],
 'C0004': [('C0075', np.float64(0.9943484807181324)),
  ('C0146', np.float64(0.9881682358538834)),
  ('C0090', np.float64(0.9757451550895494))],
 'C0005': [('C0163', np.float64(0.9968853577462633)),
  ('C0007', np.float64(0.994510948615991)),
  ('C0003', np.float64(0.993312702863571))],
 'C0006': [('C0185', np.float64(0.9934704146817561)),
  ('C0169', np.float64(0.9919864846415722)),
  ('C0081', np.float64(0.9901268534243352))],
 'C0007': [('C0163', np.float64(0.999664915876105)),
  ('C0003', np.float64(0.997757651887942)),
  ('C0005', np.float64(0.994510948615

In [17]:
output_rows = []
for cust, lookalikes in lookalike_map.items():
    output_rows.append({
        'CustomerID': cust,
        'Lookalikes': str(lookalikes)
    })

In [18]:
output_rows

[{'CustomerID': 'C0001',
  'Lookalikes': "[('C0140', np.float64(0.9969870705155506)), ('C0091', np.float64(0.9936719959001256)), ('C0069', np.float64(0.9913898123170012))]"},
 {'CustomerID': 'C0002',
  'Lookalikes': "[('C0134', np.float64(0.9988524848995728)), ('C0143', np.float64(0.9984281894152102)), ('C0159', np.float64(0.9905512052800601))]"},
 {'CustomerID': 'C0003',
  'Lookalikes': "[('C0007', np.float64(0.997757651887942)), ('C0163', np.float64(0.9976789604264621)), ('C0005', np.float64(0.993312702863571))]"},
 {'CustomerID': 'C0004',
  'Lookalikes': "[('C0075', np.float64(0.9943484807181324)), ('C0146', np.float64(0.9881682358538834)), ('C0090', np.float64(0.9757451550895494))]"},
 {'CustomerID': 'C0005',
  'Lookalikes': "[('C0163', np.float64(0.9968853577462633)), ('C0007', np.float64(0.994510948615991)), ('C0003', np.float64(0.993312702863571))]"},
 {'CustomerID': 'C0006',
  'Lookalikes': "[('C0185', np.float64(0.9934704146817561)), ('C0169', np.float64(0.9919864846415722)), 

In [19]:
lookalike_df = pd.DataFrame(output_rows)
lookalike_df.to_csv("Sanyog_Mishra_Lookalike.csv", index=False)