In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import numpy as np

In [2]:
# Read the three input tables from CSV files located in the working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Per-customer aggregates: number of transactions, sum/mean/std of TotalValue,
# and sum/mean of Quantity. Because some entries list several statistics, the
# result carries two-level (source column, statistic) column labels.
transaction_features = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        dict(
            TransactionID='count',
            TotalValue=['sum', 'mean', 'std'],
            Quantity=['sum', 'mean'],
        )
    )
)

In [4]:
# Flatten the two-level (source column, statistic) labels into single strings
# such as "TotalValue_sum". The MultiIndex guard keeps this cell safe to re-run:
# once flattened, the labels are plain strings, and indexing them as
# col[0]/col[1] would silently rebuild every name from its first two characters.
if isinstance(transaction_features.columns, pd.MultiIndex):
    transaction_features.columns = [
        f"{col[0]}_{col[1]}" for col in transaction_features.columns
    ]


In [5]:
# Join transactions to products on ProductID, then total TotalValue for every
# (customer, category) pair; pairs with no transactions are filled with 0.
category_features = (
    transactions_df
    .merge(products_df, on='ProductID')
    .pivot_table(
        values='TotalValue',
        index='CustomerID',
        columns='Category',
        aggfunc='sum',
        fill_value=0,
    )
)

In [6]:
# Put the aggregate columns and the per-category columns side by side, aligned
# on the index; every value still missing afterwards is replaced with 0.
feature_blocks = [transaction_features, category_features]
features = pd.concat(feature_blocks, axis='columns').fillna(0)

In [7]:
# Fit the scaler on the feature table, then standardize that same table.
scaler = StandardScaler()
scaler.fit(features)
normalized_features = scaler.transform(features)

In [8]:
# Customer IDs to generate lookalikes for: 'C0001' through 'C0020'.
target_ids = ['C{:04d}'.format(customer_number) for customer_number in range(1, 21)]
# Maps a target customer ID to its list of (lookalike ID, similarity) pairs.
lookalikes = dict()

In [9]:
# For each target customer, score every customer by cosine similarity of the
# standardized feature vectors and keep up to three best matches (excluding the
# customer itself) as (customer ID, similarity rounded to 3 decimals) pairs.
for cust_id in target_ids:
    # Targets that have no row in the feature table are skipped.
    if cust_id not in features.index:
        continue
    # Positional row of this customer; looked up once and reused below.
    cust_pos = features.index.get_loc(cust_id)
    cust_vector = normalized_features[cust_pos].reshape(1, -1)
    # cdist yields cosine distance, so 1 - distance is cosine similarity.
    similarities = 1 - cdist(cust_vector, normalized_features, metric='cosine').flatten()
    # Cosine distance is undefined for a zero-norm row and can come back as NaN.
    # argsort places NaN last, which would rank it as the best match, so demote it.
    similarities[np.isnan(similarities)] = -np.inf
    # A customer must never be reported as its own lookalike.
    similarities[cust_pos] = -np.inf
    # Top three by similarity, best first. Entries demoted to -inf are dropped so
    # the sentinel never leaks into the output when fewer than three valid
    # candidates exist.
    top_indices = [
        idx for idx in similarities.argsort()[-3:][::-1]
        if np.isfinite(similarities[idx])
    ]
    lookalikes[cust_id] = [
        (features.index[idx], round(float(similarities[idx]), 3)) for idx in top_indices
    ]

In [10]:
# One output row per target customer. Its matches are serialized as "id,score"
# pairs joined with ';'.
lookalike_rows = [
    {
        'CustomerID': target_id,
        'Lookalikes': ';'.join(f"{match_id},{score}" for match_id, score in matches),
    }
    for target_id, matches in lookalikes.items()
]

In [11]:
# Write the lookalike table to disk. Naming the columns explicitly keeps the CSV
# header present even when lookalike_rows is empty: a frame built from an empty
# list has no columns, so the file would otherwise be written without a header.
pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes']).to_csv('Lookalike.csv', index=False)

print("Lookalike model complete. Generated 'Lookalike.csv'.")

Lookalike model complete. Generated 'Lookalike.csv'.
