In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three raw tables from CSV files in the working directory.
# The tuple order below must match the order of the names being unpacked.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Build one row per transaction, enriched first with the customer's columns
# and then with the product's columns. Both merges use pandas' default
# (inner) join, so rows without a matching key on either side are dropped.
df = transactions.merge(customers, on='CustomerID')
df = df.merge(products, on='ProductID')

In [4]:
# Collapse the transaction-level frame to one row per customer:
#   * total and average spend (TotalValue) and units (Quantity),
#   * number of transactions (count of TransactionDate),
#   * most frequent product category (first value returned by mode()),
#   * the first Region value seen for the customer.
# The category aggregator must stay a lambda: its '<lambda>' label becomes
# part of the flattened column name used by later cells.
agg_features = df.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],
    'Quantity': ['sum', 'mean'],
    'TransactionDate': ['count'],
    'Category': lambda categories: categories.mode()[0],
    'Region': 'first',
})

# Flatten the two-level column labels into '<column>_<aggregation>' strings.
agg_features.columns = agg_features.columns.map('_'.join)

# Move CustomerID out of the index and back into a regular column.
agg_features = agg_features.reset_index()

In [5]:
# Replace the two text columns with integer codes, keeping each fitted
# encoder in `label_encoders` (keyed by column name) so codes can be mapped
# back to the original labels later. The codes are plain identifiers and
# carry no ordinal meaning.
label_encoders = {}
categorical_cols = ['Category_<lambda>', 'Region_first']
for name in categorical_cols:
    encoder = LabelEncoder().fit(agg_features[name])
    agg_features[name] = encoder.transform(agg_features[name])
    label_encoders[name] = encoder

In [6]:
# Standardise the numeric aggregates with StandardScaler and write the
# scaled values back over the original columns. The fitted `scaler` is kept
# at module level for reuse.
numeric_cols = ['TotalValue_sum', 'TotalValue_mean', 'Quantity_sum', 'Quantity_mean', 'TransactionDate_count']
scaler = StandardScaler()
scaled_values = scaler.fit(agg_features[numeric_cols]).transform(agg_features[numeric_cols])
agg_features[numeric_cols] = scaled_values

In [7]:
# Build the per-customer feature matrix (rows indexed by CustomerID, in the
# same row order as `agg_features`) and score every pair of customers.
feature_matrix = agg_features.set_index('CustomerID')

# The category and region columns hold label-encoded integers. Those codes
# are arbitrary identifiers, not magnitudes: fed directly into a similarity
# metric they impose a fake ordering (code 3 looks "further" from 0 than 1
# does) and, being unscaled, can outweigh the standardised numeric features.
# Expanding them into 0/1 indicator columns makes two customers agree on a
# category/region only when the labels are actually equal.
# dtype=float keeps the whole matrix numeric regardless of pandas' default
# dummy dtype.
feature_matrix = pd.get_dummies(
    feature_matrix,
    columns=['Category_<lambda>', 'Region_first'],
    dtype=float,
)

# similarity_matrix[i][j] is the cosine similarity between the customers at
# row positions i and j of `feature_matrix`.
similarity_matrix = cosine_similarity(feature_matrix)

In [8]:
# Find top 3 similar customers for first 20 customers.
# `lookalike_map` maps each CustomerID to a list of
# (lookalike CustomerID, similarity rounded to 2 decimals) tuples, ordered
# from most to least similar.
customer_ids = agg_features['CustomerID'].head(20)
lookalike_map = {}
for idx, customer in enumerate(customer_ids):
    # `idx` is the row position of this customer in both `agg_features` and
    # `similarity_matrix`, since head() keeps the leading rows in order.
    # Exclude the customer's own entry by position rather than assuming it
    # sorts first: another customer with an identical feature vector (or
    # floating-point rounding of the self-similarity) can tie with or outrank
    # it, which would otherwise list a customer as their own lookalike.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_matrix[idx])
        if position != idx
    ]
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    # Convert scores to built-in floats so that later string/CSV
    # serialisation shows plain numbers (e.g. 0.98) instead of a NumPy scalar
    # repr such as `np.float64(0.98)`.
    lookalike_map[customer] = [
        (agg_features['CustomerID'].iloc[position], round(float(score), 2))
        for position, score in top_matches
    ]


In [11]:
# Turn the lookalike mapping into a two-column table: one row per customer,
# with that customer's list of (lookalike, score) tuples stored as its
# string representation. Then write the table to CSV without the row index.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map),
    'Lookalikes': list(map(str, lookalike_map.values())),
})
lookalike_df.to_csv('SDheeraj_Reddy_Lookalike.csv', index=False)