In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw customer and transaction tables from the working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Report the per-column null counts of both inputs.
print("Missing values in customer data:")
print(customers.isnull().sum())

print("\nMissing values in transaction data:")
print(transactions.isnull().sum())

# Collapse the transactions into one row of purchase metrics per customer.
agg_transactions = (
    transactions
    .groupby('CustomerID')
    .agg(
        total_spent=('TotalValue', 'sum'),
        total_quantity=('Quantity', 'sum'),
        transaction_count=('TransactionID', 'nunique'),
        avg_price=('Price', 'mean'),
    )
    .reset_index()
)

# Left join: customers without any transaction stay in the table (with NaN
# metrics, which are zero-filled below).
customer_data = customers.merge(agg_transactions, on='CustomerID', how='left')

# Account age in whole days, measured from the signup date to the run date.
customer_data['SignupDate'] = pd.to_datetime(customer_data['SignupDate'])
account_age = pd.to_datetime('today') - customer_data['SignupDate']
customer_data['AccountAge'] = account_age.dt.days

# Customers with no purchases get 0 for every purchase metric.
purchase_metrics = ['total_spent', 'total_quantity', 'transaction_count', 'avg_price']
customer_data = customer_data.fillna(value=dict.fromkeys(purchase_metrics, 0))

# One-hot encode the region; the first category is dropped and acts as the
# implicit baseline.
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Feature matrix: purchase metrics, account age, then the region indicators.
region_columns = [name for name in customer_data.columns if name.startswith('Region_')]
customer_features = customer_data[purchase_metrics + ['AccountAge'] + region_columns]

# Standardize each feature column before comparing customers.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features)

print("\nMissing values after filling and encoding:")
print(pd.DataFrame(normalized_features).isnull().sum())

# Pairwise cosine similarity between all customers, labelled by CustomerID on
# both axes so it can be indexed directly by customer ID.
cosine_sim = cosine_similarity(normalized_features)

customer_ids = customer_data['CustomerID']
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_ids, columns=customer_ids)

def get_top_n_lookalikes(customer_id, n=3, similarity_df=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up. It must be a column
            of the similarity matrix.
        n: Maximum number of lookalikes to return; must be non-negative.
        similarity_df: Similarity matrix as a DataFrame labelled by customer
            ID on both axes. When omitted, the module-level
            ``cosine_sim_df`` is used.

    Returns:
        A list of ``(other_customer_id, score)`` tuples ordered from the
        highest to the lowest score. The customer itself is never included.

    Raises:
        ValueError: If ``n`` is negative.
        KeyError: If ``customer_id`` is not present in the similarity matrix.
    """
    # A negative n would make ``head`` return "all but the last |n|" rows,
    # which is never what a top-n query means.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    matrix = cosine_sim_df if similarity_df is None else similarity_df
    if customer_id not in matrix.columns:
        raise KeyError(f"Unknown customer_id: {customer_id!r}")

    # Remove the customer's own entry so it cannot match itself.
    scores = matrix[customer_id].drop(customer_id)

    # A stable sort keeps tied scores in their original order, so repeated
    # runs give the same ranking.
    ranked = scores.sort_values(ascending=False, kind="mergesort")

    return list(ranked.head(n).items())

# Top matches for the first 20 rows of customer_data, keyed by customer ID.
lookalike_map = {
    customer: get_top_n_lookalikes(customer)
    for customer in customer_data['CustomerID'][:20]
}

# Flatten the mapping into one (customer, lookalike, score) row per match.
lookalike_rows = [
    [customer, match_id, similarity]
    for customer, matches in lookalike_map.items()
    for match_id, similarity in matches
]
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

# Persist the recommendations, then show a preview of the first rows.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("\nTop lookalike recommendations for customers C0001 to C0020:")
print(lookalike_df.head())


Missing values in customer data:
CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

Missing values in transaction data:
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64

Missing values after filling and encoding:
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64

Top lookalike recommendations for customers C0001 to C0020:
  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0152         0.983326
1      C0001               C0011         0.962587
2      C0001               C0118         0.944343
3      C0002               C0027         0.944131
4      C0002               C0159         0.918818
