In [20]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


# Load the three raw tables (customers, transactions, products) from CSV
# files in the current working directory, in that order.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)


In [10]:
# Attach customer and product attributes to every transaction. Both joins
# are left joins, so each transaction row is kept even when its CustomerID
# or ProductID has no match in the lookup table.
data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

In [23]:
# Join every transaction to its customer and product attributes. Left joins
# keep transactions whose customer or product is missing from the lookup
# tables; the unmatched attributes are left as NaN.
data = transactions.merge(customers, on='CustomerID', how='left')
data = data.merge(products, on='ProductID', how='left')


def _most_frequent(values):
    """Return the most frequent non-null value, or NaN when there is none.

    ``Series.mode()`` ignores nulls and returns an empty Series for an
    all-null group, so indexing its result unconditionally would raise.
    When several values tie, the first entry of the ``mode()`` result is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float('nan')


# One row per customer: spend totals, transaction count, average basket
# value, most frequently purchased category, and the customer's region.
customer_features = data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    favorite_category=('Category', _most_frequent),
    region=('Region', 'first')
).reset_index()

In [24]:
# One-hot encode the two categorical columns so that every feature is numeric.
# NOTE: this rebinds `customer_features`; running the cell a second time
# fails because the original categorical columns no longer exist.
customer_features = pd.get_dummies(
    customer_features,
    columns=['favorite_category', 'region'],
    prefix=['category', 'region'],
)

# Standardise every feature column. CustomerID (the first column) is an
# identifier rather than a feature, so it is left out of the scaler input.
feature_frame = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

In [25]:
# Pairwise cosine similarity between customers' scaled feature vectors,
# labelled with CustomerID on both axes.
similarity_matrix = cosine_similarity(scaled_features)
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# For each customer, keep the three highest-scoring *other* customers as
# (CustomerID, score) tuples; the customer's own entry is dropped first.
lookalikes = {}
for customer_id, scores in similarity_df.iterrows():
    top_matches = scores.drop(customer_id).nlargest(3)
    lookalikes[customer_id] = list(zip(top_matches.index, top_matches.values))

In [26]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair.
lookalike_results = [
    {
        'CustomerID': customer,
        'LookalikeID': similar_customer,
        'SimilarityScore': score,
    }
    for customer, similar_list in lookalikes.items()
    for similar_customer, score in similar_list
]

# Persist the flattened table without the DataFrame index column.
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike results saved to Lookalike.csv")


Lookalike results saved to Lookalike.csv


In [27]:
# Build one record per (customer, lookalike) pair that carries the similarity
# score together with the full feature row of each side.
similar_pairs = []
for customer, similar_list in lookalikes.items():
    # The anchor customer's rows are the same for all of its lookalikes,
    # so select them once per customer.
    anchor_rows = customer_features[customer_features['CustomerID'] == customer]
    for similar_customer, score in similar_list:
        match_rows = customer_features[
            customer_features['CustomerID'] == similar_customer
        ]
        similar_pairs.append({
            'CustomerID_1': customer,
            'CustomerID_2': similar_customer,
            'SimilarityScore': score,
            # A fresh dict per pair, taken from the first matching row.
            'Details_1': anchor_rows.iloc[0].to_dict(),
            'Details_2': match_rows.iloc[0].to_dict(),
        })

In [30]:
# Interactive lookup: prompt for a CustomerID and print its three most
# similar customers. Surrounding whitespace is stripped so that an ID typed
# with a stray leading or trailing space still matches the index.
customer_id_input = input("Enter CustomerID to find lookalikes: ").strip()
if customer_id_input not in similarity_df.index:
    print("CustomerID not found in the dataset!")
else:
    # Drop the customer's own entry before ranking the remaining scores.
    similar_customers = similarity_df.loc[customer_id_input].drop(customer_id_input).nlargest(3)
    print(f"Top 3 Lookalikes for Customer {customer_id_input}:")
    for idx, (cust_id, score) in enumerate(similar_customers.items(), start=1):
        print(f"{idx}. CustomerID: {cust_id}, Similarity Score: {score:.3f}")


Enter CustomerID to find lookalikes: C0019
Top 3 Lookalikes for Customer C0019:
1. CustomerID: C0121, Similarity Score: 0.939
2. CustomerID: C0081, Similarity Score: 0.920
3. CustomerID: C0132, Similarity Score: 0.831
