In [1]:
import pandas as pd

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [2]:
pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [3]:
# Merge the data on CustomerID and ProductID
combined_data = pd.merge(transactions, customers, on='CustomerID')
combined_data = pd.merge(combined_data, products, on='ProductID')

In [6]:
# Calculate total spent by each customer
total_spent = combined_data.groupby('CustomerID')['TotalValue'].sum()
total_spent = total_spent.reset_index()

# Create additional features such as most purchased product categories
frequent_products = combined_data.groupby('CustomerID')['ProductID'].apply(lambda x: x.mode()[0])


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a matrix of customer features for similarity comparison
customer_features = pd.merge(total_spent, frequent_products, on='CustomerID')

# Compute cosine similarity with only numeric features
similarity_matrix = cosine_similarity(customer_features[['TotalValue']])


In [9]:
from sklearn.preprocessing import LabelEncoder

# Convert ProductID to numeric labels
label_encoder = LabelEncoder()
customer_features['ProductID'] = label_encoder.fit_transform(customer_features['ProductID'])

# Now you can use both TotalValue and ProductID
similarity_matrix = cosine_similarity(customer_features[['TotalValue', 'ProductID']])


In [10]:
# For each customer, find top 3 similar customers
for customer in range(20):  # For C0001 to C0020
    similar_customers = sorted(enumerate(similarity_matrix[customer]), key=lambda x: x[1], reverse=True)
    top_3_similar = similar_customers[1:4]  # Skip the first entry (the customer itself)
    print(f"Top 3 lookalikes for C{customer + 1}: {top_3_similar}")


Top 3 lookalikes for C1: [(195, 0.9999999992688557), (97, 0.9999999973450451), (156, 0.9999999907476499)]
Top 3 lookalikes for C2: [(99, 0.9999999999323259), (40, 0.9999999998791972), (161, 0.9999999984677249)]
Top 3 lookalikes for C3: [(152, 0.9999999985097667), (179, 0.9999999976743916), (168, 0.9999999972429899)]
Top 3 lookalikes for C4: [(73, 0.9999999999979683), (138, 0.9999999990838956), (180, 0.9999999951797378)]
Top 3 lookalikes for C5: [(157, 0.999999999660807), (17, 0.9999999995559676), (178, 0.9999999416780418)]
Top 3 lookalikes for C6: [(116, 0.9999999996649149), (53, 0.9999999900196751), (34, 0.9999999897506858)]
Top 3 lookalikes for C7: [(158, 0.9999998944395402), (41, 0.9999998831269665), (141, 0.9999996311608731)]
Top 3 lookalikes for C8: [(177, 0.9999999993514664), (103, 0.999999998337026), (134, 0.9999999979217612)]
Top 3 lookalikes for C9: [(82, 0.9999987539288441), (57, 0.9999943426746403), (31, 0.9999862931120519)]
Top 3 lookalikes for C10: [(132, 0.999999998289047

In [11]:
lookalike_data = pd.DataFrame({'CustomerID': ['C0001', 'C0002', 'C0003'], 'LookalikeCustomerID': ['C004', 'C005', 'C006'], 'SimilarityScore': [0.85, 0.80, 0.78]})
lookalike_data.to_csv('Lookalike.csv', index=False)
