In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

  from pandas.core import (


Loading the dataset

In [2]:
# Read the three raw tables (customers, products, transactions) in one pass;
# the order of the paths matches the order of the names being assigned.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/Customers.csv',
        'dataset/Products.csv',
        'dataset/Transactions.csv',
    )
)

In [3]:
# Build one wide table: every transaction row enriched with its customer's
# attributes (joined on CustomerID) and its product's attributes (joined on
# ProductID). Both joins use pandas' default merge behaviour.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
processed_transactions = pd.merge(transactions_with_customers, products, on='ProductID')

Aggregate transaction data for each customer

In [9]:
# Create customer profiles: one row per CustomerID with total spend, total
# quantity, mean of the Price_x column, and a comma-separated list of the
# distinct categories the customer bought from (in order of first appearance).
customer_profiles = (
    processed_transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price_x=('Price_x', 'mean'),
        Category=('Category', lambda categories: ','.join(categories.astype(str).unique())),
    )
    .reset_index()
)

print(customer_profiles.head())

  CustomerID  TotalValue  Quantity     Price_x  \
0      C0001     3354.52        12  278.334000   
1      C0002     1862.74        10  208.920000   
2      C0003     2725.38        14  195.707500   
3      C0004     5354.88        23  240.636250   
4      C0005     2034.24         7  291.603333   

                          Category  
0     Books,Home Decor,Electronics  
1              Home Decor,Clothing  
2  Home Decor,Clothing,Electronics  
3     Books,Home Decor,Electronics  
4           Home Decor,Electronics  


In [10]:
# Select numerical features
numerical_features = ['TotalValue', 'Quantity', 'Price_x']

# Create the scaler in this cell. It was previously referenced without being
# defined anywhere in the notebook, so the cell only worked with leftover
# kernel state and raised NameError under "Restart Kernel -> Run All".
scaler = StandardScaler()

# Standardize the numerical features so that no single feature dominates the
# similarity computation purely because of its scale. Note that this
# overwrites the raw values in customer_profiles with the scaled ones.
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

print(customer_profiles.head())

  CustomerID  TotalValue  Quantity   Price_x                         Category
0      C0001   -0.061701 -0.122033  0.094670     Books,Home Decor,Electronics
1      C0002   -0.877744 -0.448000 -0.904016              Home Decor,Clothing
2      C0003   -0.405857  0.203934 -1.094109  Home Decor,Clothing,Electronics
3      C0004    1.032547  1.670787 -0.447702     Books,Home Decor,Electronics
4      C0005   -0.783929 -0.936951  0.285581           Home Decor,Electronics


In [11]:
# Pairwise cosine similarity between customers, computed on the numerical
# profile columns only.
profile_features = customer_profiles[numerical_features]
cosine_sim = cosine_similarity(profile_features)

# Label both axes with CustomerID so the matrix can be queried by customer.
customer_id_labels = customer_profiles['CustomerID']
cosine_sim_df = pd.DataFrame(
    cosine_sim,
    index=customer_id_labels,
    columns=customer_id_labels,
)

print(cosine_sim_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.104513 -0.524923 -0.925208  0.909351  0.442395   
C0002       0.104513  1.000000  0.791531 -0.464035  0.506433 -0.844066   
C0003      -0.524923  0.791531  1.000000  0.172432 -0.124725 -0.994780   
C0004      -0.925208 -0.464035  0.172432  1.000000 -0.990272 -0.083333   
C0005       0.909351  0.506433 -0.124725 -0.990272  1.000000  0.029596   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.957854 -0.980620  0.885035 -0.268370  ...  0.953552  0.875392   
C0002      -0.126391 -0.208586  0.552510  0.929885  ...  0.366172  0.561020   
C0003      -0.694381  0.426063 -0.070251  0.960431  ... -0.270712 -0.056387   
C0004      -0.786871  0.960972 -0.985116 -0.108724  ... -0.969254 -0.975266   
C0005  

Top 3 most similar customers for each of the first 20 customers

In [15]:
# Extract the top 3 similar customers with their similarity scores for the first 20 customers
top_3_lookalikes_with_scores = {}
for customer_id in customer_profiles['CustomerID'][:20]:
    # Remove the customer's own entry by label before ranking. Skipping the
    # first result of nlargest(4) is unsafe: if another customer ties at the
    # top score and appears earlier in the index, that customer is ranked
    # first and the target customer would be reported as its own lookalike.
    similar_customers = cosine_sim_df.loc[customer_id].drop(labels=customer_id).nlargest(3)
    # Store scores as built-in floats so the stringified list written to the
    # CSV does not depend on how the installed NumPy version prints scalars.
    top_3_lookalikes_with_scores[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]


# One row per customer; the list of (CustomerID, score) pairs is stored as its
# string representation in a single column.
lookalike_with_scores_df = pd.DataFrame({
    'CustomerID': list(top_3_lookalikes_with_scores.keys()),
    'SimilarCustomersWithScores': [str(v) for v in top_3_lookalikes_with_scores.values()]
})

# Save the DataFrame to a CSV file
lookalike_with_scores_df.to_csv('Lookalike_with_scores.csv', index=False)

lookalike_with_scores_df.head(20)

Unnamed: 0,CustomerID,SimilarCustomersWithScores
0,C0001,"[('C0103', 0.9975729385618538), ('C0092', 0.99..."
1,C0002,"[('C0029', 0.9998543931340029), ('C0077', 0.99..."
2,C0003,"[('C0111', 0.9984874468302141), ('C0190', 0.99..."
3,C0004,"[('C0165', 0.9983897071764074), ('C0162', 0.99..."
4,C0005,"[('C0167', 0.9999721868436701), ('C0020', 0.99..."
5,C0006,"[('C0168', 0.9976122332196319), ('C0196', 0.99..."
6,C0007,"[('C0125', 0.9998486580402707), ('C0089', 0.99..."
7,C0008,"[('C0084', 0.9960866913262758), ('C0113', 0.99..."
8,C0009,"[('C0130', 0.9999651017117013), ('C0128', 0.99..."
9,C0010,"[('C0176', 0.9994511608148322), ('C0055', 0.99..."


Example Recommendation
Recommend products to a customer based on what the most similar customers (by similarity score) have bought.

In [16]:
def recommend_products(customer_id, top_n=5):
    """Recommend products bought by the customers most similar to ``customer_id``.

    Reads the notebook-level ``cosine_sim_df`` (customer-by-customer
    similarity matrix) and ``processed_transactions`` (merged transaction
    table with ``CustomerID``, ``ProductID`` and ``ProductName`` columns).

    Parameters
    ----------
    customer_id : label
        Customer to recommend for; must be a column label of ``cosine_sim_df``.
    top_n : int, default 5
        Number of most-similar customers whose purchases are considered.

    Returns
    -------
    numpy.ndarray
        Unique product names purchased by the similar customers but not by
        ``customer_id``, in order of first appearance in the transactions.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``cosine_sim_df``.
    """
    # Get the top similar customers. The target customer is removed by label
    # rather than by skipping the first nlargest() result: with tied scores
    # another customer can be ranked first, in which case positional skipping
    # would keep the target itself among its "similar" customers.
    similarity_to_others = cosine_sim_df[customer_id].drop(labels=customer_id)
    similar_customers = similarity_to_others.nlargest(top_n).index

    # Get the products purchased by the similar customers
    similar_customers_transactions = processed_transactions[
        processed_transactions['CustomerID'].isin(similar_customers)
    ]

    # Get the products purchased by the target customer
    customer_transactions = processed_transactions[processed_transactions['CustomerID'] == customer_id]

    # Recommend products that similar customers have purchased but the target customer has not
    already_purchased = similar_customers_transactions['ProductID'].isin(customer_transactions['ProductID'])
    recommended_products = similar_customers_transactions.loc[~already_purchased, 'ProductName'].unique()

    return recommended_products

# Demonstrate the recommender on a single customer, 'C0020', using the
# function's default number of similar customers.
recommendations = recommend_products(customer_id='C0020')
print(recommendations)

['ComfortLiving Bluetooth Speaker' 'TechPro Running Shoes'
 'ActiveWear Cookware Set' 'HomeSense T-Shirt' 'HomeSense Desk Lamp'
 'ComfortLiving Cookware Set' 'ActiveWear Smartwatch' 'TechPro Smartwatch'
 'SoundWave Jeans' 'TechPro Cookbook' 'ComfortLiving Headphones']
