In [40]:
# import library
import pandas as pd
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity


1. Customer Representation as Vectors:

- We represent each customer as a vector, which includes various dimensions:
 - Average Order Value per Category: This reflects how much a customer spends on each category of products.
 - Quantity Purchased in Each Category: This indicates how many items a customer has bought in a given category (the sum of `Quantity` over that customer's transactions in the category).
 - Days Taken for First Order: This represents how long it took a customer to make their first purchase after signing up.

The combination of these three attributes, together with the one-hot encoded customer region, forms the customer features vector.

2. Customer-Category Matrix:

- We also construct a customer-category matrix, where each row corresponds to a customer and each column corresponds to a category of products (e.g., electronics, clothing, etc.).
- If a customer has bought a product from a particular category, we assign a value of 1; otherwise, it is 0.
- This matrix reflects which categories each customer has engaged with and provides an idea of the customer's product interests.

3. Cosine Similarity Calculation:

- To measure the similarity between customers, we use cosine similarity. This technique computes how similar two customers are by looking at the angle between their vectors in the feature space.
- Weighted Similarity: To combine the two sources of information—the customer features and the customer-category matrix—we assign a weight to each:
 - 60% weight is given to the customer-category matrix, reflecting the customer's engagement with various product categories.
 - 40% weight is given to the customer features (average order value per category, quantity purchased per category, and days to first order), reflecting their purchasing behavior and timeline.
- By calculating cosine similarity for each customer with the given customer ID, we can find the most similar customers in terms of both purchasing behavior and product engagement.

In [2]:
# Load the product, customer and transaction tables from their CSV files.
product_df, customer_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Products.csv',
        '/content/Customers.csv',
        '/content/Transactions.csv',
    )
)


In [3]:
# One-hot encode Region and append the indicator columns to customer_df.
region_dummies = pd.get_dummies(customer_df['Region'], prefix='Region')
# Drop indicator columns left over from a previous run of this cell first, so
# re-running it replaces the Region_* columns instead of appending duplicates.
customer_df = pd.concat(
    [customer_df.drop(columns=region_dummies.columns, errors='ignore'), region_dummies],
    axis=1,
)

In [4]:
# Attach the product Category to every transaction; the left join keeps all
# transaction rows even when a ProductID has no match in product_df.
transactions_with_product = transactions_df.merge(
    product_df[['ProductID', 'Category']],
    how='left',
    on='ProductID',
)

In [5]:
# Preview the first rows of the transaction table with its Category column.
transactions_with_product.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Electronics


In [6]:
# Parse the date columns so they can be subtracted from one another.
customer_df['SignupDate'] = pd.to_datetime(customer_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Earliest transaction timestamp per customer.
first_order_date = transactions_df.groupby('CustomerID')['TransactionDate'].min().reset_index()

# Left join keeps customers without any transaction; their first-order date is NaT.
customer_first_order = pd.merge(customer_df[['CustomerID', 'SignupDate']], first_order_date, on='CustomerID', how='left')

# Whole days between signup and first transaction (NaN when there is no transaction).
# NOTE(review): the rendered data contains negative values (e.g. -18.0), i.e. a
# transaction dated before the signup date -- confirm whether that is expected.
customer_first_order['DaysToFirstOrder'] = (customer_first_order['TransactionDate'] - customer_first_order['SignupDate']).dt.days

# Drop a DaysToFirstOrder column left by a previous run of this cell so the merge
# stays idempotent instead of producing DaysToFirstOrder_x / DaysToFirstOrder_y.
customer_df = pd.merge(
    customer_df.drop(columns='DaysToFirstOrder', errors='ignore'),
    customer_first_order[['CustomerID', 'DaysToFirstOrder']],
    on='CustomerID',
    how='left',
)

In [23]:
# Total units bought per (customer, category) pair, in long format.
category_quantity_total = (
    transactions_with_product
    .groupby(['CustomerID', 'Category'], as_index=False)['Quantity']
    .sum()
)

# One row per customer, one column per category; a pair with no purchases becomes 0.
category_total_quantity = (
    category_quantity_total
    .pivot(index='CustomerID', columns='Category', values='Quantity')
    .fillna(0)
)


In [24]:
# Give the quantity columns a "Qty" suffix.
category_total_quantity = category_total_quantity.rename(
    columns={
        'Books': 'BooksQty',
        'Clothing': 'ClothingQty',
        'Electronics': 'ElectronicsQty',
        'Home Decor': 'HomeDecorQty',
    }
)

In [25]:
# Preview the first rows of the per-customer quantity-by-category table.
category_total_quantity.head()

Category,BooksQty,ClothingQty,ElectronicsQty,HomeDecorQty
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,2.0,0.0,7.0,3.0
C0002,0.0,4.0,0.0,6.0
C0003,0.0,4.0,4.0,6.0
C0004,8.0,0.0,6.0,9.0
C0005,0.0,0.0,4.0,3.0


In [26]:
# Rebuild the transaction/category join from the current transactions_df.
transactions_with_product = transactions_df.merge(
    product_df[['ProductID', 'Category']],
    how='left',
    on='ProductID',
)

# Mean TotalValue per (customer, category) pair, in long format.
category_order_value = (
    transactions_with_product
    .groupby(['CustomerID', 'Category'], as_index=False)['TotalValue']
    .mean()
)

# One row per customer, one column per category; a pair with no purchases becomes 0.
category_avg_order_value = (
    category_order_value
    .pivot(index='CustomerID', columns='Category', values='TotalValue')
    .fillna(0)
)


In [19]:
# Preview the first rows of the per-customer average-order-value table.
category_avg_order_value.head()

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,114.6,0.0,942.433333,412.62
C0002,0.0,512.73,0.0,418.64
C0003,0.0,122.36,1385.2,608.91
C0004,629.493333,0.0,677.87,703.553333
C0005,0.0,0.0,590.19,853.86


In [30]:
# Join the per-category average order values and quantities onto the customer table.
# NOTE(review): the last merge joins customer_df onto itself, which is what creates
# the duplicated *_x / *_y columns visible in the preview below.
customer_features = (
    customer_df
    .merge(category_avg_order_value, on='CustomerID', how='left')
    .merge(category_total_quantity, on='CustomerID', how='left')
    .merge(customer_df, on='CustomerID', how='left')
)

customer_features.head()


Unnamed: 0,CustomerID,CustomerName_x,Region_x,SignupDate_x,Region_Asia_x,Region_Europe_x,Region_North America_x,Region_South America_x,DaysToFirstOrder_x,Books,...,ElectronicsQty,HomeDecorQty,CustomerName_y,Region_y,SignupDate_y,Region_Asia_y,Region_Europe_y,Region_North America_y,Region_South America_y,DaysToFirstOrder_y
0,C0001,Lawrence Carroll,South America,2022-07-10,False,False,False,True,558.0,114.6,...,7.0,3.0,Lawrence Carroll,South America,2022-07-10,False,False,False,True,558.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,True,False,False,False,745.0,0.0,...,0.0,6.0,Elizabeth Lutz,Asia,2022-02-13,True,False,False,False,745.0
2,C0003,Michael Rivera,South America,2024-03-07,False,False,False,True,-18.0,0.0,...,4.0,6.0,Michael Rivera,South America,2024-03-07,False,False,False,True,-18.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,False,False,False,True,507.0,629.493333,...,6.0,9.0,Kathleen Rodriguez,South America,2022-10-09,False,False,False,True,507.0
4,C0005,Laura Weber,Asia,2022-08-15,True,False,False,False,578.0,0.0,...,4.0,3.0,Laura Weber,Asia,2022-08-15,True,False,False,False,578.0


In [32]:
# Remove the name/region/signup-date columns and every duplicated *_y column.
redundant_columns = [
    'CustomerName_x', 'Region_x', 'SignupDate_x',
    'CustomerName_y', 'Region_y', 'SignupDate_y',
    'Region_Asia_y', 'Region_Europe_y',
    'Region_North America_y', 'Region_South America_y',
    'DaysToFirstOrder_y',
]
customer_features = customer_features.drop(columns=redundant_columns)

In [35]:
# Cast the four region indicator columns to integers in a single astype call.
customer_features = customer_features.astype({
    'Region_Asia_x': 'int',
    'Region_South America_x': 'int',
    'Region_North America_x': 'int',
    'Region_Europe_x': 'int',
})


In [36]:
# Preview the first rows of customer_features.
customer_features.head()

Unnamed: 0,CustomerID,Region_Asia_x,Region_Europe_x,Region_North America_x,Region_South America_x,DaysToFirstOrder_x,Books,Clothing,Electronics,Home Decor,BooksQty,ClothingQty,ElectronicsQty,HomeDecorQty
0,C0001,0,0,0,1,558.0,114.6,0.0,942.433333,412.62,2.0,0.0,7.0,3.0
1,C0002,1,0,0,0,745.0,0.0,512.73,0.0,418.64,0.0,4.0,0.0,6.0
2,C0003,0,0,0,1,-18.0,0.0,122.36,1385.2,608.91,0.0,4.0,4.0,6.0
3,C0004,0,0,0,1,507.0,629.493333,0.0,677.87,703.553333,8.0,0.0,6.0,9.0
4,C0005,1,0,0,0,578.0,0.0,0.0,590.19,853.86,0.0,0.0,4.0,3.0


In [39]:
# Transactions annotated with their product Category.
transactions_with_category = transactions_df.merge(
    product_df[['ProductID', 'Category']],
    how='left',
    on='ProductID',
)

# Left join against the customer IDs in customer_df (keeps every transaction row).
customer_product_category_df = transactions_with_category.merge(
    customer_df[['CustomerID']],
    how='left',
    on='CustomerID',
)

# Count transactions per (customer, category), spread the categories into columns,
# then collapse the counts into a 0/1 "has purchased" flag.
customer_category_matrix = (
    customer_product_category_df
    .groupby(['CustomerID', 'Category'])
    .size()
    .unstack(fill_value=0)
    .gt(0)
    .astype(int)
)

print(customer_category_matrix.head())


Category    Books  Clothing  Electronics  Home Decor
CustomerID                                          
C0001           1         0            1           1
C0002           0         1            0           1
C0003           0         1            1           1
C0004           1         0            1           1
C0005           0         0            1           1


In [49]:
def get_top_3_similar_customers(customer_id, customer_features, customer_category_matrix,
                                feature_weight=0.4, category_weight=0.6, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Two cosine similarities are computed between the given customer and every
    other customer -- one over the rows of ``customer_features`` and one over
    the rows of ``customer_category_matrix`` -- and blended as
    ``feature_weight * feature_sim + category_weight * category_sim``.

    Parameters
    ----------
    customer_id
        Identifier of the customer to find lookalikes for.
    customer_features : pandas.DataFrame
        One row per customer with a ``CustomerID`` column; every other column
        is used as a feature. Missing values are treated as 0.
    customer_category_matrix : pandas.DataFrame
        Indexed by customer ID, one column per category. Missing values are
        treated as 0.
    feature_weight, category_weight : float, optional
        Blend weights for the two similarities (defaults 0.4 and 0.6).
    top_n : int, optional
        Number of lookalikes to return (default 3).

    Returns
    -------
    list of [customer_id, score]
        Up to ``top_n`` pairs ordered by descending score. Scores are plain
        Python floats so the list serialises cleanly (NumPy scalars would be
        written as ``np.float64(...)`` under NumPy 2).

    Raises
    ------
    KeyError
        If ``customer_id`` is absent from either input table.
    """
    # Key both sources by customer ID so their scores can be aligned by label.
    feature_matrix = customer_features.fillna(0).set_index('CustomerID')
    category_matrix = customer_category_matrix.fillna(0)

    if customer_id not in feature_matrix.index:
        raise KeyError(f"Customer {customer_id!r} not found in customer_features")
    if customer_id not in category_matrix.index:
        raise KeyError(f"Customer {customer_id!r} not found in customer_category_matrix")

    # NOTE(review): the features are compared on their raw scales, so columns with
    # large magnitudes dominate the cosine similarity -- consider standardising.
    feature_scores = pd.Series(
        cosine_similarity(feature_matrix.loc[[customer_id]], feature_matrix)[0],
        index=feature_matrix.index,
    )
    category_scores = pd.Series(
        cosine_similarity(category_matrix.loc[[customer_id]], category_matrix)[0],
        index=category_matrix.index,
    )

    combined_scores = feature_scores * feature_weight + category_scores * category_weight

    # Label alignment leaves NaN for customers present in only one source; leave
    # those out explicitly, together with the query customer itself.
    candidates = combined_scores.dropna().drop(customer_id)
    top_matches = candidates.nlargest(top_n)

    return [[customer, float(score)] for customer, score in top_matches.items()]

# Build lookalike recommendations for the first 20 customers in customer_df.
customers = customer_df['CustomerID'].head(20)

lookalike_data = [
    {
        'CustomerID': customer_id,
        'SimilarCustomers': get_top_3_similar_customers(
            customer_id, customer_features, customer_category_matrix
        ),
    }
    for customer_id in customers
]

# One row per customer; SimilarCustomers holds that customer's [id, score] pairs.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Anrutha_J_K_Lookalike.csv", index=False)

