In [57]:
# Import required libraries
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [58]:
#Loadd the product catalog
product_catalog_df =pd.read_csv ("C:/data/Work/Data_processing/Feature_Engineering/Feature_product_catalog.csv")

In [59]:
# Load the customer survey dataset
customer_survey_df = pd.read_csv("C:/data/Work/Data_processing/Feature_Engineering/Feature_customer_survey.csv")

In [60]:
# Display basic information about the dataset
product_info = {
    "Shape": product_catalog_df.shape,
    "Columns": list(product_catalog_df.columns),
    "Missing Values": product_catalog_df.isnull().sum().to_dict(),
    "Sample Data": product_catalog_df.head()}
print(product_info)

{'Shape': (100, 25), 'Columns': ['Product ID', 'Product Name', 'Category', 'Material', 'Size', 'Special_features', 'Customer Type', 'Price', 'Payment Type', 'Sellable Online', 'Sales Volume', 'Return Rate', 'Storage Cost', 'Seasonality Score', 'Implicit Feedback', 'Sales_to_Return_Ratio', 'Revenue_Per_Product', 'Online_Sellability_Score', 'Storage_Efficiency_Score', 'High_Demand_Indicator', 'Implicit_Feedback_Score', 'Top_Rated', 'Likelihood_of_Purchase', 'Customer_Interest_Score', 'Expert_Judgment_Score'], 'Missing Values': {'Product ID': 0, 'Product Name': 0, 'Category': 0, 'Material': 0, 'Size': 0, 'Special_features': 0, 'Customer Type': 0, 'Price': 0, 'Payment Type': 0, 'Sellable Online': 0, 'Sales Volume': 0, 'Return Rate': 0, 'Storage Cost': 0, 'Seasonality Score': 0, 'Implicit Feedback': 0, 'Sales_to_Return_Ratio': 0, 'Revenue_Per_Product': 0, 'Online_Sellability_Score': 0, 'Storage_Efficiency_Score': 0, 'High_Demand_Indicator': 0, 'Implicit_Feedback_Score': 0, 'Top_Rated': 0, '

In [61]:
# Select relevant columns for recommendations
selected_features = [
    "Product Name", "Category", "Material", "Size", "Special_features", 
    "Sales Volume", "Likelihood_of_Purchase", "Revenue_Per_Product", "Expert_Judgment_Score"
]

# Drop missing values
product_catalog_df = product_catalog_df.dropna(subset=selected_features)

# Normalize business-related numerical values using MinMaxScaler
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(product_catalog_df[["Sales Volume", "Likelihood_of_Purchase", "Revenue_Per_Product", "Expert_Judgment_Score"]])

# Add normalized values back to dataframe
product_catalog_df[["Sales_Score", "Purchase_Score", "Revenue_Score", "Expert_Score"]] = scaled_values

# Function to create feature strings with structured business metrics
def create_feature_string(row):
    return (
        f"{row['Category']} " * 4 + 
        f"{row['Material']} " * 3 +  
        f"{row['Special_features']} " * 2 +  
        f"{row['Size']} " * 1 +
        f"sales_{round(row['Sales_Score'], 2)} " + 
        f"purchase_{round(row['Purchase_Score'], 2)} " + 
        f"revenue_{round(row['Revenue_Score'], 2)} " + 
        f"expert_{round(row['Expert_Score'], 2)}"
    ).strip()

# Apply the function
product_catalog_df["Combined_Features"] = product_catalog_df.apply(create_feature_string, axis=1)

# Display sample feature combinations
print(product_catalog_df[["Product Name", "Combined_Features"]].head(10))

            Product Name                                  Combined_Features
0   Kitchen Pantry Shelf  Kitchen & Dining Kitchen & Dining Kitchen & Di...
1  Ottoman Storage Bench  Storage & Organization Storage & Organization ...
2       Rustic Bookshelf  Storage & Organization Storage & Organization ...
3        Rattan Armchair  Sofas & Seating Sofas & Seating Sofas & Seatin...
4     Glass Dining Table  Tables Tables Tables Tables Glass Glass Glass ...
5   Upholstered armchair  Sofas & Seating Sofas & Seating Sofas & Seatin...
6      Wooden Side Table  Tables Tables Tables Tables Wood Wood Wood Wat...
7      Smart Office Desk  Tables Tables Tables Tables Wood Wood Wood Dur...
8          Bistro Tables  Tables Tables Tables Tables Unknown Unknown Un...
9  Open Closet Organizer  Bedroom Furniture Bedroom Furniture Bedroom Fu...


In [62]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words="english", max_features=1000, ngram_range=(1,2))

# Convert Combined_Features column into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(product_catalog_df["Combined_Features"])

# Display matrix shape
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (100, 536)


In [63]:
# Compute cosine similarity between product TF-IDF vectors
cosine_sim = cosine_similarity(tfidf_matrix)

# Display shape of similarity matrix
print("Cosine Similarity Matrix Shape:", cosine_sim.shape)

# Show a sample of the similarity matrix
print("Sample Similarity Scores:\n", np.round(cosine_sim[:5, :5], 2))


Cosine Similarity Matrix Shape: (100, 100)
Sample Similarity Scores:
 [[1.   0.01 0.11 0.07 0.05]
 [0.01 1.   0.57 0.01 0.02]
 [0.11 0.57 1.   0.05 0.08]
 [0.07 0.01 0.05 1.   0.04]
 [0.05 0.02 0.08 0.04 1.  ]]


In [None]:
#Content based recommendation system
def recommend_products(product_name, product_catalog_df, cosine_sim, top_n=5, min_similarity=0.05):
    print(f"\nSearching for recommendations for: {product_name}")
    
    # Find index of the product in the catalog
    product_indices = product_catalog_df[product_catalog_df["Product Name"] == product_name].index

    if product_indices.empty:
        print("Product not found in the dataset.")
        return [f"Product '{product_name}' not found in the catalog."]

    product_index = product_indices[0]
    similarity_scores = list(enumerate(cosine_sim[product_index]))

    # Sort by similarity in descending order
    sorted_products = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    recommendations = []
    seen_products = set()

    for idx, score in sorted_products:
        rec_product = product_catalog_df.iloc[idx]["Product Name"]

        # Skip the same product and low scores
        if rec_product == product_name or score < min_similarity:
            continue

        if rec_product not in seen_products:
            print(f"Checking Product: {rec_product} - Score: {score:.2f}")
            recommendations.append(rec_product)
            seen_products.add(rec_product)

        if len(recommendations) == top_n:
            break

    if not recommendations:
        print("No strong matches found.")
        return ["No strong matches found. Try another product."]
    
    return recommendations







In [65]:
# Use Case to see whether the system works
# Finding similar products for a given product
product_to_search = "Rustic Bookshelf"  
recommendations = recommend_products(product_to_search, product_catalog_df, cosine_sim)

# Display recommendations
print(f"\nRecommended Products for '{product_to_search}':")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")


Searching for recommendations for: Rustic Bookshelf
Checking Product: Modern TV Cabinet - Score: 0.69
Checking Product: Sliding Door Wardrobe - Score: 0.69
Checking Product: Ottoman Storage Bench - Score: 0.60
Checking Product: Compact Shoe Rack - Score: 0.58
Checking Product: Foldable Sun Lounger - Score: 0.22

Recommended Products for 'Rustic Bookshelf':
1. Modern TV Cabinet
2. Sliding Door Wardrobe
3. Ottoman Storage Bench
4. Compact Shoe Rack
5. Foldable Sun Lounger


In [66]:
#Collaborative Filtering
#Renaming "Suggested Products" to "Product Name" in the customer survey data

if "Suggested Products" in customer_survey_df.columns:
    customer_survey_df.rename(columns={"Suggested Products": "Product Name"}, inplace=True)

# Convert product lists stored as strings to actual lists
customer_survey_df["Product Name"] = customer_survey_df["Product Name"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
)

# Expand product list so each product gets its own row
customer_survey_df = customer_survey_df.explode("Product Name")

# Clean product names (lowercase and remove unwanted spaces)
customer_survey_df["Product Name"] = customer_survey_df["Product Name"].str.lower().str.strip()
product_catalog_df["Product Name"] = product_catalog_df["Product Name"].str.lower().str.strip()

# Remove unmatched products
customer_survey_df = customer_survey_df[customer_survey_df["Product Name"].isin(product_catalog_df["Product Name"])]

# Keep only needed columns
survey_cols = ["Customer ID", "Product Name", "Recommendation Score", "Purchase Frequency", "Review_Sentiment", "Engagement_Sentiment"]
customer_survey_df = customer_survey_df[survey_cols]

catalog_cols = ["Product Name", "Implicit_Feedback_Score"]
product_catalog_df = product_catalog_df[catalog_cols]

# Merge both datasets
merged_df = pd.merge(customer_survey_df, product_catalog_df, on="Product Name", how="inner")

# Fill missing values
merged_df.fillna(0, inplace=True)


In [69]:
print(merged_df.head())


   Customer ID      Product Name  Recommendation Score Purchase Frequency  \
0            1  rustic bookshelf             -0.173299            monthly   
1            1  rustic bookshelf             -0.173299            monthly   
2            1  rustic bookshelf             -0.173299            monthly   
3            1  rustic bookshelf             -0.173299            monthly   
4            1  rustic bookshelf             -0.173299            monthly   

   Review_Sentiment  Engagement_Sentiment  Implicit_Feedback_Score  
0                 1                     1                        0  
1                 1                     1                        1  
2                 1                     1                       -1  
3                 1                     1                        0  
4                 1                     1                        0  


In [70]:
# Create Final Score (Combining Recommendation Score & Implicit Feedback Score)
merged_df["Final_Score"] = merged_df["Recommendation Score"].combine_first(merged_df["Implicit_Feedback_Score"])

# Create User-Product Interaction Matrix
interaction_matrix = merged_df.pivot_table(index="Customer ID",  # Rows = Customers
                                           columns="Product Name",  # Columns = Products
                                           values="Final_Score",  # Values = Scores
                                           aggfunc="mean")  # Average in case of duplicates

# Fill missing values 
interaction_matrix.fillna(0, inplace=True)

#  Display final interaction matrix
print("User-Product Interaction Matrix:")
print(interaction_matrix.head())

User-Product Interaction Matrix:
Product Name  6-piece outdoor dining set  adjustable standing desk  \
Customer ID                                                          
1                                    0.0                 -0.173299   
2                                    0.0                  0.000000   
3                                    0.0                  0.000000   
4                                    0.0                  0.000000   
5                                    0.0                 -0.699509   

Product Name  chaise sofas  compact shoe rack  foldable sun lounger  \
Customer ID                                                           
1                  0.00000                0.0                   0.0   
2                  0.00000                0.0                   0.0   
3                  0.00000                0.0                   0.0   
4                  0.24767                0.0                   0.0   
5                  0.00000                0.0     

In [71]:
# Count how many unique customers interacted with each product
product_interaction_counts = merged_df.groupby("Product Name")["Customer ID"].nunique().sort_values(ascending=False)

# Display top 10 most interacted products
print(" Top 10 Most Interacted Products:")
print(product_interaction_counts.head(10))

# Display bottom 10 least interacted products
print("\n Least Interacted Products:")
print(product_interaction_counts.tail(10))


 Top 10 Most Interacted Products:
Product Name
adjustable standing desk      19
smart office desk             15
chaise sofas                  13
padded dining chair           11
rustic bookshelf              11
foldable sun lounger          10
compact shoe rack              9
6-piece outdoor dining set     9
upholstered armchair           9
outdoor patio chair            9
Name: Customer ID, dtype: int64

 Least Interacted Products:
Product Name
compact shoe rack             9
6-piece outdoor dining set    9
upholstered armchair          9
outdoor patio chair           9
luxury bean bag               9
sliding door wardrobe         8
ottoman storage bench         6
rattan armchair               5
minimalist sectional sofa     4
queen bed with storage        4
Name: Customer ID, dtype: int64


In [72]:
# Find customers who bought only one product
single_product_customers = customer_interaction_counts[customer_interaction_counts == 1]

print(f" Number of Customers Who Bought Only 1 Product: {len(single_product_customers)}")
print("\n Customers Who Bought Only One Product:")
print(single_product_customers.index.tolist())


 Number of Customers Who Bought Only 1 Product: 31

 Customers Who Bought Only One Product:
[20, 22, 18, 15, 14, 6, 8, 2, 4, 23, 57, 40, 31, 37, 48, 62, 58, 74, 78, 73, 72, 66, 76, 79, 91, 92, 84, 81, 82, 85, 95]


In [76]:
# Creating a binary version of the interaction matrix (1 = interacted, 0 = not interacted)
binary_matrix = (interaction_matrix > 0).astype(int)

# Filter out users who haven’t interacted with any products
binary_matrix = binary_matrix[binary_matrix.sum(axis=1) > 0]

# Ouput
print("Binary User-Product Matrix Sample:")
print(binary_matrix.head(10))


Binary User-Product Matrix Sample:
Product Name  6-piece outdoor dining set  adjustable standing desk  \
Customer ID                                                          
2                                      0                         0   
4                                      0                         0   
7                                      0                         1   
8                                      0                         0   
9                                      0                         0   
10                                     1                         0   
12                                     0                         1   
13                                     0                         0   
15                                     0                         0   
18                                     0                         0   

Product Name  chaise sofas  compact shoe rack  foldable sun lounger  \
Customer ID                                          

In [77]:
# Transpose interaction matrix to get Products as rows and Customers as columns
item_user_matrix = interaction_matrix.T

# Calculate cosine similarity between products
item_similarity = cosine_similarity(item_user_matrix)

# Store it in a DataFrame for easier lookup
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index
)

# Output
print("Item-Item Similarity Matrix (Sample):")
print(item_similarity_df.iloc[:5, :5])



Item-Item Similarity Matrix (Sample):
Product Name                6-piece outdoor dining set  \
Product Name                                             
6-piece outdoor dining set                    1.000000   
adjustable standing desk                      0.000000   
chaise sofas                                  0.000000   
compact shoe rack                             0.338423   
foldable sun lounger                          0.000000   

Product Name                adjustable standing desk  chaise sofas  \
Product Name                                                         
6-piece outdoor dining set                       0.0           0.0   
adjustable standing desk                         1.0           0.0   
chaise sofas                                     0.0           1.0   
compact shoe rack                                0.0           0.0   
foldable sun lounger                             0.0           0.0   

Product Name                compact shoe rack  foldable sun loun

In [79]:
def recommend_similar_items(product_name, top_n=5):
    if product_name not in item_similarity_df.index:
        print(f"Product '{product_name}' not found in the data.")
        return []

    # Sort the similarity scores and skip the product itself
    similar_scores = item_similarity_df[product_name].sort_values(ascending=False)
    top_matches = similar_scores.iloc[1:top_n + 1]

    print(f"\nProducts similar to '{product_name}':")
    for item, score in top_matches.items():
        print(f"- {item} (Score: {score:.2f})")

    return top_matches.index.tolist()

# Example 
recommend_similar_items("rustic bookshelf")



Products similar to 'rustic bookshelf':
- padded dining chair (Score: 0.41)
- adjustable standing desk (Score: 0.23)
- smart office desk (Score: 0.13)
- foldable sun lounger (Score: 0.00)
- 6-piece outdoor dining set (Score: 0.00)


['padded dining chair',
 'adjustable standing desk',
 'smart office desk',
 'foldable sun lounger',
 '6-piece outdoor dining set']