In [45]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler 

In [27]:
data = pd.read_excel('online_retail.xlsx')
print("Data loaded successfully.")

Data loaded successfully.


In [28]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [29]:
data = data[(data['Quantity'] > 0) & (data['UnitPrice'] > 0)]
data.drop_duplicates(subset=['Description', 'StockCode'], inplace=True)

print("Data preprocessing complete. Data shape:", data.shape)

Data preprocessing complete. Data shape: (4161, 8)


In [33]:
pivot_table = data.pivot_table(index='CustomerID', columns='StockCode', values='Quantity', fill_value=0, aggfunc='sum')

pivot_table.columns = pivot_table.columns.astype(str)

scaler = StandardScaler()
pivot_table_scaled = scaler.fit_transform(pivot_table)
pivot_table_scaled = pd.DataFrame(pivot_table_scaled, index=pivot_table.index, columns=pivot_table.columns)

print("Pivot table created and data normalized for collaborative filtering.")
pivot_table_scaled.head()

Pivot table created and data normalized for collaborative filtering.


StockCode,10002,10080,10120,10125,10133,15030,15034,15036,15039,16008,...,90212C,90214M,90214S,90214U,90214V,BANK CHARGES,C2,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12357.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12370.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12377.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12383.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199


In [34]:
cosine_sim_collab = cosine_similarity(pivot_table_scaled)
cosine_sim_collab = pd.DataFrame(cosine_sim_collab, index=pivot_table.index, columns=pivot_table.index)

print("Cosine similarity matrix for collaborative filtering computed. Preview:")
cosine_sim_collab.head()

Cosine similarity matrix for collaborative filtering computed. Preview:


CustomerID,12346.0,12357.0,12370.0,12377.0,12383.0,12388.0,12395.0,12399.0,12406.0,12412.0,...,18172.0,18177.0,18196.0,18219.0,18221.0,18229.0,18239.0,18245.0,18252.0,18257.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,1.0,0.001522,-0.000469,0.001148,0.001522,0.001522,0.001522,0.001522,0.001302,0.001522,...,0.000252,0.001522,0.000252,0.00029,0.001522,0.001522,0.001522,0.000252,0.001522,0.001522
12357.0,0.001522,1.0,-0.000469,0.001148,0.001522,0.001522,0.001522,0.001522,0.001302,0.001522,...,0.000252,0.001522,0.000252,0.00029,0.001522,0.001522,0.001522,0.000252,0.001522,0.001522
12370.0,-0.000469,-0.000469,1.0,-0.000656,-0.000469,-0.000469,-0.000469,-0.000469,-0.000671,-0.000469,...,-0.000809,-0.000469,-0.000809,-0.000935,-0.000469,-0.000469,-0.000469,-0.000809,-0.000469,-0.000469
12377.0,0.001148,0.001148,-0.000656,1.0,0.001148,0.001148,0.001148,0.001148,0.000913,0.001148,...,5e-06,0.001148,5e-06,5e-06,0.001148,0.001148,0.001148,5e-06,0.001148,0.001148
12383.0,0.001522,0.001522,-0.000469,0.001148,1.0,0.001522,0.001522,0.001522,0.001302,0.001522,...,0.000252,0.001522,0.000252,0.00029,0.001522,0.001522,0.001522,0.000252,0.001522,0.001522


In [39]:
# Use TF-IDF to convert product descriptions into a matrix of TF-IDF features for content-based filtering
tfidf = TfidfVectorizer(stop_words='english')
# Ensure data consistency by dropping any rows where Description is NaN
consistent_data = data.dropna(subset=['Description', 'StockCode']).drop_duplicates(['StockCode'])

# Generate TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(consistent_data['Description'])

# Compute cosine similarity based on product features
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Create a DataFrame for the cosine similarity matrix with proper indices and columns based on StockCode
cosine_sim_content = pd.DataFrame(cosine_sim_content, index=consistent_data['StockCode'].unique(), columns=consistent_data['StockCode'].unique())

print("Cosine similarity matrix for content-based filtering computed.")
cosine_sim_content.head()

Cosine similarity matrix for content-based filtering computed.


Unnamed: 0,85123A,71053,84406B,84029G,84029E,22752,21730,22633,22632,84879,...,23560,23576,23562,23561,23609,85179a,23617,90214U,47591b,23843
85123A,1.0,0.205753,0.0,0.0,0.238734,0.0,0.348252,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.175496,0.0,0.0,0.0,0.0
71053,0.205753,1.0,0.0,0.0,0.14459,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84406B,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84029G,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.187952,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84029E,0.238734,0.14459,0.0,0.0,1.0,0.0,0.0,0.0,0.075402,0.0,...,0.0,0.088307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Define a function to compute hybrid product recommendations combining collaborative and content-based methods
def hybrid_recommendation(customer_id, stock_code, weight_collab=0.7, weight_content=0.3):
    if stock_code in cosine_sim_content.columns and customer_id in cosine_sim_collab.index:
        # Calculate combined scores using weighted averages of both similarity matrices
        collab_scores = cosine_sim_collab.loc[customer_id].sort_values(ascending=False)
        content_scores = cosine_sim_content.loc[stock_code].sort_values(ascending=False)
        hybrid_scores = (collab_scores * weight_collab) + (content_scores * weight_content)

        # Retrieve top 10 recommendations, excluding the current product
        recommended_products = hybrid_scores.sort_values(ascending=False).index[1:11]
        return recommended_products
    else:
        return "Product code or customer ID not found in the dataset."

print("Hybrid recommendation function defined successfully.")

Hybrid recommendation function defined successfully.


In [43]:
# Demonstrate the recommendation system using an example customer ID and product stock code
example_customer_id = pivot_table.index[10]  # Example customer ID
example_stock_code = '85123A'  # Example stock code

recommended_products = hybrid_recommendation(example_customer_id, example_stock_code)
print(f"Products recommended for customer '{example_customer_id}' interested in product '{example_stock_code}'are {recommended_products}")

Products recommended for customer '12415.0' interested in product '85123A'are Index([16016.0, 17096.0, 15036.0, 15034.0, 16011.0, 16236.0, 15039.0, 16033.0,
       16218.0,   10002],
      dtype='object')
