In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_excel('online_retail.xlsx')
print("Data loaded successfully.")

Data loaded successfully.


In [3]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Data cleaning to remove negative quantities and zero prices
data = data[(data['Quantity'] > 0) & (data['UnitPrice'] > 0)]
data.drop_duplicates(subset=['Description', 'StockCode'], inplace=True)

print("Data preprocessing complete. Cleaned data shape:", data.shape)

Data preprocessing complete. Cleaned data shape: (4161, 8)


In [5]:
# Creating a pivot table for collaborative filtering
pivot_table = data.pivot_table(index='CustomerID', columns='StockCode', values='Quantity', fill_value=0, aggfunc='sum')

# Converting all column names to string to ensure compatibility with sklearn operations
pivot_table.columns = pivot_table.columns.map(str)

# Normalizing the data using StandardScaler
scaler = StandardScaler()
pivot_table_scaled = scaler.fit_transform(pivot_table)
pivot_table_scaled = pd.DataFrame(pivot_table_scaled, index=pivot_table.index, columns=pivot_table.columns)

print("Pivot table for collaborative filtering created and normalized. Preview:")
pivot_table_scaled.head()

Pivot table for collaborative filtering created and normalized. Preview:


StockCode,10002,10080,10120,10125,10133,15030,15034,15036,15039,16008,...,90212C,90214M,90214S,90214U,90214V,BANK CHARGES,C2,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12357.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12370.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12377.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199
12383.0,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,...,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199,-0.034199


In [6]:
# Splitting the data for content-based filtering
cleaned_data = data.dropna(subset=['Description', 'StockCode']).drop_duplicates(['StockCode'])
train_data, test_data = train_test_split(cleaned_data, test_size=0.2, random_state=42)

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf.fit_transform(train_data['Description'])
tfidf_matrix_test = tfidf.transform(test_data['Description'])

print("TF-IDF vectorization complete. Train and test matrices ready.")

TF-IDF vectorization complete. Train and test matrices ready.


In [7]:
# Calculating cosine similarity for collaborative filtering
cosine_sim_collab = cosine_similarity(pivot_table_scaled)
cosine_sim_collab = pd.DataFrame(cosine_sim_collab, index=pivot_table.index, columns=pivot_table.index)

# Calculating cosine similarity for content-based filtering
cosine_sim_content = cosine_similarity(tfidf_matrix_train)
cosine_sim_content = pd.DataFrame(cosine_sim_content, index=train_data['StockCode'].unique(), columns=train_data['StockCode'].unique())

print("Similarity matrices for collaborative and content-based filtering computed.")

Similarity matrices for collaborative and content-based filtering computed.


In [8]:
def hybrid_recommendation(customer_id, stock_code, weight_collab=0.7, weight_content=0.3):
    if stock_code in cosine_sim_content.columns and customer_id in cosine_sim_collab.index:
        collab_scores = cosine_sim_collab.loc[customer_id]
        content_scores = cosine_sim_content.loc[stock_code]
        hybrid_scores = collab_scores * weight_collab + content_scores * weight_content

        # Retrieving top 10 recommendations, excluding the current product
        recommendations = hybrid_scores.sort_values(ascending=False).index[1:11]
        
        # Converting all recommendations to string type for output consistency
        recommendations = recommendations.astype(str).tolist()
        return recommendations
    else:
        return []  # Return an empty list if no data found

In [9]:
# Testing the recommendation system with an example
example_customer_id = pivot_table.index[10]  # Example customer ID for demonstration
example_stock_code = '85123A'  # Example stock code for demonstration

recommended_products = hybrid_recommendation(example_customer_id, example_stock_code)
if recommended_products:
    print(f"For the user who bought product '{example_stock_code}', we recommend the following products:")
    print(', '.join(recommended_products))
else:
    print(f"No recommendations could be generated for the user with Customer ID '{example_customer_id}' who bought product '{example_stock_code}'.")

For the user who bought product '85123A', we recommend the following products:
17096.0, 15036.0, 15034.0, 16236.0, 15039.0, 16033.0, 16218.0, 10002, 10080, 10120
