In [5]:
import pandas as pd


In [7]:
df = pd.read_excel('Online Retail.xlsx', parse_dates=["InvoiceDate"])
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


## Data Pre-processing

In [9]:
# drop rows with missing values for CustomerID
df = df.dropna(subset=['CustomerID'])

# convert CustomerID to integer
df['CustomerID'] = df['CustomerID'].astype(int)

# Remove rows with negative quantity
df = df[df['Quantity'] > 0]

# Remove rows with negative unit price
df = df[df['UnitPrice'] > 0]

# Aggregate data per customer-item pair
customer_item_matrix = df.groupby(['CustomerID', 'StockCode'])['Quantity'].sum().unstack().fillna(0)

# display the matrix
customer_item_matrix.head()

# fill in missing values for price with median value
df['UnitPrice'].fillna(df['UnitPrice'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['UnitPrice'].fillna(df['UnitPrice'].median(), inplace=True)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute customer-based similarity
customer_sim = cosine_similarity(customer_item_matrix)
customer_sim_df = pd.DataFrame(customer_sim, index=customer_item_matrix.index, columns=customer_item_matrix.index)

# Compute item-based similarity
item_sim = cosine_similarity(customer_item_matrix.T)
item_sim_df = pd.DataFrame(item_sim, index=customer_item_matrix.columns, columns=customer_item_matrix.columns)

# Display similarity matrices
print("Customer Similarity Matrix (Sample):")
print(customer_sim_df.iloc[:5, :5])

print("\nItem Similarity Matrix (Sample):")
print(item_sim_df.iloc[:5, :5])


Customer Similarity Matrix (Sample):
CustomerID  12346     12347     12348     12349     12350
CustomerID                                               
12346         1.0  0.000000  0.000000  0.000000  0.000000
12347         0.0  1.000000  0.148879  0.020750  0.014435
12348         0.0  0.148879  1.000000  0.000169  0.000315
12349         0.0  0.020750  0.000169  1.000000  0.030121
12350         0.0  0.014435  0.000315  0.030121  1.000000

Item Similarity Matrix (Sample):
StockCode     10002     10080     10120     10125     10133
StockCode                                                  
10002      1.000000  0.000000  0.001548  0.853890  0.052085
10080      0.000000  1.000000  0.000000  0.004958  0.020646
10120      0.001548  0.000000  1.000000  0.001600  0.042543
10125      0.853890  0.004958  0.001600  1.000000  0.011629
10133      0.052085  0.020646  0.042543  0.011629  1.000000


# Building Recommender System

In [13]:
def get_hybrid_recommendations(customer_id, top_n=15, weight_customer=0.2, weight_item=0.8):
    # Get top similar customers
    similar_customers = customer_sim_df[customer_id].sort_values(ascending=False).iloc[1:6]  # Exclude self
    
    # Get items purchased by similar customers
    customer_based_scores = customer_item_matrix.loc[similar_customers.index].sum().sort_values(ascending=False)

    # Get items similar to those the customer already bought
    purchased_items = customer_item_matrix.loc[customer_id]
    purchased_items = purchased_items[purchased_items > 0].index  # Get list of purchased items
    
    item_based_scores = item_sim_df[purchased_items].sum(axis=1).sort_values(ascending=False)
    
    # Combine scores
    hybrid_scores = (weight_customer * customer_based_scores) + (weight_item * item_based_scores)
    hybrid_scores = hybrid_scores.sort_values(ascending=False)

    # Remove items the customer has already purchased
    hybrid_scores = hybrid_scores.drop(purchased_items, errors="ignore")

    return hybrid_scores.head(top_n)

# Example: Get recommendations for a specific customer
customer_id = 12395  # Replace with any customer ID
recommendations = get_hybrid_recommendations(customer_id)
print(f"Top Recommendations for Customer {customer_id}:\n", recommendations)


Top Recommendations for Customer 12395:
 StockCode
22243    442.360179
22595    406.228011
22492    364.749264
22554    341.847674
22328    339.772136
20712    337.718462
23204    311.118586
20726    297.573005
22951    289.943104
20725    288.492152
22382    272.216166
23205    264.987872
35961    262.087406
22131    237.267453
84991    233.864170
dtype: float64


In [14]:
def get_price_adjusted_recommendations(customer_id, top_n=10, price_weight=0.03):
    # Get hybrid recommendations
    hybrid_scores = get_hybrid_recommendations(customer_id, top_n=20)

    # Ensure unique StockCode prices
    df_price = df.groupby('StockCode')['UnitPrice'].median()

    # Compute average spending of the customer
    customer_purchases = customer_item_matrix.loc[customer_id]
    avg_spending = (customer_purchases * df_price).sum() / customer_purchases.sum()

    # Compute price scores
    price_scores = 1 / (1 + abs(df_price - avg_spending))

    # Align indexes correctly
    valid_index = hybrid_scores.index.intersection(df_price.index)
    price_scores = price_scores.reindex(valid_index, fill_value=0)
    hybrid_scores = hybrid_scores.reindex(valid_index, fill_value=0)

    # Combine hybrid and price-based scores
    final_scores = (1 - price_weight) * hybrid_scores + price_weight * price_scores
    final_scores = final_scores.sort_values(ascending=False)

    recommendations_df = final_scores.reset_index().rename(columns={0: 'Score'})

    # Merge with item descriptions
    # descriptions = df[['StockCode', 'Description']].drop_duplicates().set_index('StockCode')
    # recommendations_df = recommendations_df.merge(descriptions, left_on='StockCode', right_index=True, how='left')

    return recommendations_df

customer_id = 12395
recommendations = get_price_adjusted_recommendations(customer_id)
print(recommendations)


    StockCode       Score
0       22243  429.116027
1       22595  394.059087
2       22492  353.822791
3       22554  331.618897
4       22328  329.591341
5       20712  327.606194
6       23204  301.802944
7       20726  288.672469
8       22951  281.260005
9       20725  279.864041
10      22382  264.076335
11      23205  257.056152
12      35961  254.242700
13      22131  230.170474
14      84991  226.863439
15      20723  225.493090
16      21390  225.323454
17      22467  224.358438
18      22908  222.575265
19      22489  217.816332


In [16]:
def precision_recall_at_k(actual_items, recommended_items):
    recommended_top_k = recommended_items
    relevant_items = set(actual_items)  # Items the user actually purchased

    hits = len(set(recommended_top_k) & relevant_items)
    precision = hits 
    recall = hits / len(relevant_items) if relevant_items else 0

    return precision, recall

# Example usage
customer_id = 17602 # Replace with an actual customer ID
# df = df.reset_index()
actual_purchases = df[df['CustomerID'] == customer_id]['StockCode'].unique()  # Items actually bought
recommended_items = get_price_adjusted_recommendations(customer_id, top_n=10).index.tolist()

precision, recall = precision_recall_at_k(actual_purchases, recommended_items)
print(f"Precision@5: {precision:.4f}, Recall@5: {recall:.4f}")


Precision@5: 0.0000, Recall@5: 0.0000


In [22]:
def get_price_adjusted_recommendations(customer_id, top_n=10, price_weight=0.3, repeat_weight=0.1):
    # Get hybrid recommendations
    hybrid_scores = get_hybrid_recommendations(customer_id, top_n=20)

    # Ensure unique StockCode prices
    df_price = df.groupby('StockCode')['UnitPrice'].median()

    # Compute average spending of the customer
    customer_purchases = customer_item_matrix.loc[customer_id]
    avg_spending = (customer_purchases * df_price).sum() / customer_purchases.sum()

    # Compute price scores
    price_scores = 1 / (1 + abs(df_price - avg_spending))

    # Compute repeat purchase scores (higher for past purchases)
    past_purchases = set(df[df['CustomerID'] == customer_id]['StockCode'])
    repeat_scores = pd.Series(0, index=hybrid_scores.index)
    repeat_scores.loc[list(past_purchases & set(hybrid_scores.index))] = 1

    # Align indexes correctly
    valid_index = hybrid_scores.index.intersection(df_price.index)
    price_scores = price_scores.reindex(valid_index, fill_value=0)
    hybrid_scores = hybrid_scores.reindex(valid_index, fill_value=0)
    repeat_scores = repeat_scores.reindex(valid_index, fill_value=0)

    # Combine hybrid, price, and repeat purchase scores
    final_scores = (1 - price_weight - repeat_weight) * hybrid_scores + \
                   price_weight * price_scores + repeat_weight * repeat_scores
    final_scores = final_scores.sort_values(ascending=False)

    return final_scores.head(top_n)

In [23]:
customer_id = 12348
recommendations = get_price_adjusted_recommendations(customer_id)
print(recommendations)


StockCode
21986     183.328866
23309      21.957187
21212      20.754768
21975      19.872988
21210      16.793933
15056N     12.729076
22417      11.699141
23307      11.243603
22728       7.304752
22726       7.214365
dtype: float64


In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Get descriptions of purchased and recommended items
purchased_descriptions = df[df['StockCode'].isin(actual_purchases)]['Description'].dropna().unique()
recommended_descriptions = df[df['StockCode'].isin(recommendations.StockCode)]['Description'].dropna().unique()

# Convert descriptions to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(list(purchased_descriptions) + list(recommended_descriptions))

# Compute cosine similarity between purchased and recommended items
similarity_matrix = cosine_similarity(tfidf_matrix[:len(purchased_descriptions)], tfidf_matrix[len(purchased_descriptions):])
avg_similarity = similarity_matrix.mean()  # Overall similarity score

print(f"Average similarity between past purchases and recommended items: {avg_similarity:.2f}")


Average similarity between past purchases and recommended items: 0.02


# Improve recommendations using the Description

In [45]:
import numpy as np
from gensim.models import Word2Vec

# Train Word2Vec on product descriptions
df['Description'] = df['Description'].astype(str)  # Ensure descriptions are strings
descriptions = [desc.split() for desc in df['Description'].dropna().unique()]
w2v_model = Word2Vec(sentences=descriptions, vector_size=50, window=5, min_count=1, workers=4)

# Create a dictionary mapping StockCode → item_vector
item_vector_dict = {}
for stock_code, desc in df[['StockCode', 'Description']].dropna().values:
    words = desc.split()
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if vectors:
        item_vector_dict[stock_code] = np.mean(vectors, axis=0)  # Average word vectors for the product

# Assign vectors to the DataFrame efficiently
df['item_vector'] = df['StockCode'].map(item_vector_dict)
df = df[df['item_vector'].notna()]  # Remove rows without vectors

# Convert the column into a NumPy array
item_vectors_matrix = np.vstack(df['item_vector'].values)



In [44]:
df['item_vector'].head()

0    [0.01814923, -0.016774353, -0.0045392537, -0.0...
1    [0.0037160965, 0.002281434, 0.01178594, 0.0071...
2    [0.014744875, -0.0029513403, 0.004356085, -0.0...
3    [0.023867821, 0.0066533685, 0.0030053717, -0.0...
4    [0.006448096, 6.395867e-05, 0.0025584945, -0.0...
Name: item_vector, dtype: object

In [57]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Create a mapping of StockCode to row index in item_vectors_matrix
stockcode_to_index = {stock: i for i, stock in enumerate(df['StockCode'].values)}

# Convert the column into a NumPy array
item_vectors_matrix = np.vstack(df['item_vector'].values)

def get_item_vector(item):
    index = stockcode_to_index.get(item)
    if index is not None:
        return item_vectors_matrix[index]
    return None


def get_price_adjusted_recommendations(customer_id, top_n=10, price_weight=0.05, text_weight=0.3):
    # Get hybrid recommendations
    hybrid_scores = get_hybrid_recommendations(customer_id, top_n=20)

    # Compute price scores
    df_price = df.groupby('StockCode')['UnitPrice'].median()
    customer_purchases = customer_item_matrix.loc[customer_id]
    avg_spending = (customer_purchases * df_price).sum() / customer_purchases.sum()
    price_scores = 1 / (1 + abs(df_price - avg_spending))

    # Align price_scores to match hybrid_scores
    price_scores = price_scores.reindex(hybrid_scores.index).fillna(0)

    # Get actual purchases for the customer
    actual_purchases = customer_purchases[customer_purchases > 0].index.tolist()

    # Compute purchased vectors efficiently
    purchased_vectors = np.array([get_item_vector(item) for item in actual_purchases if get_item_vector(item) is not None])

    # Compute recommended vectors efficiently
    recommended_vectors = np.array([get_item_vector(item) for item in hybrid_scores.index if get_item_vector(item) is not None])

    # Ensure both purchased_vectors and recommended_vectors are not empty
    if purchased_vectors.size > 0 and recommended_vectors.size > 0:
        purchased_mean_vector = purchased_vectors.mean(axis=0).reshape(1, -1)
        text_scores = cosine_similarity(purchased_mean_vector, recommended_vectors).flatten()
        if text_scores.max() > text_scores.min():  
            text_scores = (text_scores - text_scores.min()) / (text_scores.max() - text_scores.min())   # Normalize
        else:
            text_scores = np.zeros_like(text_scores)  # Avoid NaNs
    else:
        text_scores = np.zeros(len(hybrid_scores))  # If no vectors available, set text_scores to zero

    # Combine hybrid, price, and text scores
    final_scores = (1 - price_weight - text_weight) * hybrid_scores + price_weight * price_scores + text_weight * text_scores
    final_scores = final_scores.sort_values(ascending=False)

    return final_scores.head(top_n)

# Test the function
customer_id = 13402

customer_purchases = customer_item_matrix.loc[customer_id]
actual_purchases = customer_purchases[customer_purchases > 0].index.tolist()

new_recommendations = get_price_adjusted_recommendations(customer_id, top_n=10).index.tolist()

# Get item vectors for purchased and recommended items
new_purchased_vectors = np.array([get_item_vector(item) for item in actual_purchases if get_item_vector(item) is not None])
new_recommended_vectors = np.array([get_item_vector(item) for item in new_recommendations if get_item_vector(item) is not None])

# Ensure vectors are not empty before computing similarity
if new_purchased_vectors.size > 0 and new_recommended_vectors.size > 0:
    new_avg_similarity = np.mean([cosine_similarity(pv.reshape(1, -1), rv.reshape(1, -1))[0, 0]
                                  for pv in new_purchased_vectors for rv in new_recommended_vectors])
else:
    new_avg_similarity = 0  # Default value if no vectors are available

print(f"New average similarity: {new_avg_similarity:.2f}")

New average similarity: 0.88


In [48]:
get_price_adjusted_recommendations(12348)

StockCode
21986     122.589650
23309      15.024347
21212      14.221993
21975      13.631525
21210      11.552765
15056N      8.502076
22417       8.184379
23307       7.881808
22728       5.174128
22726       5.102143
dtype: float64

In [58]:
new_recommendations = get_price_adjusted_recommendations(customer_id, top_n=10).index.tolist()

# Get item vectors for purchased and recommended items
new_purchased_vectors = np.array([get_item_vector(item) for item in actual_purchases if get_item_vector(item) is not None])
new_recommended_vectors = np.array([get_item_vector(item) for item in new_recommendations if get_item_vector(item) is not None])

# Ensure vectors are not empty before computing similarity
if new_purchased_vectors.size > 0 and new_recommended_vectors.size > 0:
    new_similarity_matrix = cosine_similarity(new_purchased_vectors, new_recommended_vectors)
    new_avg_similarity = new_similarity_matrix.mean()
else:
    new_avg_similarity = 0  # Default value if no vectors are available

print(f"New average similarity: {new_avg_similarity:.2f}")


New average similarity: 0.88


In [65]:
def get_query_vector(query):
    words = query.split()
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    return None

def get_recommendations_from_search(query, customer_id, top_n=10):
    query_vector = get_query_vector(query)  # Convert search query to vector
    if query_vector is None:
        print("Query vector is None, returning empty list.")
        return []

    # Get item vectors for unique StockCodes
    unique_items = df.drop_duplicates(subset='StockCode')
    item_vectors = np.array([get_item_vector(item) for item in unique_items['StockCode'] if get_item_vector(item) is not None])
    
    # Calculate cosine similarity
    item_similarities = cosine_similarity(query_vector.reshape(1, -1), item_vectors).flatten()
    
    # Add similarity scores to the unique items dataframe
    unique_items = unique_items.reset_index(drop=True)  # Reset index to align with similarities
    unique_items['similarity'] = item_similarities
    
    # Get top N recommendations
    search_based_recommendations = unique_items.sort_values(by='similarity', ascending=False).head(top_n)[['StockCode', 'Description']]
    
    return search_based_recommendations

customer_id = 13402
search_query = "SILVER TURQUOISE BOUDICCA"
recommendations = get_recommendations_from_search(search_query, customer_id)
print(recommendations)


     StockCode                          Description
3622    90162A   ANT SILVER TURQUOISE BOUDICCA RING
3067     23101        SILVER STARS TABLE DECORATION
3252     23223       CHRISTMAS TREE HANGING SILVER 
2130    90162C     ANT SILVER FUSCHIA BOUDICCA RING
393     35591T            TURQUOISE CHRISTMAS TREE 
3127     23225      CHERUB HEART DECORATION SILVER 
1495    90162D      ANT SILVER PURPLE BOUDICCA RING
3052     23102       SILVER HEARTS TABLE DECORATION
2140     20826                SILVER APERITIF GLASS
1494    90162B  ANT SILVER LIME GREEN BOUDICCA RING


In [63]:
print(recommendations)

     StockCode                          Description
2       84406B       CREAM CUPID HEARTS COAT HANGER
920      22136               LOVE HEART SOCK HANGER
839      22639        SET OF 4 NAPKIN CHARMS HEARTS
1803     21812        GARLAND WITH HEARTS AND BELLS
1456     85078    SCANDINAVIAN 3 HEARTS NAPKIN RING
541      21190            PINK HEARTS PAPER GARLAND
3655     23562  SET OF 6 RIBBONS PERFECTLY PRETTY  
631      22173   METAL 4 HOOK HANGER FRENCH CHATEAU
3455     23630   SET 10 CARDS HANGING BAUBLES 17080
397      22147           FELTCRAFT BUTTERFLY HEARTS
