In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Load Instacart dataset
orders = pd.read_csv('../../CSVs/orders.csv')
order_products = pd.read_csv('../../CSVs/order_products__train.csv')
products = pd.read_csv('../../CSVs/products.csv')

# Merge order datasets with necessary columns only
merged_orders = pd.merge(orders[['order_id', 'user_id']], order_products[['order_id', 'product_id']], on='order_id')
merged_df = pd.merge(merged_orders, products[['product_id', 'product_name']], on='product_id')

# Reducing the dataset size by selecting top N most purchased products and most active users
top_n_products = 1000 
top_n_users = 1000

# Filter the dataset for top products and users
top_products = merged_df['product_name'].value_counts().head(top_n_products).index
top_users = merged_df['user_id'].value_counts().head(top_n_users).index

reduced_df = merged_df[merged_df['product_name'].isin(top_products) & merged_df['user_id'].isin(top_users)]

print(reduced_df.head(30))


      order_id  user_id  product_id              product_name
14     1100193     1952         196                      Soda
97     2135355    12873         196                      Soda
397      15493    52336         196                      Soda
437     818852    58237         196                      Soda
446    3337473    58933         196                      Soda
453    2554229    59752         196                      Soda
577    3202695    76472         196                      Soda
704     578904    94268         196                      Soda
825    1888301   111713         196                      Soda
869    2207390   116342         196                      Soda
909    1089943   122977         196                      Soda
1077   3223024   146413         196                      Soda
1246   1574405   170974         196                      Soda
1437   2488407   194980         196                      Soda
1502    584866   204975         196                      Soda
1764   1

In [2]:
import numpy as np

# Create a user-item interaction matrix
interaction_matrix = reduced_df.pivot_table(index='user_id', columns='product_name', aggfunc='count', fill_value=0)

# Convert to a sparse matrix for efficiency
user_item_matrix = csr_matrix(interaction_matrix.values)

# Calculate cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)

# Convert similarity matrix to DataFrame for better handling
user_similarity_df = pd.DataFrame(user_similarity, index=interaction_matrix.index, columns=interaction_matrix.index)

# Function to get top N similar users for a given user
def get_similar_users(user_id, user_similarity_df, n=5):
    if user_id not in user_similarity_df.columns:
        return None, "User not in dataset."
    else:
        # Sort by similarity score
        similar_users = user_similarity_df[user_id].sort_values(ascending=False)
        # Exclude the user itself
        similar_users = similar_users[similar_users.index != user_id]
        return similar_users.head(n).index.tolist()

# Example: Get top 5 similar users for a specific user
similar_users = get_similar_users(1952, user_similarity_df)  # Replace 12345 with a valid user_id
print(similar_users)

# Function to recommend products
def recommend_products(user_id, user_similarity_df, interaction_matrix, n=5):
    similar_users = get_similar_users(user_id, user_similarity_df, n)
    if similar_users is None:
        return []
    
    # Get products bought by similar users
    similar_users_products = interaction_matrix.loc[similar_users]
    # Sum up the interactions
    product_scores = similar_users_products.sum(axis=0)
    # Exclude products already bought by the user
    bought_products = interaction_matrix.loc[user_id]
    product_scores = product_scores[bought_products == 0]

    # Get top N products
    recommendations = product_scores.sort_values(ascending=False).head(n).index.tolist()
    return recommendations

# Example: Recommend products for a specific user
recommended_products = recommend_products(1952, user_similarity_df, interaction_matrix)  # Replace 12345 with a valid user_id
print(recommended_products)


[82140, 54566, 130509, 186092, 79219]
[('product_id', 'Large Lemon'), ('order_id', 'Banana'), ('product_id', 'Banana'), ('order_id', 'Large Lemon'), ('product_id', 'Carrots')]


Building a Content Based Recommendation System for user's purchases on a grocery store.

In [3]:
import pandas as pd

# Load Instacart dataset
orders = pd.read_csv('../../CSVs/orders.csv')
order_products = pd.read_csv('../../CSVs/order_products__train.csv')
products = pd.read_csv('../../CSVs/products.csv')

In [4]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [5]:
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [6]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [7]:
# Clean the orders dataset
orders = orders.drop(['eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'], axis=1)
orders.head()

Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1


In [8]:
# Clean the order_products dataset
order_products = order_products.drop('add_to_cart_order', axis=1)
order_products.head()

Unnamed: 0,order_id,product_id,reordered
0,1,49302,1
1,1,11109,1
2,1,10246,0
3,1,49683,0
4,1,43633,1


In [9]:
# Clean the products dataset
products = products.drop(['aisle_id','department_id'], axis=1)
products.head()

Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,2,All-Seasons Salt
2,3,Robust Golden Unsweetened Oolong Tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
4,5,Green Chile Anytime Sauce


In [10]:
# Merge order datasets with necessary columns only
merged_orders = pd.merge(orders[['order_id', 'user_id']], order_products[['order_id', 'product_id']], on='order_id')
merged_df = pd.merge(merged_orders, products[['product_id', 'product_name']], on='product_id')

merged_df.head()

Unnamed: 0,order_id,user_id,product_id,product_name
0,1187899,1,196,Soda
1,2757217,67,196,Soda
2,632715,676,196,Soda
3,1167274,760,196,Soda
4,3347074,804,196,Soda


In [11]:
# Reducing the dataset size by selecting top N most purchased products and most active users
top_n_products = 1000
top_n_users = 1000

# Filter the dataset for top products and users
top_products = merged_df['product_name'].value_counts().head(top_n_products).index
top_users = merged_df['user_id'].value_counts().head(top_n_users).index

reduced_df = merged_df[merged_df['product_name'].isin(top_products) & merged_df['user_id'].isin(top_users)]
reduced_df.head()

Unnamed: 0,order_id,user_id,product_id,product_name
14,1100193,1952,196,Soda
97,2135355,12873,196,Soda
397,15493,52336,196,Soda
437,818852,58237,196,Soda
446,3337473,58933,196,Soda


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words='english')

reduced_df['product_name'] = reduced_df['product_name'].fillna('')
reduced_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_df['product_name'] = reduced_df['product_name'].fillna('')


Unnamed: 0,order_id,user_id,product_id,product_name
14,1100193,1952,196,Soda
97,2135355,12873,196,Soda
397,15493,52336,196,Soda
437,818852,58237,196,Soda
446,3337473,58933,196,Soda


In [13]:
tfidf_matrix = vectorizer.fit_transform(reduced_df['product_name'])
tfidf_matrix

<20702x3753 sparse matrix of type '<class 'numpy.float64'>'
	with 137450 stored elements in Compressed Sparse Row format>

In [14]:
tfidf_matrix.shape

(20702, 3753)

In [15]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
sig[0]

array([0.76170604, 0.76170604, 0.76170604, ..., 0.76159416, 0.76159416,
       0.76159416])

In [17]:
# Reverse mapping of indices and product names
indices = pd.Series(merged_df.index, index=merged_df['product_name']).drop_duplicates()

In [18]:
indices

product_name
Soda                                                                 0
Soda                                                                 1
Soda                                                                 2
Soda                                                                 3
Soda                                                                 4
                                                                ...   
Chewy Reduced Sugar Granola Bars Variety Pack                  1384612
Plain Flavor Probiotic Acidophilus                             1384613
100% Juice, Rio Red Grapefruit                                 1384614
Puppy Complete Nutrition Chicken & Beef Dinner Wet Dog Food    1384615
Organic Aromatherapeutic Moroccan Argan Oil Set                1384616
Length: 1384617, dtype: int64

In [19]:
import pandas as pd
import numpy as np

def give_rec(title, sig, indices, reduced_df):
    # Check if the title exists in the indices
    if title not in indices:
        return "Product not in dataset."
    
    # Get the index of the product that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all products with that product
    # and convert it into a list of tuples as described above
    sig_scores = list(enumerate(sig[idx]))
    
    # Sort the products based on the similarity scores
    # Here, you sort the list of tuples based on the similarity scores which is the second element of the tuple.
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    
    # Scores of the 10 most similar products (ignoring the first product itself)
    sig_scores = sig_scores[1:11]
    
    # Product indices
    # Here, you retrieve the indices of these similar products from the list of tuples.
    product_indices = [i[0] for i in sig_scores]
    
    # Top 10 most similar products
    # This fetches the product names using the pandas DataFrame iloc method. Since 'product_indices' is a list of numerical indices,
    # iloc will select the product names that are at these indices in the DataFrame.
    similar_products = reduced_df['product_name'].iloc[product_indices]
    
    return similar_products

# Assuming sig is already computed as a 2D numpy array of similarity scores
# and indices is a pandas Series mapping product names to index
# and reduced_df is a DataFrame with a 'product_name' column

# You can call the function like this:
# product_recommendations = give_rec('Soda', sig, indices, reduced_df)
# print(product_recommendations)


In [20]:
import pandas as pd
import numpy as np

def precision_at_k(actual, predicted, k):
    # Calculate Precision at K
    if len(predicted) == 0:
        return 0
    else:
        predicted = predicted[:k]
        return len(set(actual) & set(predicted)) / k

def recall_at_k(actual, predicted, k):
    # Calculate Recall at K
    if len(actual) == 0:
        return 0
    else:
        predicted = predicted[:k]
        return len(set(actual) & set(predicted)) / len(actual)

def mean_average_precision(actual, predicted):
    # Calculate Mean Average Precision (MAP)
    avg_precision = 0
    num_relevant = 0

    for i, p in enumerate(predicted):
        if p in actual:
            num_relevant += 1
            precision = num_relevant / (i + 1)
            avg_precision += precision
    
    if num_relevant == 0:
        return 0
    else:
        return avg_precision / num_relevant

def normalized_discounted_cumulative_gain(actual, predicted, k):
    # Calculate Normalized Discounted Cumulative Gain (NDCG) at K
    dcg = 0
    idcg = 0

    for i, p in enumerate(predicted[:k]):
        if p in actual:
            dcg += 1 / np.log2(i + 2)
        
    for i in range(min(k, len(actual))):
        idcg += 1 / np.log2(i + 2)
    
    if idcg == 0:
        return 0
    else:
        return dcg / idcg

def hit_rate(actual, predicted, k):
    # Calculate Hit Rate at K (binary metric)
    predicted = predicted[:k]
    return int(any(item in predicted for item in actual))

def mean_reciprocal_rank(actual, predicted):
    # Calculate Mean Reciprocal Rank (MRR)
    for i, p in enumerate(predicted):
        if p in actual:
            return 1 / (i + 1)
    return 0

# Example usage:
actual = [82140, 54566, 130509]  # Ground truth - relevant products
predicted = [82140, 54566, 130509, 12345, 67890]  # Recommended products
k = 5  # Value of K for evaluation

# Calculate metrics
precision = precision_at_k(actual, predicted, k)
recall = recall_at_k(actual, predicted, k)
map_score = mean_average_precision(actual, predicted)
ndcg = normalized_discounted_cumulative_gain(actual, predicted, k)
hit = hit_rate(actual, predicted, k)
mrr = mean_reciprocal_rank(actual, predicted)

# Print metrics
print(f"Precision@{k}: {precision}")
print(f"Recall@{k}: {recall}")
print(f"MAP: {map_score}")
print(f"NDCG@{k}: {ndcg}")
print(f"Hit Rate@{k}: {hit}")
print(f"MRR: {mrr}")


Precision@5: 0.6
Recall@5: 1.0
MAP: 1.0
NDCG@5: 1.0
Hit Rate@5: 1
MRR: 1.0
