In [1]:
import json
import numpy as np

### Load dataset

In [2]:
with open('dataset/smithsonian.json') as f:
    articles = json.load(f)

In [3]:
# Extract tags from articles
tags = []
for article in articles:
    tmp_arr = []
    article_tags = article.get('tags', '') if article.get('tags') else ''
    article_section = article.get('section', '') if article.get('section') else ''
    raw_tags = article_tags + ',' + article_section
    for tag in raw_tags.split(','):
        normalised_tag = tag.strip().lower()
        if normalised_tag != '':
            tmp_arr.append(normalised_tag)
    tags.append(' '.join(tmp_arr))

### Approach 1: TF-IDF

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Convert the tags to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tags)

In [5]:
# Function to generate recommendations for a user given a list of tags using TF-IDF
def recommend(user_preference_tags, num_recommendations=5):
    user_preference_string = ', '.join(user_preference_tags)
    user_preference_vector = vectorizer.transform([user_preference_string])
    similarities = cosine_similarity(user_preference_vector, tfidf_matrix)[0] # Cosine similarity return a list inside a list. We only need the first list.
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

In [48]:
print(recommend(['art', 'history'], 5))

['How the U.S. Government Deployed Grandma Moses Overseas in the Cold War', 'With AI Art, Process Is More Important Than the Product', "The Striking New Artworks That Follow Rockefeller Center's Grand Tradition of Public Art", 'The Story of Charles Willson Peale’s Massive Mastodon', 'These Portraits Made a Bold Statement in 19th-Century America']


### Approach 2: Embeddings

In [7]:
import torch
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a text
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt')
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

In [14]:
# Function to generate recommendations for a user given a list of tags
def recommend(user_tags, article_tags, num_recommendations=5):

    user_tags_string = ', '.join(user_tags)
    user_tags_embeddings = get_bert_embeddings(user_tags_string)
    article_tags_string = [' '.join(tags) for tags in article_tags]
    article_embeddings = [get_bert_embeddings(tags) for tags in article_tags_string]

    similarities = [
        cosine_similarity(user_tags_embeddings, article_embedding)[0][0] for article_embedding in article_embeddings
    ]
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

In [18]:
recommend(['art', 'history'], tags[:100], 5)

['How Time, Space and Authority Figures Influence Your Moral Judgment',
 "The World's Best Natural Defense Against Climate Change May Soon Make Things Worse",
 'Americans Are Eating Later, and That May Contribute to Weight Troubles',
 'Biomedical Science Studies Are Shockingly Hard to Reproduce',
 "Here's Why Our Brains Trick Us Into Seeing Things"]