In [1]:
import json
import numpy as np

### Load dataset

In [2]:
# JSON text is UTF-8 by specification. Pass the encoding explicitly so the
# load does not depend on the platform's default locale encoding (article
# titles contain non-ASCII characters such as typographic apostrophes).
with open('dataset/smithsonian.json', encoding='utf-8') as f:
    articles = json.load(f)

In [3]:
# Build one space-separated string of normalised tags per article.
# The comma-separated 'tags' field and the 'section' field are merged;
# a missing or empty field contributes no tags.
tags = []
for article in articles:
    fields = (article.get('tags') or '', article.get('section') or '')
    # Split on commas, then trim whitespace and lower-case every piece.
    cleaned = [piece.strip().lower() for piece in ','.join(fields).split(',')]
    # Drop the empty pieces left behind by blank fields or stray commas.
    tags.append(' '.join(piece for piece in cleaned if piece != ''))

### Approach 1: TF-IDF

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
# Convert the tags to a matrix of TF-IDF features.
# `tags` holds one tag string per article, so row i of `tfidf_matrix`
# corresponds to the article at index i in `articles`. The fitted
# `vectorizer` is kept so user queries can later be encoded with the same
# vocabulary via `vectorizer.transform`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tags)

In [57]:
# Function to generate recommendations for a user given a list of tags using TF-IDF
def tfidf_recommend(user_preference_tags, num_recommendations=5):
    """Recommend article titles whose tags best match the given tags (TF-IDF).

    Args:
        user_preference_tags: Iterable of tag strings describing the user's
            interests.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of at most ``num_recommendations`` article titles, ordered by
        ascending cosine similarity (the best match is last). Empty when
        ``num_recommendations`` is zero or negative.
    """
    # Guard: with 0 the slice below would be ``[-0:]`` == ``[0:]`` and return
    # every article; a negative count would slice from the wrong end.
    if num_recommendations <= 0:
        return []
    user_preference_string = ', '.join(user_preference_tags)
    user_preference_vector = vectorizer.transform([user_preference_string])
    # cosine_similarity returns a 2-D array with one row per query vector.
    # Only one query is passed, so row 0 holds the score for every article.
    similarities = cosine_similarity(user_preference_vector, tfidf_matrix)[0]
    # argsort is ascending, so the highest-scoring articles sit at the end.
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

In [58]:
def tfidf_recommend_weighted(user_model, num_recommendations=5):
    """Recommend article titles using TF-IDF with per-tag weights.

    Each tag is repeated ``round(weight * 100)`` times in the query string, so
    a higher weight gives the tag a proportionally higher term frequency.

    Args:
        user_model: Iterable of dicts with a ``'tag'`` string and a
            ``'weight'`` that ``float()`` can parse.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of at most ``num_recommendations`` article titles, ordered by
        ascending cosine similarity (the best match is last). Empty when
        ``num_recommendations`` is zero or negative.
    """
    # Guard: with 0 the slice below would be ``[-0:]`` == ``[0:]`` and return
    # every article; a negative count would slice from the wrong end.
    if num_recommendations <= 0:
        return []
    weighted_tags = []
    for entry in user_model:
        # round(), not int(): binary float error makes e.g. 0.29 * 100 equal
        # 28.999999999999996, which int() would silently truncate to 28.
        repetitions = round(float(entry['weight']) * 100)
        weighted_tags.extend([entry['tag']] * repetitions)
    user_preference_string = ', '.join(weighted_tags)
    user_preference_vector = vectorizer.transform([user_preference_string])
    # One query vector in, so row 0 holds the score for every article.
    similarities = cosine_similarity(user_preference_vector, tfidf_matrix)[0]
    # argsort is ascending, so the highest-scoring articles sit at the end.
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

### Approach 2: Embeddings

In [9]:
import torch
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to get BERT embeddings for a text
def get_bert_embeddings(text):
    """Embed ``text`` as the mean of BERT's last-layer token vectors.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding ``last_hidden_state`` averaged over dim 1
        (the token axis). For a single string this is presumably shaped
        (1, hidden_size) -- callers index the result as a 2-D array.
    """
    # truncation=True clips over-long inputs to the model's maximum sequence
    # length instead of letting them fail inside the model; short inputs are
    # unaffected.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

#### Create embeddings from dataset

In [20]:
# Copy every article dict, not just the outer list: an "embedding" key is
# added to each record afterwards, and a plain list.copy() would share the
# dicts with `articles` and silently mutate the originals as well.
articles_with_embeddings = [dict(article) for article in articles]

In [21]:
from tqdm import tqdm
# Create BERT embeddings for all articles.
# tqdm wraps the paired iterable directly; `total` is given explicitly
# because zip() has no length for the progress bar to read.
progress = tqdm(zip(tags, articles_with_embeddings), total=len(articles_with_embeddings))
for tag_string, record in progress:
    # Stored as nested lists so the record stays JSON-serialisable.
    record["embedding"] = get_bert_embeddings(tag_string).tolist()

100%|██████████| 2200/2200 [00:46<00:00, 47.13it/s]


In [22]:
# Persist the articles, now carrying their embeddings, as JSON.
with open('dataset/smithsonian_with_embeddings.json', mode='w') as embeddings_file:
    json.dump(articles_with_embeddings, fp=embeddings_file)

### Generate recommendations

In [47]:
def get_weighted_bert_embeddings(tags, weights):
    """Combine per-tag BERT embeddings into a single weighted-sum embedding.

    Args:
        tags: Iterable of tag strings.
        weights: Iterable of weights paired positionally with ``tags``; each
            may be a number or a numeric string.

    Returns:
        The sum of ``get_bert_embeddings(tag) * weight`` over all pairs. With
        no pairs the built-in ``sum`` returns the integer 0.
    """
    weighted_embeddings = [
        # float() mirrors tfidf_recommend_weighted, which also accepts weights
        # given as numeric strings; multiplying an array by a str would fail.
        get_bert_embeddings(tag) * float(weight)
        for tag, weight in zip(tags, weights)
    ]
    return sum(weighted_embeddings)

In [60]:
# Function to generate recommendations for a user given a list of tags
def embeddings_recommend_weighted(user_model, article_embeddings, num_recommendations=5):
    """Recommend article titles using weighted BERT tag embeddings.

    Args:
        user_model: Iterable of dicts with a ``'tag'`` string and a
            ``'weight'``.
        article_embeddings: Sequence of article embeddings, index-aligned
            with ``articles``; each is expected to be a single-row 2-D array.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of at most ``num_recommendations`` article titles, ordered by
        ascending cosine similarity (the best match is last). Empty when
        ``num_recommendations`` is zero or negative.
    """
    # Guard: with 0 the slice below would be ``[-0:]`` == ``[0:]`` and return
    # every article; a negative count would slice from the wrong end.
    # Returning early also avoids running BERT for nothing.
    if num_recommendations <= 0:
        return []

    preference_tags = [x['tag'] for x in user_model]
    weights = [x['weight'] for x in user_model]
    user_tags_embeddings = get_weighted_bert_embeddings(preference_tags, weights)

    # cosine_similarity returns a 2-D array; with one row on each side the
    # single score is at [0][0].
    similarities = [
        cosine_similarity(user_tags_embeddings, article_embedding)[0][0] for article_embedding in article_embeddings
    ]
    # argsort is ascending, so the highest-scoring articles sit at the end.
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

In [61]:
def embeddings_recommend(user_tags, article_embeddings, num_recommendations=5):
    """Recommend article titles using a BERT embedding of the joined tags.

    Args:
        user_tags: Iterable of tag strings; they are joined with spaces and
            embedded as one text.
        article_embeddings: Sequence of article embeddings, index-aligned
            with ``articles``; each is expected to be a single-row 2-D array.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of at most ``num_recommendations`` article titles, ordered by
        ascending cosine similarity (the best match is last). Empty when
        ``num_recommendations`` is zero or negative.
    """
    # Guard: with 0 the slice below would be ``[-0:]`` == ``[0:]`` and return
    # every article; a negative count would slice from the wrong end.
    # Returning early also avoids running BERT for nothing.
    if num_recommendations <= 0:
        return []
    user_tags_embeddings = get_bert_embeddings(' '.join(user_tags))
    # cosine_similarity returns a 2-D array; with one row on each side the
    # single score is at [0][0].
    similarities = [
        cosine_similarity(user_tags_embeddings, article_embedding)[0][0] for article_embedding in article_embeddings
    ]
    # argsort is ascending, so the highest-scoring articles sit at the end.
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

### Evaluate results

In [None]:
# Reload the saved articles together with their stored embeddings.
with open('dataset/smithsonian_with_embeddings.json', mode='r') as embeddings_file:
    articles_with_embeddings = json.load(fp=embeddings_file)

In [24]:
# Turn each stored embedding (nested lists after the JSON round-trip) back
# into a NumPy array, keeping the same order as the article records.
article_embeddings = []
for record in articles_with_embeddings:
    article_embeddings.append(np.array(record["embedding"]))

In [64]:
# Example user profile: (tag, weight) pairs expanded into the
# {"tag": ..., "weight": ...} records read by the weighted recommenders.
user_model = [
    {"tag": tag_name, "weight": tag_weight}
    for tag_name, tag_weight in (
        ("architecture", 0.5),
        ("history", 0.3),
        ("art", 0.9),
        ("artificial intelligence", 0.1),
    )
]
# Unweighted view of the same profile: just the tag names, in order.
user_tags = [entry["tag"] for entry in user_model]

In [65]:
# TF-IDF without weights: every tag in the profile counts equally.
tfidf_recommend(user_tags, num_recommendations=5)

['The Computer Scientist Who Wants to Put a Name to Every Face in Civil War Photographs',
 'Can Artificial Intelligence Help Stop School Shootings?',
 "A New Encyclopedia Explores Europe's Smelly History",
 'With a Little Help From A.I., the Dali Museum Brings the Famed Surrealist to Life',
 'With AI Art, Process Is More Important Than the Product']

In [66]:
# TF-IDF with the per-tag weights from the user model.
tfidf_recommend_weighted(user_model, num_recommendations=5)

['These Portraits Made a Bold Statement in 19th-Century America',
 'These Wild Sculptures Could Bring Sustainable Energy to the Desert',
 "The Striking New Artworks That Follow Rockefeller Center's Grand Tradition of Public Art",
 'With AI Art, Process Is More Important Than the Product',
 'Why Museums Don’t Need Gleaming New Buildings, Especially Not in Los Angeles']

In [67]:
# BERT embeddings without weights: all tags are embedded as one joined text.
embeddings_recommend(user_tags, article_embeddings, num_recommendations=5)

['The Medieval Origin Story of the Balcony',
 'Could We Chat With Whales?',
 'With AI Art, Process Is More Important Than the Product',
 'From Turrets to Toilets: A Partial History of the Throne Room',
 'The Computer Scientist Who Wants to Put a Name to Every Face in Civil War Photographs']

In [68]:
# BERT embeddings with weights: per-tag embeddings are scaled and summed.
embeddings_recommend_weighted(user_model, article_embeddings, num_recommendations=5)

['Bye Bye Cassini, the Tenacious Space Probe That Revealed Saturn’s Secrets',
 'Ancient Cities Lost to the Seas',
 'The Path to Being a Scientist Doesn’t Have to Be So Narrow',
 'In Groundbreaking Find, Three Kinds of Early Humans Unearthed Living Together in South Africa',
 'Ancient Greece Springs to Life']