In [31]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
with open('dataset/smithsonian.json') as f:
    articles = json.load(f)

In [33]:
# Extract tags from articles
tags = []
for article in articles:
    tmp_arr = []
    article_tags = article.get('tags', '') if article.get('tags') else ''
    article_section = article.get('section', '') if article.get('section') else ''
    raw_tags = article_tags + ',' + article_section
    for tag in raw_tags.split(','):
        normalised_tag = tag.strip().lower()
        if normalised_tag != '':
            tmp_arr.append(normalised_tag)
    tags.append(' '.join(tmp_arr))

In [34]:
# Convert the tags to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tags)

In [47]:
# Function to generate recommendations for a user given a list of tags using TF-IDF
def recommend(user_preference_tags, num_recommendations=5):
    user_preference_string = ', '.join(user_preference_tags)
    user_preference_vector = vectorizer.transform([user_preference_string])
    similarities = cosine_similarity(user_preference_vector, tfidf_matrix)[0] # Cosine similarity return a list inside a list. We only need the first list.
    sorted_article_indices = np.argsort(similarities)
    top_article_indices = sorted_article_indices[-num_recommendations:]
    return [articles[i]["title"] for i in top_article_indices]

In [48]:
print(recommend(['art', 'history'], 5))

['How the U.S. Government Deployed Grandma Moses Overseas in the Cold War', 'With AI Art, Process Is More Important Than the Product', "The Striking New Artworks That Follow Rockefeller Center's Grand Tradition of Public Art", 'The Story of Charles Willson Peale’s Massive Mastodon', 'These Portraits Made a Bold Statement in 19th-Century America']
