In [None]:
%pip install -q ipywidgets
%pip install pyclustering
%pip install -U sentence-transformers

In [3]:
import json
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

In [4]:
from sentence_transformers import SentenceTransformer, util
import json
import numpy as np

# Read the article corpus that the rest of this cell scores.
with open("articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# Sentence encoder shared by the prototype phrases and the articles.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One short descriptive phrase per label. An article is later assigned the
# label whose phrase embedding has the highest cosine similarity to it.
bias_prototypes = dict([
    ("very left", "extremely progressive liberal views"),
    ("left", "liberal progressive opinions"),
    ("center", "balanced centrist views"),
    ("right", "conservative traditional opinions"),
    ("very right", "extremely conservative views"),
])
sentiment_prototypes = dict([
    ("positive", "excellent amazing positive good"),
    ("neutral", "average normal indifferent"),
    ("negative", "poor bad negative terrible"),
])


def _embed_prototypes(prototypes):
    """Encode each prototype phrase with ``model``, keyed by its label."""
    encoded = {}
    for name, phrase in prototypes.items():
        encoded[name] = model.encode(phrase, convert_to_tensor=True)
    return encoded


bias_proto_embeddings = _embed_prototypes(bias_prototypes)
sentiment_proto_embeddings = _embed_prototypes(sentiment_prototypes)

def _score_against(embedding, proto_embeddings):
    """Return ``{label: cosine similarity}`` of one embedding vs. each prototype."""
    scores = {}
    for label, proto in proto_embeddings.items():
        scores[label] = util.cos_sim(embedding, proto).item()
    return scores


analysis_results, article_embeddings = [], []

for article in articles:
    # The excerpt is optional; the title is required.
    text = article["title"] + ". " + article.get("excerpt", "")
    embedding = model.encode(text, convert_to_tensor=True)
    # Store a plain nested list so the embedding can be written as JSON.
    article_embeddings.append(embedding.cpu().numpy().tolist())

    bias_scores = _score_against(embedding, bias_proto_embeddings)
    sentiment_scores = _score_against(embedding, sentiment_proto_embeddings)

    # Pick the best-scoring label; on ties the first label in dict order wins.
    assigned_bias = max(bias_scores.items(), key=lambda pair: pair[1])[0]
    assigned_sentiment = max(sentiment_scores.items(), key=lambda pair: pair[1])[0]

    record = {
        "article_id": article["_id"],
        "title": article["title"],
        "assigned_bias": assigned_bias,
        "assigned_sentiment": assigned_sentiment,
        "bias_scores": bias_scores,
        "sentiment_scores": sentiment_scores,
    }
    analysis_results.append(record)

# Persist both artefacts; the later cells read them back from disk.
for out_path, payload in (
    ("analysis.json", analysis_results),
    ("article_embeddings.json", article_embeddings),
):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

In [None]:
import random
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
import json
import numpy as np

# Tunables for the clustering run.
N_CLUSTERS = 5   # requested number of clusters
SEED = 42        # fixes the choice of initial centres so reruns give the same clusters

with open("article_embeddings.json", "r", encoding="utf-8") as file:
    embeddings = json.load(file)

with open("articles.json", "r", encoding="utf-8") as file:
    articles = json.load(file)

data = np.array(embeddings).tolist()

# Cluster members are row indices into `data`, and they are mapped back to
# articles by position below, so both files must describe the same articles.
if not data:
    raise ValueError("article_embeddings.json contains no embeddings to cluster")
if len(data) != len(articles):
    raise ValueError(
        f"article_embeddings.json has {len(data)} rows but articles.json has "
        f"{len(articles)} articles; regenerate the embeddings"
    )

# random.sample raises if asked for more items than exist, so cap the
# cluster count at the corpus size.
n_clusters = min(N_CLUSTERS, len(data))

# A dedicated, seeded generator keeps the initial centres (and therefore the
# clustering written to clusters.json) identical across kernel restarts
# without touching the global `random` state.
initial_centers = random.Random(SEED).sample(data, n_clusters)
kmeans_instance = kmeans(data, initial_centers)
kmeans_instance.process()

clusters = kmeans_instance.get_clusters()
cluster_centers = kmeans_instance.get_centers()

# Translate each cluster's row indices into the corresponding article ids.
cluster_mapping = {}
for idx, cluster in enumerate(clusters):
    cluster_ids = [articles[i]["_id"] for i in cluster]
    cluster_mapping[f"cluster_{idx}"] = cluster_ids

with open("clusters.json", "w", encoding="utf-8") as f:
    json.dump(cluster_mapping, f, indent=2)

In [None]:
# Page header: title plus a link to the paper being recreated.
title = widgets.HTML(value="<h1>Recreation of DiversiNews</h1>")
subtitle = widgets.HTML(value='<h3>Paper: <a href="https://dl.acm.org/doi/abs/10.14778/3685800.3685854" target="_blank">Link to Paper</a></h3>')
heading = widgets.VBox(children=[title, subtitle])


def _read_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Inputs produced by the earlier cells of this notebook.
articles = _read_json("./articles.json")
analysis = _read_json("./analysis.json")
clusters = _read_json("./clusters.json")

# domains_file = "./domains.json"
# with open(domains_file, "r", encoding="utf-8") as f:
#     domains = json.load(f)

# Index the analysis records by article id for direct lookup.
analysis_dict = dict((entry["article_id"], entry) for entry in analysis)

# Attach the analysis fields to each article. Articles without an analysis
# record are passed through as the same dict object (not copied).
merged_articles = []
for article in articles:
    art_id = article["_id"]
    merged_articles.append(
        {**article, **analysis_dict[art_id]} if art_id in analysis_dict else article
    )

# Bias label -> position on the -2..2 slider scale.
# "center" and "neutral" both map to the middle position.
bias_mapping = {
    "very left": -2, "left": -1,
    "center": 0, "neutral": 0,
    "right": 1, "very right": 2,
}
for art in merged_articles:
    # Missing label is treated as "center"; unknown labels fall back to 0.
    art["bias_numeric"] = bias_mapping.get(art.get("assigned_bias", "center").lower(), 0)

# def on_topic_button_click(b):
#     selected_topic_container["value"] = b.description.split()[0]
#     print(selected_topic_container["value"])
#     update_articles(None)

# topic_buttons = []
# for cluster_name, article_ids in clusters.items():
#     # Format the button description as "cluster_0 (N)" where N is the number of articles.
#     btn = widgets.Button(
#         description=f"{cluster_name} ({len(article_ids)})",
#         layout=widgets.Layout(width="200px")
#     )
#     btn.on_click(on_topic_button_click)
#     print(f"Button {btn.description} bound to click event.")
#     topic_buttons.append(btn)
# show_all_btn = widgets.Button(description="Show All Topics", layout=widgets.Layout(width="200px"))
# topic_box = widgets.HBox([show_all_btn] + topic_buttons)

# selected_topic_container = {"value": None}

# Caption for the slider.
label_widget = widgets.HTML(value="<h4>Search by political leaning::</h4>")

# Five integer positions, matching the -2..2 values in bias_mapping.
bias_slider = widgets.IntSlider(
    value=0, min=-2, max=2, step=1,
    continuous_update=False,
    readout=False,
    layout=widgets.Layout(width='400px'),
)

# End-of-scale captions; each label gets its own Layout instance.
left_label, right_label = (
    widgets.Label(value=caption, layout=widgets.Layout(width='60px'))
    for caption in ("very left", "very right")
)
slider_with_labels = widgets.HBox(children=[left_label, bias_slider, right_label])

# Vertical gap between the slider row and the article list.
spacer = widgets.HTML(value="<div style='margin-top: 20px;'></div>")

# sentiment_descriptor_label = widgets.Label(value=get_sentiment_descriptor(sentiment_slider.value))
# widgets.HTML(
#     value=f"<b>{get_sentiment_descriptor(sentiment_slider.value)}</b>"
# )

# Output area that update_articles renders into.
articles_output = widgets.Output()

def update_articles(change):
    """Re-render the ten articles whose bias is closest to the slider value.

    Args:
        change: Ignored. Present so the function can be registered with
            ``bias_slider.observe`` and also called directly.

    Side effects:
        Clears ``articles_output`` and displays one HTML card per article in it.
    """
    articles_output.clear_output()
    current_bias = bias_slider.value

    # sorted() is stable, so articles at the same distance keep their
    # original order from merged_articles.
    sorted_articles = sorted(merged_articles, key=lambda a: abs(a["bias_numeric"] - current_bias))
    top_articles = sorted_articles[:10]

    with articles_output:
        for article in top_articles:
            # Render the loop's own article. The previous body read the
            # unrelated global `art`, which repeated one article ten times.
            # NOTE(review): values are interpolated into HTML unescaped.
            display(HTML(
                f"<div style='padding-bottom:20px;'>"
                f"  <div><strong>{article['title']}</strong></div>"
                f"  <div><a href='{article['_id']}' target='_blank'>{article['_id']}</a></div>"
                # The excerpt is optional (the analysis cell also treats it so).
                f"  <div>{article.get('excerpt', '')}</div>"
                f"  <div style='font-size:small; color:gray;'>"
                f"    Assigned Leaning: {article.get('assigned_bias', 'center').capitalize()}, "
                f"Sentiment: {article.get('assigned_sentiment', 'neutral').capitalize()}"
                f"  </div>"
                f"</div>"
            ))


# Re-render the list whenever the slider's `value` trait changes.
bias_slider.observe(update_articles, names="value")

# def on_show_all_click(b):
#     selected_topic_container["value"] = None
#     update_articles(None)
    
# show_all_btn.on_click(on_show_all_click)

# Initial render so the list is populated before the first interaction.
update_articles(dict(new=bias_slider.value))

# Assemble the page top to bottom and show it.
ui = widgets.VBox(children=[
    heading,
    # topic_box,
    slider_with_labels,
    spacer,
    articles_output,
])
display(ui)

In [None]:
import json
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

def _read_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Inputs produced by the earlier cells of this notebook.
articles = _read_json("./articles.json")
analysis = _read_json("./analysis.json")
clusters = _read_json("./clusters.json")

# Index the analysis records by article id for direct lookup.
analysis_dict = dict((entry["article_id"], entry) for entry in analysis)

# Attach the analysis fields to each article. Articles without an analysis
# record are passed through as the same dict object (not copied).
merged_articles = []
for article in articles:
    art_id = article["_id"]
    merged_articles.append(
        {**article, **analysis_dict[art_id]} if art_id in analysis_dict else article
    )

# Bias label -> position on the -2..2 slider scale.
# "center" and "neutral" both map to the middle position.
bias_mapping = {
    "very left": -2, "left": -1,
    "center": 0, "neutral": 0,
    "right": 1, "very right": 2,
}
for art in merged_articles:
    # Missing label is treated as "center"; unknown labels fall back to 0.
    art["bias_numeric"] = bias_mapping.get(art.get("assigned_bias", "center").lower(), 0)

# Page header: title plus a link to the paper being recreated.
title = widgets.HTML(value="<h1>Recreation of DiversiNews</h1>")
subtitle = widgets.HTML(value='<h3>Paper: <a href="https://dl.acm.org/doi/abs/10.14778/3685800.3685854" target="_blank">Link to Paper</a></h3>')
heading = widgets.VBox(children=[title, subtitle])

# predefined_topics = {
#     "Politics": ["government", "policy", "political", "election"],
#     "Sports": ["sports", "game", "team", "athlete"],
#     "Environment": ["climate", "environment", "nature", "sustainability"],
#     "Technology": ["tech", "innovation", "science", "digital"],
# }

# # Function to assign a topic based on predefined categories
# def assign_predefined_topic(cluster_articles):
#     texts = " ".join([art["title"] + " " + art["excerpt"] for art in cluster_articles]).lower()
#     topic_scores = {topic: sum(texts.count(word) for word in keywords) for topic, keywords in predefined_topics.items()}
#     return max(topic_scores, key=topic_scores.get)

# # Assign topics to each cluster
# cluster_topics = {}
# for cluster_name, article_ids in clusters.items():
#     cluster_articles = [art for art in merged_articles if art["_id"] in article_ids]
#     cluster_topics[cluster_name] = assign_predefined_topic(cluster_articles)

from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords(cluster_articles, top_n=1):
    """Return the most frequent non-stop-word term(s) of a cluster as a label.

    Args:
        cluster_articles: Iterable of article dicts. Each needs a ``"title"``;
            ``"excerpt"`` is optional and treated as empty when missing or null.
        top_n: How many terms to keep (``max_features`` of the vectorizer).
            Defaults to 1, the original single-keyword behaviour.

    Returns:
        The selected terms joined with ``", "``, or ``""`` when the cluster
        has no articles.
    """
    texts = [
        art["title"] + " " + (art.get("excerpt") or "")
        for art in cluster_articles
    ]
    # The vectorizer cannot be fitted on an empty corpus; an empty cluster
    # simply gets an empty label.
    if not texts:
        return ""
    vectorizer = TfidfVectorizer(max_features=top_n, stop_words='english')
    # Only the learned vocabulary is needed, not the TF-IDF matrix.
    vectorizer.fit(texts)
    return ", ".join(vectorizer.get_feature_names_out())
# Derive a display label for every cluster from the text of its articles.
cluster_topics = {}
for cluster_name, article_ids in clusters.items():
    # Set lookup keeps the membership test constant-time per article instead
    # of scanning the id list once for every article.
    cluster_id_set = set(article_ids)
    cluster_articles = [art for art in merged_articles if art["_id"] in cluster_id_set]
    cluster_topics[cluster_name] = extract_keywords(cluster_articles).capitalize()

# One button per cluster, captioned "<topic label> (<article count>)".
topic_buttons = []
for cluster_name, article_ids in clusters.items():
    caption = f"{cluster_topics[cluster_name]} ({len(article_ids)})"
    btn = widgets.Button(description=caption, layout=widgets.Layout(width="200px"))
    # Remember which cluster the button stands for; the click handler reads it.
    btn.cluster_key = cluster_name
    topic_buttons.append(btn)

show_all_btn = widgets.Button(description="Show All Topics", layout=widgets.Layout(width="200px"))
topic_box = widgets.HBox(children=[show_all_btn, *topic_buttons])

# Mutable holder for the selected cluster key; None means no topic filter.
selected_topic_container = dict(value=None)

# Five integer positions, matching the -2..2 values in bias_mapping.
bias_slider = widgets.IntSlider(
    value=0, min=-2, max=2, step=1,
    continuous_update=False,
    readout=False,
    layout=widgets.Layout(width='400px'),
)

# End-of-scale captions; each label gets its own Layout instance.
left_label, right_label = (
    widgets.Label(value=caption, layout=widgets.Layout(width='60px'))
    for caption in ("very left", "very right")
)
slider_with_labels = widgets.HBox(children=[left_label, bias_slider, right_label])

def get_bias_label(val):
    """Translate a slider position (-2..2) into its text label.

    Any value outside that range yields "neutral".
    """
    names = ("very left", "left", "neutral", "right", "very right")
    by_position = dict(zip(range(-2, 3), names))
    return by_position.get(val, "neutral")
# Bold text label for the slider's current position.
bias_descriptor = widgets.HTML(
    value="<b>" + get_bias_label(bias_slider.value) + "</b>"
)

# Vertical gap between the slider row and the article list.
spacer = widgets.HTML(value="<div style='margin-top: 20px;'></div>")

# Output area that update_articles renders into.
articles_output = widgets.Output()

def update_articles(change):
    """Re-render the article list for the current slider value and topic filter.

    Args:
        change: Ignored. Present so the function can be registered with
            ``bias_slider.observe`` and also called with ``None`` by the
            button handlers.

    Side effects:
        Updates ``bias_descriptor``, clears ``articles_output`` and displays up
        to ten HTML cards in it, closest bias first.
    """
    articles_output.clear_output()
    current_bias = bias_slider.value
    bias_descriptor.value = f"<b>{get_bias_label(current_bias)}</b>"

    selected_topic = selected_topic_container["value"]
    if selected_topic is not None:
        # Set lookup avoids rescanning the id list for every article.
        topic_article_ids = set(clusters.get(selected_topic, []))
        filtered_articles = [art for art in merged_articles if art["_id"] in topic_article_ids]
    else:
        filtered_articles = merged_articles

    # sorted() is stable, so articles at the same distance keep their
    # original order.
    sorted_articles = sorted(filtered_articles, key=lambda a: abs(a["bias_numeric"] - current_bias))
    top_articles = sorted_articles[:10]

    with articles_output:
        for art in top_articles:
            # NOTE(review): values are interpolated into HTML unescaped.
            display(HTML(
                f"<div style='padding-bottom:20px;'>"
                f"  <div><strong>{art['title']}</strong></div>"
                f"  <div><a href='{art['_id']}' target='_blank'>{art['_id']}</a></div>"
                # The excerpt is optional (the analysis cell also treats it so).
                f"  <div>{art.get('excerpt', '')}</div>"
                f"  <div style='font-size:small; color:gray;'>"
                f"    Assigned Leaning: {art.get('assigned_bias', 'center').capitalize()}, "
                f"Sentiment: {art.get('assigned_sentiment', 'neutral').capitalize()}"
                f"  </div>"
                f"</div>"
            ))

# Re-render the list whenever the slider's `value` trait changes.
bias_slider.observe(update_articles, names='value')


def on_topic_button_click(b):
    """Filter the list to the cluster stored on the clicked button."""
    selected_topic_container["value"] = b.cluster_key
    update_articles(None)


def on_show_all_click(b):
    """Clear the topic filter and show articles from every cluster."""
    selected_topic_container["value"] = None
    update_articles(None)


for btn in topic_buttons:
    btn.on_click(on_topic_button_click)
show_all_btn.on_click(on_show_all_click)

# Initial render so the list is populated before the first interaction.
update_articles({'new': bias_slider.value})

# bias_descriptor is refreshed by update_articles on every change, and the
# slider itself has readout=False, so it has to be part of the layout for
# the current position to be visible at all.
ui = widgets.VBox([
    heading,
    topic_box,
    slider_with_labels,
    bias_descriptor,
    spacer,
    articles_output
])
display(ui)

VBox(children=(VBox(children=(HTML(value='<h1>Recreation of DiversiNews</h1>'), HTML(value='<h3>Paper: <a href…