In [None]:
# In this notebook, we show how to query Wikidata with SPARQL to get back a list of entities,
# which we then use to create a News Signals dataset.

# (1) get a list of US politicians from Wikidata
# (2) query for current news volumes and rank by anomalies
# (3) understand the main stories for current trending politicians

In [None]:
import requests
import pandas as pd
import requests
import datetime
import tqdm 

from news_signals import signals
from collections import OrderedDict

In [None]:
# SPARQL query for living US politicians.
# Selects humans whose occupation is politician, who are US citizens, and who have a
# "position held" statement for one of the national-level offices listed under VALUES.
# Anyone with a recorded date of death is excluded. The label service supplies the
# English label, aliases and description; results are ordered by label.
sparql_query = """
SELECT DISTINCT ?politician ?label ?aliases ?desc
WHERE {
  ?politician wdt:P31 wd:Q5; # Human
              wdt:P106 wd:Q82955; # Occupation: Politician
              wdt:P27 wd:Q30; # Country of citizenship: United States
              wdt:P39 ?position. # Position held

  # National-level positions
  VALUES ?position {
    wd:Q11696 # President of the United States
    wd:Q11699 # Vice President of the United States
    wd:Q13218630 # Member of the United States House of Representatives
    wd:Q4416090 # United States Senator
  }

  FILTER NOT EXISTS { ?politician wdt:P570 ?deathDate. } # Filter out politicians with a death date

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
      ?politician rdfs:label ?label.
      ?politician skos:altLabel ?aliases.
      ?politician schema:description ?desc.
  }
}
ORDER BY ?label

"""

In [None]:
def _entity_id_from_uri(uri):
    """Remove the Wikidata entity URL prefix from a URI, leaving the rest of the string."""
    return uri.replace("http://www.wikidata.org/entity/", "")


def _split_alias_string(aliases):
    """Split a ', '-separated alias string into a list of whitespace-stripped aliases."""
    return [alias.strip() for alias in aliases.split(', ')]


# Converters applied to raw SPARQL binding values, keyed by query variable name.
# Variables without an entry are kept as returned by the endpoint.
RESULT_POSTPROCESSORS = dict(
    politician=_entity_id_from_uri,
    aliases=_split_alias_string,
)

# Columns used to drop duplicate result rows.
UNIQUE_FIELDS = ["politician"]


def sparql2df(sparql_query, endpoint="https://query.wikidata.org/sparql?format=json"):
    """Run a SPARQL query against an endpoint and return the results as a DataFrame.

    Every result binding becomes one row. Each value is passed through the
    converter registered for its variable in RESULT_POSTPROCESSORS (if any), and
    rows repeating the same UNIQUE_FIELDS values are dropped, keeping the first.

    Args:
        sparql_query: SPARQL query text, sent as the ``query`` URL parameter.
        endpoint: URL of the SPARQL endpoint; the response is parsed as JSON with
            the bindings under ``results.bindings``.

    Returns:
        pandas.DataFrame with one column per variable bound in the results, plus
        an ``index`` column (added by ``reset_index``) holding each kept row's
        position in the raw result list. An empty result yields an empty frame.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status code.
    """
    response = requests.get(endpoint, params={"query": sparql_query})
    # Surface throttling / server errors directly instead of failing later while
    # trying to parse an error page as JSON.
    response.raise_for_status()
    bindings = response.json()['results']['bindings']

    rows = []
    for binding in bindings:
        row = {}
        for field, cell in binding.items():
            postprocess = RESULT_POSTPROCESSORS.get(field, lambda x: x)
            row[field] = postprocess(cell['value'])
        # Append once per binding; appending inside the field loop would add the
        # same row once for every field it contains.
        rows.append(row)

    df = pd.DataFrame.from_records(rows)
    # An empty result has no columns, so de-duplicating on UNIQUE_FIELDS would
    # raise KeyError; there is nothing to de-duplicate in that case anyway.
    if rows:
        df = df.drop_duplicates(subset=UNIQUE_FIELDS)
    return df.reset_index()

In [None]:
# Fetch the politician entities from the Wikidata Query Service into a DataFrame
# (this performs a live HTTP request), then show the frame as the cell output.
entity_df = sparql2df(sparql_query)
entity_df

In [None]:
# Print the alias entries of every row labelled "Barack Obama"
is_obama = entity_df['label'] == 'Barack Obama'
print(list(entity_df[is_obama]['aliases']))

In [None]:
def entity2signal(entity):
    """Build an AylienSignal for one entity record.

    Args:
        entity: mapping with a 'label' string and an 'aliases' entry. The aliases
            are only used when that entry is a list; any other value (for
            example a missing-value marker) is ignored.

    Returns:
        signals.AylienSignal named after the entity's label, whose
        'entity_surface_forms_text' parameter holds the label followed by the
        aliases, without duplicates.
    """
    candidates = [entity['label']]
    aliases = entity['aliases']
    if isinstance(aliases, list):
        candidates.extend(aliases)
    # De-duplicate while keeping label-first insertion order. Going through a set
    # made the order of the surface forms vary from one interpreter run to the next.
    surface_forms = list(dict.fromkeys(candidates))

    return signals.AylienSignal(
        name=entity['label'],
        params={
            'entity_surface_forms_text': surface_forms,
            'min_prominence_score': 0.7
        }
    )


In [None]:
# Map every entity label to its signal, in DataFrame row order
# (a label that occurs more than once keeps the signal of its last row).
label2signal = OrderedDict(
    (entity['label'], entity2signal(entity))
    for entity in entity_df.to_dict(orient='records')
)

In [None]:
# From this cell onward you need to have Aylien NewsAPI credentials set up

# Date range passed to every signal call
start, end = '2023-02-01', '2023-04-03'

# Start date passed to anomaly_signal (its end date is `end`)
anomaly_detection_start = '2023-03-24'

# Which politicians are currently anomalous with respect to news volume?
# Replace each of the first `cutoff` signals by its anomaly signal.
# Querying for ~1600 signals will take around 15 minutes, hence the cutoff.
cutoff = 10
selected = list(label2signal.items())[:cutoff]
for label, signal in tqdm.tqdm(selected):
    queried = signal(start, end)
    label2signal[label] = queried.anomaly_signal(anomaly_detection_start, end)

In [None]:
# Labels of the first `cutoff` signals (iterating a dict yields its keys)
list(label2signal)[:cutoff]

In [None]:
def investigate_signal(signal):
    """Print the story titles behind a signal's most anomalous day.

    Picks the index label with the highest value in the signal's 'anomalies'
    column, samples stories for the one-day window starting there, and prints
    each distinct story title once. When the signal's name occurs verbatim in a
    story body, a snippet with up to 50 characters of context on either side of
    the first occurrence is printed as well.

    Args:
        signal: signal object exposing ``ts_column``, ``name``, ``feeds_df``,
            ``sample_stories_in_window`` and column access via ``signal[...]``.

    Returns:
        None. All results are printed.
    """
    if signal[signal.ts_column].sum() == 0:
        print('Signal timeseries is always zero, nothing to investigate')
        return
    most_anomalous_day = signal['anomalies'].idxmax()
    # Return value is unused; the stories are read from signal.feeds_df below.
    # NOTE(review): presumably this call fills feeds_df['stories'] — confirm.
    signal.sample_stories_in_window(
        start=most_anomalous_day,
        end=most_anomalous_day + datetime.timedelta(days=1)
    )
    seen_titles = set()
    for stories in signal.feeds_df['stories']:
        # Entries that are not lists (e.g. missing values) carry no stories.
        if not isinstance(stories, list):
            continue
        for story in stories:
            title = story['title']
            if title in seen_titles:
                continue
            # add(), not update(): update() would insert the title's individual
            # characters, so repeated titles would never be recognised.
            seen_titles.add(title)
            print(title)
            # try to find a place where the entity was mentioned
            body = story['body']
            sf_idx = body.find(signal.name)
            if sf_idx > -1:
                sf_end_idx = sf_idx + len(signal.name)
                # Clamp at 0: a negative slice start would count from the end of
                # the body and lose the leading context for early mentions.
                context_start = max(0, sf_idx - 50)
                print(f'{body[context_start:sf_idx]} __{body[sf_idx:sf_end_idx]}__ {body[sf_end_idx:sf_end_idx+50]}')


In [None]:
# For each of the first `cutoff` politicians: print a header, the matching
# entity rows, and the stories found by investigate_signal.
for politician_name in list(label2signal)[:cutoff]:
    print()
    print(f'Investigating {politician_name}')
    label_matches = entity_df['label'] == politician_name
    print(entity_df[label_matches])
    politician_signal = label2signal[politician_name]
    investigate_signal(politician_signal)

In [None]:
from news_signals.signals_dataset import SignalsDataset


# Wrap all values of label2signal in a SignalsDataset.
# NOTE(review): only the first `cutoff` entries of label2signal were replaced by
# queried anomaly signals earlier in the notebook; the remaining values are still
# the un-queried signals built by entity2signal — confirm that including all of
# them in the dataset is intended.
politicians_dataset = SignalsDataset(label2signal.values())

In [None]:
# Save the dataset under the given relative path.
dataset_path = 'wikidata_US_politicians'
politicians_dataset.save(dataset_path)