## Extract articles from InfoDesk. Created in collaboration with Sidsel Boldsen & Louise Wille from Scientific Intelligence at Novo Nordisk

Load the Libraries

In [None]:
import os
import json
import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

In [None]:
import warnings
# Silence every warning for the rest of the session. Presumably this is here to hide
# the InsecureRequestWarning raised by the `verify=False` requests made further down
# (TODO confirm); note that it also hides unrelated warnings such as deprecations.
warnings.filterwarnings("ignore")

Functions

In [None]:
def post_request(start_date, end_date, profile=None, channel=None, limit=1000):
    """Log in to the InfoDesk API and run a single search request.

    Args:
        start_date: Sent as ``searchOptions.startDate``.
        end_date: Sent as ``searchOptions.endDate``.
        profile: When truthy, sent as the top-level ``profile`` field.
        channel: When truthy, sent as ``searchOptions.sources`` in the form
            ``[{"CHANNEL": channel}]``.
        limit: When truthy, sent (as a string) as ``retrievalOptions.limit``.
            A falsy value keeps the built-in default of ``"1000"``.

    Returns:
        The ``requests.Response`` of the search call. Its status is NOT
        checked here; the caller is expected to inspect it.

    Raises:
        requests.HTTPError: If the login request is rejected.
        KeyError: If the login response carries no ``access_token``.
    """
    # Auth details (Novo Nordisk credentials in the original project).
    # These have to be changed to your own: uncomment and fill in the lines
    # below, or define the four names at module level before calling.
    #username = ####
    #password = ####

    #api_key = ####
    #realm = ####

    # --- Get bearer token -------------------------------------------------
    login_url = "https://api.infodesk.com/auth/v1/login?realm=infoauth.infodesk.com"
    login_payload = {"username": str(username), "password": str(password)}
    login_headers = {
        "Content-Type": "application/json",
        "accept": "application/json",
        "X-API-KEY": str(api_key),
        "realm": str(realm),
    }

    # NOTE(review): verify=False disables TLS certificate verification for
    # both requests. Kept as in the original (may be needed behind a
    # corporate proxy), but it should be removed if at all possible.
    login_response = requests.post(
        login_url, json=login_payload, headers=login_headers, verify=False
    )
    # Fail with a clear HTTP error instead of an opaque KeyError below.
    login_response.raise_for_status()
    bearer_token = login_response.json()["access_token"]
    # The token is deliberately not printed: it is a credential and would
    # otherwise end up in saved notebook output.

    # --- Search -----------------------------------------------------------
    url = 'https://api.infodesk.com/data/v1/search'
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {bearer_token}",
        "accept": "*/*",
        "X-API-KEY": str(api_key),
        "realm": str(realm),
    }

    data = {
        "queryString": "",
        "searchOptions": {
            "startDate": start_date,
            "endDate": end_date,
            "includeOldRevisions": False,
        },
        "retrievalOptions": {
            "sortOrder": "NEWEST_FIRST",
            "limit": "1000",
            "start": "0",
            "grouping": "NONE",
            "fields": [
                "HEADLINE", "DESCRIPTION", "LEAD", "BYLINE", "DOCTIME",
                "TIMESTAMP", "CONTENTCATEGORY", "CREDITLINE", "DATELINE",
                "EDNOTE", "GENRE", "LANG", "PROVIDER", "SLUGLINE",
                "ORIGINALURL", "ORIGINALID", "LOCATION", "COUNTRY",
                "ORGANIZATION", "INDUSTRY", "PUBLICATION", "PUBDATE",
                "KEYWORDS", "TOPICS", "DRUG", "DRUGCLASS", "MEDFORM", "MOA",
                "MEDINDICATION", "MEDPHASE", "GENETARGET", "CONFERENCE",
            ],
        },
    }

    if profile:
        data["profile"] = profile
    if channel:
        # No trailing comma here: one would wrap the list in a tuple and the
        # API would receive a nested list.
        data["searchOptions"]["sources"] = [{"CHANNEL": channel}]
    if limit:
        # Same pitfall: the limit must be a plain string, not a 1-tuple.
        data["retrievalOptions"]["limit"] = str(limit)

    print(url)
    print(data)
    # The headers are not printed because they contain the bearer token and
    # the API key.

    return requests.post(url, json=data, headers=headers, verify=False)


def parse(response):
    """Pull the list of search hits out of an InfoDesk search response.

    Args:
        response: Object exposing ``status_code`` and ``text`` (e.g. a
            ``requests.Response``).

    Returns:
        The value found at ``resultSet -> results`` in the JSON body, or an
        empty list when the status code is not 200 or either key is missing.
    """
    # Guard clause: anything other than HTTP 200 is reported and yields nothing.
    if response.status_code != 200:
        print(f"Request not successful {response.status_code}: '{response.text}'")
        return []

    payload = json.loads(response.text)

    # A body without the expected keys is treated as "no hits".
    try:
        hits = payload["resultSet"]["results"]
    except KeyError:
        hits = []

    print(f"Request successful: 'Found {len(hits)} data points'")
    return hits

def _format_timestamp(moment):
    """Render a timestamp in the string layout used for the search window."""
    # Sub-second precision is discarded; the fractional part is fixed at
    # ".701", exactly as the original inline formatting did.
    return (
        f"{moment.year:02d}-{moment.month:02d}-{moment.day:02d}"
        f"T{moment.hour:02d}:{moment.minute:02d}:{moment.second:02d}.701Z"
    )


def get_data(start_date, end_date, **kwargs):
    """Page backwards through the search results between two dates.

    Repeatedly calls ``post_request`` and ``parse``. After every non-empty
    page the window's end is moved to one day before the ``"date"`` of the
    last result on that page, and the page is yielded. Iteration stops at
    the first empty page (which also covers failed requests, since ``parse``
    returns an empty list for them).

    Args:
        start_date: Start of the search window; passed through unchanged.
        end_date: Initial end of the search window.
        **kwargs: Forwarded to ``post_request`` (``profile``, ``channel``,
            ``limit``).

    Yields:
        Tuples ``(page, start_date, end_date)`` where ``page`` is
        ``{"profile": <profile>, "results": <list of hits>}`` and
        ``end_date`` is the already-shifted end used for the NEXT request.
    """
    # Take the profile from the arguments instead of relying on a
    # module-level variable that happens to be called ``profile``.
    profile = kwargs.get("profile")

    while True:
        response = post_request(start_date, end_date, **kwargs)
        results = parse(response)

        if not results:
            return

        # post_request asks for NEWEST_FIRST ordering, so the last hit is
        # presumably the oldest one on the page.
        oldest = pd.to_datetime(results[-1]["date"])

        # NOTE(review): stepping back a full day may skip hits dated within
        # the 24 hours before ``oldest``. Kept as in the original; confirm
        # against the API's date semantics.
        end_date = _format_timestamp(oldest - relativedelta(days=1))

        yield {"profile": profile, "results": results}, start_date, end_date

Extract the new articles to MEMO, using the date range and profiles set in the main interface (read from `data/dates.json` and `data/profiles.json`)

In [None]:
from datetime import datetime, timedelta
import pytz

# Read the profiles to extract from the JSON file.
with open('data/profiles.json', 'r', encoding="utf-8") as d:
    profile_data = json.load(d)

profiles = profile_data["profiles"]

# Read the date range to extract from the JSON file.
with open('data/dates.json', 'r', encoding="utf-8") as f:
    date_data = json.load(f)

start_date, end_date = date_data["start_date"], date_data["end_date"]

# Edit to change the data directory.
data_directory = "/Users/carljohanson/Desktop/Speciale - Code Project/Code/data/InfoDesk"

for profile in profiles:
    args = {
        "channel":None,
        "profile":profile,
        "limit":None,
    }

    # The per-page window gets its own names. Unpacking into start_date and
    # end_date would overwrite the configured range, so every profile after
    # the first would start from the previous profile's final window.
    for result, page_start, page_end in get_data(start_date, end_date, **args):
        # One file per page, named after the profile and the (already
        # shifted) end date that get_data reports for that page.
        format_end = pd.to_datetime(page_end)
        file_path = os.path.join(data_directory, f"{profile}-{format_end.year:02d}-{format_end.month:02d}-{format_end.day:02d}.json")
        with open(file_path, "w") as f:
            json.dump(result, f)