# General InfoDesk API interface. Created by Sidsel Boldsen & Louise Wille from Novo Nordisk

In [None]:
import os
import json
import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

In [None]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): presumably meant to hide the TLS warnings caused by the
# verify=False requests in post_request; it also hides all other warnings.
warnings.simplefilter("ignore")

# Functions

In [None]:
def post_request(start_date, end_date, profile=None, channel=None, limit=1000):
    """Log in to the InfoDesk API and send one search request.

    Credentials are read from the environment variables
    ``INFODESK_USERNAME``, ``INFODESK_PASSWORD``, ``INFODESK_API_KEY`` and
    ``INFODESK_REALM`` so that no secrets have to be stored in this file.

    Args:
        start_date: Start of the search window, sent as
            ``searchOptions.startDate`` (e.g. "2018-01-01T00:01:00.701Z").
        end_date: End of the search window, sent as ``searchOptions.endDate``.
        profile: Optional profile id; sent as the top-level ``profile`` key
            when truthy.
        channel: Optional channel; sent as ``searchOptions.sources`` when
            truthy.
        limit: Maximum number of results to request. A falsy value keeps the
            default of 1000.

    Returns:
        The ``requests.Response`` of the search call (not yet validated).

    Raises:
        KeyError: If a credential variable is missing from the environment or
            the login response carries no ``access_token``.
        requests.HTTPError: If the login request itself is rejected.
    """
    # Auth details. Set these environment variables to your own credentials.
    username = os.environ["INFODESK_USERNAME"]
    password = os.environ["INFODESK_PASSWORD"]
    api_key = os.environ["INFODESK_API_KEY"]
    realm = os.environ["INFODESK_REALM"]

    # Get Bearer token
    login_url = "https://api.infodesk.com/auth/v1/login?realm=infoauth.infodesk.com"
    login_payload = {"username": username, "password": password}
    login_headers = {
        'Content-Type': "application/json",
        "accept": "application/json",
        "X-API-KEY": api_key,
        'realm': realm,
    }

    # NOTE(review): verify=False disables TLS certificate checking. It is kept
    # because the original relied on it; confirm whether it is still needed.
    login_response = requests.post(
        login_url, json=login_payload, headers=login_headers, verify=False
    )
    # Fail with the HTTP error instead of an opaque KeyError on a failed login.
    login_response.raise_for_status()
    bearer_token = json.loads(login_response.text)["access_token"]

    # Search
    url = 'https://api.infodesk.com/data/v1/search'
    headers = {
        'Content-Type': "application/json",
        'Authorization': f"Bearer {bearer_token}",
        "accept": "*/*",
        "X-API-KEY": api_key,
        'realm': realm,
    }

    data = {
        "queryString": "",
        "searchOptions": {
            "startDate": start_date,
            "endDate": end_date,
            "includeOldRevisions": False,
        },
        "retrievalOptions": {
            "sortOrder": "NEWEST_FIRST",
            # The API payload carries the limit as a string.
            "limit": str(limit) if limit else "1000",
            "start": "0",
            "grouping": "NONE",
            "fields": [
                "HEADLINE", "DESCRIPTION", "LEAD", "BYLINE", "DOCTIME",
                "TIMESTAMP", "CONTENTCATEGORY", "CREDITLINE", "DATELINE",
                "EDNOTE", "GENRE", "LANG", "PROVIDER", "SLUGLINE",
                "ORIGINALURL", "ORIGINALID", "LOCATION", "COUNTRY",
                "ORGANIZATION", "INDUSTRY", "PUBLICATION", "PUBDATE",
                "KEYWORDS", "TOPICS", "DRUG", "DRUGCLASS", "MEDFORM", "MOA",
                "MEDINDICATION", "MEDPHASE", "GENETARGET", "CONFERENCE",
            ],
        },
    }

    if profile:
        data["profile"] = profile
    if channel:
        # A plain list; a trailing comma here would wrap it in a tuple and
        # change the JSON that is sent.
        data["searchOptions"]["sources"] = [{"CHANNEL": channel}]

    # Log the request, but never the headers or the token: they hold secrets.
    print(url)
    print(data)

    return requests.post(url, json=data, headers=headers, verify=False)


def parse(response):
    """Extract the list of search hits from an API response.

    Returns an empty list (after printing the status code and body) when the
    response status is not 200, or when the JSON payload has no
    ``resultSet``/``results`` entry. Otherwise returns the ``results`` list
    and prints how many entries it holds.
    """
    status = response.status_code
    if status != 200:
        print(f"Request not successful {status}: '{response.text}'")
        return []

    payload = json.loads(response.text)
    try:
        hits = payload["resultSet"]["results"]
    except KeyError:
        # A successful response without a result set counts as "no hits".
        hits = []

    print(f"Request successful: 'Found {len(hits)} data points'")
    return hits

def get_data(start_date, end_date, **kwargs):
    """Fetch search results page by page, walking backwards in time.

    Each iteration sends one request via ``post_request`` and parses it with
    ``parse``. After every non-empty page the end of the window is moved to
    one month before the date of the last result on that page, and the next
    request is sent with that new end date. Iteration stops at the first
    empty page.

    Args:
        start_date: Start of the search window (left unchanged between pages).
        end_date: Initial end of the search window.
        **kwargs: Forwarded unchanged to ``post_request`` (``profile``,
            ``channel``, ``limit``).

    Yields:
        Tuples ``(page, start_date, end_date)`` where ``page`` is
        ``{"profile": ..., "results": [...]}`` and ``end_date`` is the end of
        the window that will be used for the *next* request.
    """
    # Take the profile from the arguments instead of relying on a global
    # variable of the same name existing in the calling notebook.
    profile = kwargs.get("profile")

    while True:
        response = post_request(start_date, end_date, **kwargs)
        results = parse(response)

        if not results:
            return

        # post_request asks for NEWEST_FIRST ordering, so the last entry is
        # presumably the oldest hit on this page.
        last_date = pd.to_datetime(results[-1]["date"])

        # Subtract one month from the last date to get the next end date.
        # NOTE(review): this skips the month before the last hit of each
        # page; confirm that this gap is intended.
        prev_month = last_date - relativedelta(months=1)
        end_date = f"{prev_month.year:02d}-{prev_month.month:02d}-{prev_month.day:02d}T{prev_month.hour:02d}:{prev_month.minute:02d}:{prev_month.second:02d}.701Z"

        page = {
            "profile": profile,
            "results": results,
        }

        yield page, start_date, end_date



## Extract

#### From API

In [None]:
# Profile ids to download (infomedia channels); change manually.
# The trailing comma matters: ('6qpp') without it is just the string '6qpp',
# and looping over it would yield single characters instead of profile ids.
profiles = (
    # Other known profiles, enable as needed:
    # 'clxj', 'nk93', 'tuc3', 'qkfq', 'duvj', 'uctf', 'u0vc', 'ggnd', 'xb2m',
    # 'dofq', '4hwz', '5ls8', 'klhg', 'jmnc', 'kvkg', 'hj7v', 'xfdy', '9cxo',
    # 'jeh2', 'zojw',
    '6qpp',
)
start_date, end_date = "2018-01-01T00:01:00.701Z", "2023-05-01T00:01:00.701Z"
# data_directory = insert directory here (must be set before running the cells below)

In [None]:
# Download every profile page by page; each page is stored as its own JSON
# file named after the profile and the end date reported for that page.
for profile in profiles:

    args = {
        "channel": None,
        "profile": profile,
        "limit": None,
    }

    # The window yielded by get_data gets its own names. Rebinding
    # start_date/end_date here would shrink the search window for every
    # profile after the first one.
    for result, _page_start, page_end in get_data(start_date, end_date, **args):

        format_end = pd.to_datetime(page_end)

        file_path = os.path.join(data_directory, f"{profile}-{format_end.year:02d}-{format_end.month:02d}-{format_end.day:02d}.json")

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(result, f)

#### To CSV

In [None]:
# File name (inside data_directory) of the combined output.
name = "final_MEMO_data"

# Bytes form of the data directory, used when listing the downloaded files.
directory = os.fsencode(data_directory)

# Maps the id of each field to export to the key under which that field
# stores its list of values.
include_fields = dict(description="strings")

Concatenating all downloaded JSON files into a single list of rows

In [None]:
# Rows collected from all downloaded JSON files; every row is a list of
# strings in the column order given by `header`.
out = []

# Column names in the order the rows are built below. list.extend() returns
# None, so the header is assembled with unpacking instead.
header = ["date", "profile", *include_fields]

# Sorted so that the row order does not depend on the file system.
for file in sorted(os.listdir(directory)):

    filename = os.fsdecode(file)

    if not filename.endswith(".json"):
        continue

    path = os.path.join(data_directory, filename)

    print(f"Reading file: {path}")

    with open(path, "r", encoding="utf-8") as f:
        page = json.load(f)

    profile = page["profile"]

    for match in page["results"]:

        row = [str(pd.to_datetime(match["date"])), profile]

        # Hits without any fields carry nothing to export.
        fields = match.get("fields")
        if not fields:
            continue

        inverted_fields = {}

        for field in fields:

            field_id = field["id"]

            if field_id not in include_fields:
                continue

            # Take the first value only; skip fields whose value list is
            # missing or empty instead of crashing on [0].
            values = field.get(include_fields[field_id])
            if values:
                inverted_fields[field_id] = values[0]

        row.extend(inverted_fields.get(field_id, "") for field_id in include_fields)

        out.append(row)

# Write all collected rows as tab-separated lines, one row per line.
file_path = os.path.join(data_directory, name)

# Write as UTF-8 explicitly. With the platform default encoding, any row
# containing a character outside that code page raised UnicodeEncodeError
# and was dropped from the output.
# NOTE(review): values are written verbatim; a tab or newline inside a value
# would break the column layout -- confirm whether values can contain them.
with open(file_path, "w", encoding="utf-8") as f:
    for m in out:
        try:
            f.write("\t".join(m) + "\n")

        except UnicodeEncodeError:
            # Under UTF-8 this can only happen for unpaired surrogates.
            print("Unicode error:", m)
            continue