In [None]:
import sys
import requests
import json
import csv
import datetime
import os
from google.cloud import bigquery
from google.cloud import storage
from google.cloud.bigquery import SchemaField
from google.cloud import secretmanager
import re
import numpy as np
import pandas as pd
import pandas_gbq
import io
import time
pd.set_option('display.max_columns', None)

In [None]:
# Google Cloud project identifier; passed to get_google_secret() to build the
# secret's resource name.
# NOTE(review): the old inline comment read "polsnadna-nonprd" — presumably a
# previous/alternate project value; confirm which one is intended.
PROJECT_ID = '170117011701'
# Name of the Secret Manager secret holding the survey API credentials.
secret_id = 'secret_survey_export_credentials'
# Secret version to read. Not passed explicitly in the visible cells; the
# get_google_secret() default is also "latest".
version_id = 'latest'
# NOTE(review): not referenced anywhere in the visible cells — presumably a
# Cloud Storage bucket name for exports; confirm or remove.
bucket = "survey_exports"

In [11]:
def get_google_secret(secret_id, PROJECT_ID, version_id="latest"):
    """Read one version of a Secret Manager secret and return it as text.

    :param secret_id: Name of the secret inside the project.
    :param PROJECT_ID: Project identifier used in the secret's resource path.
    :param version_id: Version of the secret to read (defaults to "latest").

    :return: The secret payload decoded as UTF-8.
    """
    # Fully-qualified resource path of the requested secret version.
    resource_path = f"projects/{PROJECT_ID}/secrets/{secret_id}/versions/{version_id}"

    sm_client = secretmanager.SecretManagerServiceClient()
    secret_version = sm_client.access_secret_version(name=resource_path)

    # The payload arrives as bytes; callers expect a str.
    return secret_version.payload.data.decode('UTF-8')

def authenticate(url, json):
    """POST the credential payload to the auth endpoint and return the response.

    :param url: URL of the authentication endpoint.
    :param json: Credentials, sent as the JSON body of the request.

    :return: :class:`Response <Response>` object from ``requests.post``.
    """
    print('1.a Authenticating')
    auth_response = requests.post(url=url, json=json)
    print('1.b Status Code: {}'.format(auth_response.status_code))
    return auth_response

def post_reporting_request(url, params, token_dict):
    """
    Ask the API to start generating a report.

    :param url: URL of the report endpoint.
    :param params: Report parameters, sent as the JSON body of the POST.
        EXAMPLE:
            params={
              "start_date": "2022-06-01T00:00:00.000+00:00",
                "end_date": "2022-06-02T00:00:00.000+00:00",
                  "format": "JSON"
                    }
    :param token_dict: Dictionary holding the authentication result; its
        'auth_token' entry is used as the bearer token.

    :return: :class:`Response <Response>` object
    """
    print(f'2.a Posting request for {url}')

    bearer = str(token_dict['auth_token'])
    request_headers = {
        'accept': 'application/json',
        'Authorization': f"Bearer {bearer}",
    }

    api_response = requests.post(url=url, headers=request_headers, json=params)
    print(f'2.b Status Code: {api_response.status_code}')

    return api_response

def parse_request_id(response):
    """Extract the report request id from a report-submission response.

    Two outcomes are understood:

    * 200 - the JSON body carries the id under ``request_id``.
    * 400 - the JSON body carries an ``error_description`` whose last
      whitespace-separated token is the id of an already-existing job
      (e.g. "similar job already exists with request id <id>").

    :param response: Response object exposing ``status_code`` and ``json()``.

    :return: The request id as found in the response.
    :raises ValueError: If the status code is neither 200 nor 400, since no
        request id can be derived from such a response.
    """
    if response.status_code == 400:
        desc = response.json()['error_description']

        print(desc)

        # Drop everything up to the last space, leaving the trailing id token.
        return re.sub(r'^(.* )([A-Za-z0-9\-]+)', r'\2', desc)

    if response.status_code == 200:
        # Parse the body as JSON rather than eval(): eval executes arbitrary
        # remote text and breaks on JSON literals such as true/false/null.
        return response.json()['request_id']

    raise ValueError(
        f'Cannot determine request id: unexpected status code {response.status_code}'
    )

def submit_report_request(endpoint,params,token_dict):
    """Submit a report request and return the id of the resulting job.

    :param endpoint: URL of the report endpoint to post to.
    :param params: Report parameters forwarded to post_reporting_request.
    :param token_dict: Authentication dictionary forwarded to
        post_reporting_request.

    :return: Request id extracted by parse_request_id.
    """
    return parse_request_id(
        response=post_reporting_request(url=endpoint, params=params, token_dict=token_dict)
    )


def get_report_job_uri(endpoint, request_id, token_dict, poll_interval=5):
    """Poll a report job until it is no longer in progress.

    :param endpoint: Base URL of the report endpoint; the request id is
        appended directly to it.
    :param request_id: Id of the report job to poll.
    :param token_dict: Authentication dictionary; its 'auth_token' entry is
        used as the bearer token.
    :param poll_interval: Seconds to wait between polls (default 5).

    :return: The first :class:`Response <Response>` whose JSON 'status' is
        not "INPROGRESS".
    """
    token = str(token_dict['auth_token'])

    headers_ = {
        'accept': 'application/json',
        'Authorization': f"Bearer {token}",
    }

    url = endpoint + request_id

    response = requests.get(url, headers=headers_)

    # Decode the body with response.json() rather than eval(): eval executes
    # whatever text the server returns and fails on true/false/null.
    while response.json()['status'] == "INPROGRESS":
        print(f'Report generation in progress.  Trying again in {poll_interval} seconds.')
        time.sleep(poll_interval)

        response = requests.get(url, headers=headers_)

    return response

def get_report_from_uri(response):
    """Download the finished report referenced by a job-status response.

    :param response: Job-status response whose JSON body contains the
        report location under 'url'.

    :return: :class:`Response <Response>` object of the GET on that URL.
    """
    # Parse as JSON instead of eval(): eval would execute remote text and
    # cannot handle JSON literals such as true/false/null.
    report_url = response.json()['url']

    return requests.get(url=report_url)

def _flatten_dict_columns(df, bad_cols):
    """Replace each dict-valued column of ``df`` with its normalized sub-columns.

    Columns that cannot be normalized or joined back are appended to
    ``bad_cols`` (mutated in place) and left untouched in the frame.

    :return: The updated DataFrame.
    """
    dict_cols = [x for x in df.columns if df[x].apply(lambda z: isinstance(z, dict)).any()]

    for dict_col in dict_cols:

        print(f'Parsing dicts in column {dict_col}')

        try:

            print(f'JSON Normalizing {dict_col}')

            tdf = pd.json_normalize(df[dict_col])

        except AttributeError as e:

            # Intended for rows without a value mixed with dict rows: the
            # missing rows are substituted with a dict of NaNs keyed by every
            # key seen in the populated rows.
            print(f'Got {e}.  Trying another way.')

            colnames = {key for entry in df[dict_col].dropna() for key in entry.keys()}

            tdf = pd.json_normalize(
                df[dict_col].where(df[dict_col].notna(), lambda d: {x: np.nan for x in colnames})
            )

        except Exception:

            # Anything else is treated as unparseable; Exception (not a bare
            # except) so Ctrl-C / SystemExit still stop the run.
            print(f'Issue parsing {dict_col}.  Skipping')

            bad_cols.append(dict_col)

            continue

        try:

            # New columns are prefixed with the source column name and placed
            # in front of the remaining columns.
            print(f'Joining {dict_col} to DF')

            df = pd.concat(
                [tdf.rename(columns={x: f"{dict_col}_{x}" for x in tdf.columns}),
                 df.drop(columns=[dict_col])],
                axis=1,
            ).reset_index(drop=True)

        except Exception:

            print(f'Issue parsing {dict_col}.  Skipping.')

            bad_cols.append(dict_col)

            continue

    return df


def fully_flatten_json(df):
    """Flatten list- and dict-valued cells of a DataFrame into plain columns.

    Repeatedly explodes list-valued columns and expands dict-valued columns
    with ``pd.json_normalize`` until no column (other than ones that failed
    to parse) holds a list or dict. Progress is reported with ``print``.
    Finally every '.' in a column name is replaced by '_'.

    :param df: DataFrame to flatten, e.g. the output of ``pd.json_normalize``.

    :return: The flattened DataFrame. Columns that could not be parsed are
        left as they were.
    """
    # Columns that failed to parse; excluded from the loop condition so they
    # cannot keep the loop alive.
    bad_cols = []

    def _has_nested(frame):
        # True while any non-bad column still holds a list or a dict.
        candidates = frame[frame.columns[~frame.columns.isin(bad_cols)]]
        return candidates.apply(
            lambda col: col.apply(lambda v: isinstance(v, (list, dict)))
        ).any().any()

    while _has_nested(df):

        # Lists are handled first: each list column is exploded.
        list_cols = [x for x in df.columns if df[x].apply(lambda z: isinstance(z, list)).any()]
        exploded_any = False

        for list_col in list_cols:

            print(f'Parsing lists in column {list_col}')

            try:

                # Exploded column is joined back in front of the other columns.
                df = pd.concat(
                    [df[list_col].explode(), df.drop(columns=[list_col])], axis=1
                ).reset_index(drop=True)

            except Exception:

                print(f'Issue parsing {list_col}.  Skipping.')
                bad_cols.append(list_col)

                continue

            exploded_any = True

            # Exploding may have surfaced dicts; expand them right away.
            if df.apply(lambda y: y.apply(lambda x: isinstance(x, dict))).any().any():
                df = _flatten_dict_columns(df, bad_cols)

        if not exploded_any:
            # Nothing was exploded this pass, so the dict step above was never
            # reached. Run it here; otherwise dict columns that are not inside
            # lists would keep the while-condition true forever.
            df = _flatten_dict_columns(df, bad_cols)

    # Get rid of '.' in column names
    df.columns = [re.sub(r'[.]', '_', x) for x in df.columns]

    print('Writing to dataframe.')

    return df

In [23]:
# Driver cell: authenticate, request each report, wait for it, flatten it and
# load it into BigQuery.
import ast

# The secret payload is parsed with ast.literal_eval instead of eval(): it
# accepts the same dict literal but cannot execute code stored in the secret.
auth_headers = ast.literal_eval(get_google_secret(secret_id, PROJECT_ID))
auth_endpoint = 'https://ghidorah.survey.ai/v1/oauth/token'

url_base = "https://ghidorah.survey.ai/v1/data/reports/"

#report_list = ['interactions/','evaluations/','coachings/']
#report_list = ['interactions/']
report_list = ['evaluations/']

# Reporting window and output format sent with every report request.
params={
  "start_date": "2022-06-01T00:00:00.000+00:00",
  "end_date": "2022-06-02T00:00:00.000+00:00",
  "format": "JSON"
}

auth_token = authenticate(url=auth_endpoint, json=auth_headers)

# The auth response body is JSON; decode it as such rather than eval()-ing
# text received over the network.
token_dict = auth_token.json()

for r in report_list:

    print(f'Getting Report for {r}')

    endpoint = url_base+r

    print(f'Pinging Endpoint: {endpoint}')

    print(f'Getting Request ID')

    request_id = submit_report_request(endpoint=endpoint, params=params, token_dict=token_dict)

    print(f'Getting Report URI')

    # Blocks until the report job is no longer "INPROGRESS".
    response=get_report_job_uri(endpoint=endpoint, request_id=request_id,  token_dict=token_dict)

    print(f'Getting Report from URI')

    report = get_report_from_uri(response=response)

    print('Parsing.')

    df = fully_flatten_json(pd.json_normalize(report.json()))

    print('Sending to GBQ')

    # Table name is derived from the report name with its slashes removed;
    # an existing table is replaced.
    df.to_gbq(destination_table=f'skunkworks.survey_data_{re.sub(r"/+","",r)}_report_test', project_id='project-1', if_exists='replace')





1.a Authenticating
1.b Status Code: 200
Getting Report for evaluations/
Pinging Endpoint: https://kong.observe.ai/v1/data/reports/evaluations/
Getting Request ID
2.a Posting request for https://kong.observe.ai/v1/data/reports/evaluations/
2.b Status Code: 400
similar job already exists with request id 9e9d06d0-b308-43ce-a5fa-ed384adfddc4
Getting Report URI
Getting Report from URI
Parsing.
Parsing lists in column value.evaluation_forms
Parsing dicts in column value.evaluation_forms
JSON Normalizing value.evaluation_forms
Joining value.evaluation_forms to DF
Parsing lists in column value.evaluation_forms_evaluations
Parsing dicts in column value.evaluation_forms_evaluations
JSON Normalizing value.evaluation_forms_evaluations
Joining value.evaluation_forms_evaluations to DF
Parsing lists in column value.evaluation_forms_template.sections
Parsing dicts in column value.evaluation_forms_template.sections
JSON Normalizing value.evaluation_forms_template.sections
Joining value.evaluation_forms