# Data Query (specific groupMetadataID)


#### **ONLY RUN THIS NOTEBOOK IF YOU NEED TO DOWNLOAD MASSIVE AMOUNTS OF DATA**

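For each groupMetadataID listed in `gmIDs.csv`, runs a query that retrieves the best_pose data and another that retrieves the chassis data.
Then runs a query for the metadata of each groupMetadataID. Results are saved as one `.csv` file per groupMetadataID under `./data/`, and IDs whose file already exists are skipped.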


In [1]:
import plotly.express as px
import pandas as pd
import numpy as np
import os

# Import plotly.express to utilize map

### Import your groupMetadataIDs below and choose to query data or not


In [12]:
# Load the groupMetadataID column from the local CSV and report how many IDs it holds
gmIDs = pd.read_csv("gmIDs.csv").loc[:, "groupMetadataID"]
print(gmIDs.size)

# One switch per dataset:
#   True  -> query the dataset
#   False -> the data is already stored as .csv files, so skip the query
queryPose = True
queryChassis = True
queryMeta = True

59


In [13]:
from boto3.dynamodb.conditions import Key, Attr
import boto3

# The DynamoDB handles are only needed when at least one dataset will be queried
if any((queryPose, queryChassis, queryMeta)):
    # Service resource from which the table objects are created
    dynamodb = boto3.resource('dynamodb')
    # Table resource objects: processed data first, then its metadata table
    table, table_meta = (
        dynamodb.Table(table_name)
        for table_name in ('ads_passenger_processed',
                           'ads_passenger_processed_metadata')
    )

### Query and save Best_pose data


In [25]:
if queryPose:
    # exist_ok replaces the separate "does the directory exist" check
    os.makedirs("./data/best_pose", exist_ok=True)

    for gmID in gmIDs:
        pose_csv = f"./data/best_pose/{gmID}.csv"

        # A CSV already on disk means this ID was fetched before: skip it
        if os.path.exists(pose_csv):
            print(f"skipping {gmID}, data already exists")
            continue
        print(f"querying best_pose for {gmID}")

        # query arguments for best_pose data
        keywords = dict(
            IndexName='topic-index',
            KeyConditionExpression=Key('topic').eq(
                '/apollo/sensor/gnss/best_pose'),
            ProjectionExpression="groupMetadataID, #t, solStatus, solType, latitudeStdDev, longitudeStdDev, numSatsTracked, numSatsMulti, numSatsInSolution, differentialAge, solutionAge, latitude, longitude",
            # Per the DynamoDB Query docs, Limit caps the items evaluated per
            # page (before the filter is applied), not the matches returned.
            Limit=1500,
            # "time" is aliased as #t in the projection above
            ExpressionAttributeNames={'#t': 'time'},
            # Attr (not Key) is the boto3 condition type for filter expressions
            FilterExpression=Attr('groupMetadataID').eq(f'{gmID}')
        )

        # Follow pagination until the response has no LastEvaluatedKey.
        # Using .get() means a result that fits in a single page ends the loop
        # cleanly instead of raising KeyError, and unrelated KeyErrors are no
        # longer swallowed by the loop control.
        pose_items = []
        while True:
            res = table.query(**keywords)
            pose_items.extend(res['Items'])
            next_key = res.get("LastEvaluatedKey")
            if next_key is None:
                print("Done querying best_pose data")
                break
            keywords["ExclusiveStartKey"] = next_key

        # Build the frame once from all pages rather than concatenating per page
        df_pose = pd.json_normalize(pose_items)

        df_pose.to_csv(pose_csv, index=False)
        print(f"Saved best_pose data at ./data/best_pose/{gmID}.csv\n")

print("\nfinished")

querying best_pose for 2bc6ebb8-a529-11ee-88ec-eb6a8d5269b4
Done querying best_pose data
Saved best_pose data at ./data/best_pose/2bc6ebb8-a529-11ee-88ec-eb6a8d5269b4.csv

querying best_pose for 8fa6fe80-c869-11ee-a7fc-dd032dba19e8
Done querying best_pose data
Saved best_pose data at ./data/best_pose/8fa6fe80-c869-11ee-a7fc-dd032dba19e8.csv

querying best_pose for 837fc882-cb5a-11ee-909c-e1dc60cf66f9
Done querying best_pose data
Saved best_pose data at ./data/best_pose/837fc882-cb5a-11ee-909c-e1dc60cf66f9.csv

querying best_pose for 3d8020aa-cb7f-11ee-909c-e1dc60cf66f9
Done querying best_pose data
Saved best_pose data at ./data/best_pose/3d8020aa-cb7f-11ee-909c-e1dc60cf66f9.csv

querying best_pose for 559495ca-d270-11ee-b437-336917683bb8
Done querying best_pose data
Saved best_pose data at ./data/best_pose/559495ca-d270-11ee-b437-336917683bb8.csv

querying best_pose for 47561998-d9c3-11ee-a158-97f8443fd730
Done querying best_pose data
Saved best_pose data at ./data/best_pose/47561998-d

### Query and save Chassis data


Warning: This query can take a while, frequently over 5 minutes.


In [15]:
if queryChassis:
    # exist_ok replaces the separate "does the directory exist" check
    os.makedirs("./data/chassis", exist_ok=True)

    for gmID in gmIDs:
        chassis_csv = f"./data/chassis/{gmID}.csv"

        # A CSV already on disk means this ID was fetched before: skip it
        if os.path.exists(chassis_csv):
            print(f"skipping {gmID}, data already exists")
            continue
        print(f"querying chassis for {gmID}")

        # query arguments for chassis data (no ProjectionExpression is passed)
        # NOTE(review): the key condition is on groupMetadataID while the index
        # is 'topic-index'; the best_pose query keys on 'topic' for the same
        # index. Confirm against the index's key schema which one is intended.
        keywords = dict(
            IndexName='topic-index',
            KeyConditionExpression=Key('groupMetadataID').eq(gmID),
            # Per the DynamoDB Query docs, Limit caps the items evaluated per
            # page (before the filter is applied), not the matches returned.
            Limit=2000,
            # Attr (not Key) is the boto3 condition type for filter expressions
            FilterExpression=Attr('topic').eq(
                '/apollo/canbus/chassis')
        )

        # Follow pagination until the response has no LastEvaluatedKey.
        # Using .get() means a result that fits in a single page ends the loop
        # cleanly instead of raising KeyError, and unrelated KeyErrors are no
        # longer swallowed by the loop control.
        chassis_items = []
        while True:
            res = table.query(**keywords)
            chassis_items.extend(res['Items'])
            next_key = res.get("LastEvaluatedKey")
            if next_key is None:
                print(f"done querying for chassis data")
                break
            keywords["ExclusiveStartKey"] = next_key

        # Build the frame once from all pages rather than concatenating per page
        df_chassis = pd.json_normalize(chassis_items)

        df_chassis.to_csv(chassis_csv, index=False)
        print(f"Saved chassis data at ./data/chassis/{gmID}.csv")
print("\nfinished")

skipping 2bc6ebb8-a529-11ee-88ec-eb6a8d5269b4, data already exists
skipping 8fa6fe80-c869-11ee-a7fc-dd032dba19e8, data already exists
skipping 837fc882-cb5a-11ee-909c-e1dc60cf66f9, data already exists
skipping 3d8020aa-cb7f-11ee-909c-e1dc60cf66f9, data already exists
skipping 559495ca-d270-11ee-b437-336917683bb8, data already exists
skipping 47561998-d9c3-11ee-a158-97f8443fd730, data already exists
skipping 25641404-cb66-11ee-909c-e1dc60cf66f9, data already exists
skipping 286c70cc-d2f7-11ee-b437-336917683bb8, data already exists
skipping c9c6856c-d33c-11ee-b437-336917683bb8, data already exists
skipping 7f09f6c6-a5b0-11ee-88ec-eb6a8d5269b4, data already exists
skipping d12cd1c4-caec-11ee-909c-e1dc60cf66f9, data already exists
skipping 3c415ade-d353-11ee-b437-336917683bb8, data already exists
skipping 3a2a78cc-db21-11ee-a158-97f8443fd730, data already exists
querying chassis for e6d7d384-db40-11ee-a158-97f8443fd730


KeyboardInterrupt: 

### Query and save Metadata


In [23]:
if queryMeta:

    # exist_ok replaces the separate "does the directory exist" check
    os.makedirs("./data/metadata", exist_ok=True)

    for gmID in gmIDs:
        meta_csv = f"./data/metadata/{gmID}.csv"

        # A CSV already on disk means this ID was fetched before: skip it
        if os.path.exists(meta_csv):
            print(f"skipping {gmID}, data already exists")
            continue
        print(f"querying metadata for {gmID}")

        # query arguments for metadata; no ProjectionExpression is passed, so
        # the result is not narrowed to specific attributes
        meta_keywords = dict(
            IndexName='groupMetadataID-index',
            KeyConditionExpression=Key("groupMetadataID").eq(gmID),
            Limit=2000,
        )

        # Follow pagination so results are not silently cut off at one page:
        # keep querying while the response carries a LastEvaluatedKey.
        meta_items = []
        while True:
            res = table_meta.query(**meta_keywords)
            meta_items.extend(res['Items'])
            next_key = res.get("LastEvaluatedKey")
            if next_key is None:
                break
            meta_keywords["ExclusiveStartKey"] = next_key

        # Flatten nested attributes into columns
        df_meta = pd.json_normalize(meta_items)

        print(df_meta.shape)

        df_meta.to_csv(meta_csv, index=False)
        print(f"Saved metadata data at ./data/metadata/{gmID}.csv")

querying metadata for 2bc6ebb8-a529-11ee-88ec-eb6a8d5269b4
(6, 17)
Saved metadata data at ./data/metadata/2bc6ebb8-a529-11ee-88ec-eb6a8d5269b4.csv
querying metadata for 8fa6fe80-c869-11ee-a7fc-dd032dba19e8
(10, 17)
Saved metadata data at ./data/metadata/8fa6fe80-c869-11ee-a7fc-dd032dba19e8.csv
querying metadata for 837fc882-cb5a-11ee-909c-e1dc60cf66f9
(16, 17)
Saved metadata data at ./data/metadata/837fc882-cb5a-11ee-909c-e1dc60cf66f9.csv
querying metadata for 3d8020aa-cb7f-11ee-909c-e1dc60cf66f9
(19, 17)
Saved metadata data at ./data/metadata/3d8020aa-cb7f-11ee-909c-e1dc60cf66f9.csv
querying metadata for 559495ca-d270-11ee-b437-336917683bb8
(11, 17)
Saved metadata data at ./data/metadata/559495ca-d270-11ee-b437-336917683bb8.csv
querying metadata for 47561998-d9c3-11ee-a158-97f8443fd730
(37, 22)
Saved metadata data at ./data/metadata/47561998-d9c3-11ee-a158-97f8443fd730.csv
querying metadata for 25641404-cb66-11ee-909c-e1dc60cf66f9
(17, 17)
Saved metadata data at ./data/metadata/256414