# This CSV version is not supported by zephyr API yet

In [1]:
import pandas as pd
import requests
from datetime import datetime
import numpy as np
import json
import wget # pip install wget

In [2]:
url = "https://data.earthsense.co.uk/dataForViewBySlots/AstonUniversity/Xo08R83d43e0Kk6/814/20210917000/202109200000/B/def/json/api"

In [3]:
import io,csv
from typing import List,Iterable

In [4]:
res = requests.get(url, stream=True)
if res.ok:
    file_ = io.BytesIO(res.content)
else:
     raise IOError()


In [5]:
buffer = io.StringIO(file_.read().decode('UTF-8'))

In [6]:
buffer

<_io.StringIO at 0x1496d141dc0>

In [7]:
class ZepyhrSensor:
    """Per sensor object designed to wrap the csv files returned by the Plume API.

    Example Usage:
        ps = PlumeSensor.from_csv("16397", open("sensor_measures_20211004_20211008_1.csv"))
        print(ps.DataFrame)
    """

    def __init__(self, id_, header: List, rows: List):
        self.id = id_
        self.header = header
        self.rows = rows

    def add_row(self, row: Iterable):
        """Normalise and append row to internal list.

        Coverts all digits to int objects, all elements are initially converted to strings before
        digit check to avoid type errors.

        :param row: row to add to plume sensor
        """
        self.rows.append([int(i) if str(i).isdigit() else i for i in row])
    
    @property
    def dataframe(self) -> pd.DataFrame:
        """Writes headers and rows into a DataFrame.
        """
        return pd.DataFrame(self.rows, columns=self.header)

In [8]:
sensor_id = '814'
reader = csv.reader(buffer, dialect=csv.unix_dialect)
header = next(reader)
sensor = ZepyhrSensor(sensor_id, header, [])
for row in reader:
    sensor.add_row(row)


In [9]:
sensor.dataframe.head()

Unnamed: 0,"{""slotB"": {""NO"": {""header"": {""key"": ""NO""","""group"": ""NO""","""label"": ""NO""","""units"": ""ug/m3""","""CSVOrder"": 6","""HTMLLabel"": ""NO""}","""data"": [0",0,2,0.1,...,"""</p>""]","""errors"": []","""timings"": {""generateCalibratedData"": 3.926924","""overall"": 4.848803","""generateCalibratedData_getRawData_fromDB"": 2.564961","""generateCalibratedData_runCDGs"": 0.531226","""generateCalibratedData_getRawData_process"": 0.752605","""processAndPackageCalibratedData"": 0.468759}","""datahash"": ""9f92745d""","""indent"": 0}}"


In [None]:
zephyr_username = "AstonUniversity"
zephyr_password = "Xo08R83d43e0Kk6"

In [None]:
url = "https://data.earthsense.co.uk/zephyrsForUser/%s/%s" % (zephyr_username, zephyr_password)
response = requests.get(url)

In [None]:
print(response.status_code)

In [None]:
def jprint(obj):
    # create a formatted string of the Python JSON object
    zepyhr_id = json.dumps(obj, sort_keys=True, indent=4)
    return zepyhr_id

In [None]:
pass_times = response.json()['usersZephyrs']['13883']['zNumber']
jprint(pass_times)

In [None]:
jsonObject = response.json()['usersZephyrs']
zephyr_id_list = []

for key in jsonObject:
    pass_times = response.json()['usersZephyrs'][key]['zNumber']
    zephyr_id_list.append(jprint(pass_times))
    
print(zephyr_id_list)

# Download and extraction 

In [None]:
start_datetime = "202108180000"
end_datetime = "202108200000"
slots = "B"
format_output = "csv"
target = "api"

In [None]:
for zephyr_id in zephyr_id_list:
    url = "https://data.earthsense.co.uk/dataForViewBySlots/%s/%s/%s/%s/%s/%s/def/%s/%s" % (zephyr_username, zephyr_password, zephyr_id,start_datetime,end_datetime,slots,format_output,target)

    #We can use the wget library to download the zip file.
    #We can then extract the file into a local "data" folder 
    #The final step is to identify the paths of the csv files we need.
    try: 
        target = wget.download(url, "..\data\zephyrs")
    except BaseException as error:
        print('An exception occurred: {}'.format(error))

In [None]:
zephyr_csv = []

for zephyr_id in zephyr_id_list:
    url = "https://data.earthsense.co.uk/dataForViewBySlots/%s/%s/%s/%s/%s/%s/def/%s/%s" % (zephyr_username, zephyr_password, zephyr_id,start_datetime,end_datetime,slots,format_output,target)
    response = requests.get(url)
    if response.status_code == 200:
        zephyr_jsons.append(response.json()['slotB'])
    else:
        print("HTTP request error code: " + response.status_code)

# json to dt

In [None]:
#extracting and preparing the dataframe from the json objects
dfList = []
for jsonObject in zephyr_jsons:
    df_temp = pd.DataFrame.from_records(jsonObject)
    df_temp.drop('header', axis =0, inplace=True)
    df_temp.drop('data_hash', axis =0, inplace=True)
    df_temp.drop('UTS', axis=1, inplace=True)

    #explode function transform each element of a list to a row.
    #we can apply it to all the columns assuming they have the same number of elements in each list 
    df_temp = df_temp.apply(pd.Series.explode)
    
    dfList.append(df_temp)

measurement_dictionary = {k:v for k,v in zip(zephyr_id_list,dfList)}
dfList = [] # clear dataframe list


# Splitting dataframe into days

In [None]:
def dataSplit(df_temp,dateString):
    
    data = {} #intialise empty dictionary to store each day of locations
    
    df_temp = measurement_dictionary[key]

    # using the dates which are already supplied. This strategy in the line below converts them and rounds down to date using 'd' flag
    # This strategy (line below) will keep just the date
    df_temp['day'] = pd.to_datetime(df_temp[dateString], dayfirst=True, errors='coerce').dt.date

    the_unique_dates = df_temp['day'].unique()
    #print('Unique dates:',the_unique_dates)
    # this gives the same result as the for loop below
    
    #splitting the dataframe into separate days
    #for each day in unique dates set:
    for day in the_unique_dates:
        try:
            # In my code below I assign the subset of records to a new dataframe called dft
            # create 'midnight' timestamps
            timestampKey = int((pd.to_datetime(day, errors='coerce')).timestamp())

            # select the records for this day
            dft = df_temp[df_temp['day']==day]

            # #drop the date column to save space (we don't need this anymore)
            #dft = dft.drop(dateString, axis=1)
            dft = dft.drop("day", axis=1)
            dft = dft.dropna() #drop null values
            
            data[timestampKey] = dft

        except KeyError as e:
            print(e)

    return data

In [None]:
for key in measurement_dictionary:
    measurement_dictionary[key] = dataSplit(measurement_dictionary[key],'dateTime')

location_dictionary = measurement_dictionary 

# Separating each list 

In [None]:
def calculateTimestamps(df):
    #set the index to Timestamp and automatically drops the datetime index
    df['Timestamp'] = df.index.values.astype(np.int64) // 10 ** 9
    df.set_index('Timestamp',inplace=True)
    return df

In [None]:
def dataframeAverages(intervalString,df):
    #convert all column types to int
    df = df.astype(int)
    #resample by minute and get mean
    df= df.resample(intervalString).mean() 

    df.isnull().sum() # identify any null values
    df.dropna(inplace=True) #drop null values
    #df = df.astype(int)  #convert avergaes back to int
    return df

In [None]:
for key in measurement_dictionary:
    for timestamp in measurement_dictionary[key]:
        df_temp = measurement_dictionary[key][timestamp]
        df_temp.set_index('dateTime',inplace=True)
        df_temp.index = pd.to_datetime(df_temp.index,dayfirst=True)

        #extracting the location data
        ldf = df_temp[['latitude','longitude']]
        df_temp.drop(['latitude','longitude'],axis=1, inplace=True) #drop the lcoation columns 

        #calculate minute averages
        #df_temp = dataframeAverages('1min',df_temp)
        #df_temp = calculateTimestamps(df_temp)

        #reassign new location dataframe to dictionary
        location_dictionary[key][timestamp] = ldf

        #reassign new measurement dataframe to dictionary
        measurement_dictionary[key][timestamp] = df_temp

In [None]:
for key in measurement_dictionary:
    for timestamp in measurement_dictionary[key]:
        df_temp = measurement_dictionary[key][timestamp]
    #break

df_temp.head()

# looking at the data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import numpy as np
import matplotlib.dates as mdates

In [None]:
df.head()

In [None]:
df = dataframeAverages('60min',df_temp)

myTimeFormat = mdates.DateFormatter('%D:%H:%M')

plt.style.use('fivethirtyeight')
fig = plt.figure(figsize = (24,20))
axs = fig.add_suplot(111)

axs.set_title('PM partiuclate hourly average')
axs.plot(df.index, df["particulatePM1"], color = (0.4,0.4,0.4), linewidth = 4, alpha = .9, label = 'pm 1')
axs.plot(df.index, df["particulatePM25"], color = (0.6,0.6,0.9), linewidth = 4, alpha = .9, label = 'pm 2.5')
#axs.plot(df.index, df["particulatePM10"], color = (0.9,0.6,0.6), linewidth = 4, alpha = .9, label = 'pm 10')
axs.set_xlabel("Time")
axs.set_ylabel("particualte matter (ug/m3)")
axs.xaxis.set_major_formatter(myTimeFormat)
axs.legend()

plt.show()

# locations

In [None]:
def calculateSensorMovement(ldf):
    precision = 0.00025 #this precision constant is derived from the GPS min and max readings which have 0.0003+/- variance for a stationary sensor 

    full_location_storage = False

    #check if the sensor has moved greater than a speicifed precision. 
    if ((ldf['latitude'].min() + precision) > ldf['latitude'].max()) or ((ldf['latitude'].max() - precision) < ldf['latitude'].min()):
        full_location_storage = True
    elif ((ldf['longitude'].min() + precision) > ldf['longitude'].max()) or ((ldf['longitude'].max() - precision) < ldf['longitude'].min()):
        full_location_storage = True
    else:
        full_location_storage = False

    return full_location_storage

In [None]:
def calculateBoundingBox(ldf):
    min_y= ldf['longitude'].min()
    max_y = ldf['longitude'].max()

    min_x= ldf['latitude'].min()
    max_x = ldf['latitude'].max()
    geometry_string = "POLYGON(({} {}, {} {}, {} {}, {} {},{} {}))".format(min_x,min_y,   min_x,max_y,   max_x,max_y,   max_x,min_y,   min_x,min_y)

In [None]:
data = {}
hoursummary = []

ldf = ldf.resample('60min').agg(['min','max']) #resample only works with datetimes
ldf['Timestamp'] = ldf.index.values.astype(np.int64) // 10 ** 9
ldf.set_index('Timestamp',inplace=True)

    #extracting a bounding box from hourly readings
try:
    for row in ldf.iterrows():
        geometry_string = ""

        tempArray = [] 
        min_x= row[1][0]    # lat
        max_x = row[1][1]   # lat
                
        min_y= row[1][2]    # long
        max_y = row[1][3]   # long

        geometry_string = "POLYGON(({} {}, {} {}, {} {}, {} {},{} {}))".format(min_x,min_y,   min_x,max_y,   max_x,max_y,   max_x,min_y,   min_x,min_y)

        tempArray = [geometry_string,len(ldf.index.values)] #inserting row into temp array
    hoursummary.append(tempArray)

except Exception as e:
    print('The dataframe is empty therefore no bounding box will be applied :{0}'.format(e))

In [None]:
print(hoursummary[0])
#ldf.head()