In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
import numpy as np

In [2]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook itself: they are read from
# the environment, with an interactive prompt as a fallback when a variable is
# not set. Any password that was previously committed in this cell should be
# treated as leaked and rotated.
zephyr_username = os.environ.get("ZEPHYR_USERNAME") or input("Zephyr username: ")
zephyr_password = os.environ.get("ZEPHYR_PASSWORD") or getpass("Zephyr password: ")

In [3]:
# Ask the API which Zephyr units belong to this account.
# NOTE: the credentials are part of the URL path, so never print or log `url`.
url = "https://data.earthsense.co.uk/zephyrsForUser/%s/%s" % (zephyr_username, zephyr_password)
# Without a timeout requests waits forever on an unresponsive server.
response = requests.get(url, timeout=30)

In [4]:
# Show the HTTP status code of the device-list request (200 in the recorded output).
print(response.status_code)

200


In [5]:
def jprint(obj):
    """Serialise ``obj`` to JSON text with sorted keys and a 4-space indent.

    Despite the name nothing is printed: the formatted string is returned.
    """
    return json.dumps(obj, indent=4, sort_keys=True)

In [6]:
# Spot check: read the 'zNumber' of a single entry of 'usersZephyrs'.
# '13883' is a hardcoded key of that mapping (presumably one device of this
# account - confirm before reusing the notebook with another account).
# NOTE(review): despite its name, `pass_times` holds the zNumber value.
pass_times = response.json()['usersZephyrs']['13883']['zNumber']
jprint(pass_times)

'814'

In [7]:
# Collect the zNumber of every entry under 'usersZephyrs' as JSON-formatted
# strings. The response body is parsed once here instead of once per entry.
jsonObject = response.json()['usersZephyrs']
zephyr_id_list = [jprint(zephyr['zNumber']) for zephyr in jsonObject.values()]

print(zephyr_id_list)

['814', '821']


In [8]:
# Parameters of the measurement download; they are substituted, in this order,
# into the dataForViewBySlots URL built in the download cell.
start_datetime = "202108180000"  # presumably YYYYMMDDhhmm - confirm against the API documentation
end_datetime = "202108210000"
slots = "B"  # the download cell reads the 'slotB' key of each response
format_output = "json"
target = "api"

In [9]:
# Download the slot-B measurements of every Zephyr in zephyr_id_list.
zephyr_jsons = []
downloaded_ids = []

for zephyr_id in zephyr_id_list:
    # NOTE: the credentials are part of the URL path, so never print or log `url`.
    url = "https://data.earthsense.co.uk/dataForViewBySlots/%s/%s/%s/%s/%s/%s/def/%s/%s" % (zephyr_username, zephyr_password, zephyr_id,start_datetime,end_datetime,slots,format_output,target)
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        zephyr_jsons.append(response.json()['slotB'])
        downloaded_ids.append(zephyr_id)
    else:
        # status_code is an int, so it has to be converted before concatenation
        print("HTTP request error code: " + str(response.status_code))

# Later cells pair zephyr_id_list with zephyr_jsons by position, so ids whose
# download failed are dropped to keep both lists aligned.
zephyr_id_list = downloaded_ids

# JSON to DataFrame

In [10]:
# Turn every downloaded JSON payload into a tidy DataFrame.
dfList = []
for jsonObject in zephyr_jsons:
    df_temp = (
        pd.DataFrame.from_records(jsonObject)
        # 'header' and 'data_hash' are row labels, 'UTS' is a column
        .drop(index=['header', 'data_hash'])
        .drop(columns='UTS')
        # explode turns each element of a list into its own row; applying it to
        # every column assumes all lists have the same number of elements
        .apply(pd.Series.explode)
    )
    dfList.append(df_temp)

# pair every Zephyr id with its DataFrame (both lists are in the same order)
measurement_dictionary = dict(zip(zephyr_id_list, dfList))
dfList = []  # release the temporary list


# Splitting dataframe into days

In [11]:
def dataSplit(df_temp,dateString):
    """Split a DataFrame into one DataFrame per calendar day.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame containing a date/time column. It is not modified.
    dateString : str
        Name of the date/time column; values are parsed day-first and
        unparseable values are ignored.

    Returns
    -------
    dict
        Maps the Unix timestamp (integer seconds) of each day's midnight to the
        rows recorded on that day, with rows containing nulls dropped.
    """
    data = {}

    # Work on a copy of the frame that was passed in: the helper column must not
    # leak into the caller's data, and no module-level variable is consulted.
    frame = df_temp.copy()

    # Parse the supplied dates and keep only the calendar date of each row.
    frame['day'] = pd.to_datetime(frame[dateString], dayfirst=True, errors='coerce').dt.date

    # Dates that could not be parsed are NaT and have no midnight timestamp,
    # so they are left out of the set of days.
    for day in frame['day'].dropna().unique():
        # 'midnight' timestamp of the day, used as the dictionary key
        timestampKey = int(pd.to_datetime(day).timestamp())

        # rows of this day, without the helper column and without null values
        data[timestampKey] = frame[frame['day'] == day].drop('day', axis=1).dropna()

    return data

In [12]:
# Replace every sensor's DataFrame with its per-day split.
for key, sensor_frame in list(measurement_dictionary.items()):
    measurement_dictionary[key] = dataSplit(sensor_frame, 'dateTime')

# Starts empty; it is filled per sensor once the coordinates are separated
# from the measurements.
location_dictionary = {}

# Separating each list 

In [13]:
def calculateTimestamps(df):
    """Re-index ``df`` in place by integer seconds and return it.

    The current index values are cast to int64 and floor-divided by 10**9
    (nanoseconds to seconds for a nanosecond-resolution datetime index); the
    result becomes the new index, named 'Timestamp', replacing the old one.
    """
    epoch_seconds = df.index.values.astype(np.int64) // 10 ** 9
    df['Timestamp'] = epoch_seconds
    df.set_index('Timestamp', inplace=True)
    return df

In [14]:
def dataframeAverages(intervalString,df):
    """Average ``df`` over fixed time intervals.

    All columns are cast to int, the frame is resampled with the
    ``intervalString`` frequency (e.g. '1min') and the mean of every interval is
    taken. Intervals that contain no rows are dropped from the result.
    """
    averaged = df.astype(int).resample(intervalString).mean()
    return averaged.dropna()

In [15]:
# For every sensor and day: set the coordinates aside and reduce the remaining
# measurement columns to one-minute averages keyed by Unix timestamp.
for key in measurement_dictionary:
    temp_locations = {}
    temp_measurements = {}

    for timestamp, df_temp in measurement_dictionary[key].items():
        # index the day's rows by their parsed (day-first) date/time
        df_temp.set_index('dateTime',inplace=True)
        df_temp.index = pd.to_datetime(df_temp.index,dayfirst=True)

        # the coordinates go into the location dictionary ...
        ldf = df_temp[['latitude','longitude']]
        temp_locations[timestamp] = ldf

        # ... and everything else is averaged per minute
        df_temp.drop(['latitude','longitude'],axis=1, inplace=True)
        df_temp = calculateTimestamps(dataframeAverages('1min',df_temp))
        temp_measurements[timestamp] = df_temp

    location_dictionary[key] = temp_locations
    measurement_dictionary[key] = temp_measurements

    # start the next sensor from fresh dictionaries
    temp_locations = {}
    temp_measurements = {}

# Preparing data for postgres

In [16]:
# for key in location_dictionary:
#     for timestamp in location_dictionary[key]:
#         #ldf = temp_locations[timestamp]
#         ldf = location_dictionary[key][timestamp]
#     break
# ldf.head()

In [17]:
def calculateSensorMovement(ldf, precision=0.00025):
    """Tell whether the coordinates in ``ldf`` span more than ``precision``.

    Parameters
    ----------
    ldf : pandas.DataFrame
        Frame with 'latitude' and 'longitude' columns.
    precision : float, optional
        Largest latitude/longitude span still treated as "not moved". The
        default was derived from the GPS min and max readings, which have
        0.0003+/- variance for a stationary sensor.

    Returns
    -------
    bool
        True when the latitude span or the longitude span exceeds
        ``precision`` (the sensor moved, so all locations should be stored),
        otherwise False. An empty frame yields False.
    """
    lat_span = ldf['latitude'].max() - ldf['latitude'].min()
    lon_span = ldf['longitude'].max() - ldf['longitude'].min()

    # The sensor has moved only when a span is LARGER than the precision; a NaN
    # span (no data) compares as False.
    return bool(lat_span > precision or lon_span > precision)

In [18]:
def calculateBoundingBox(ldf):
    """Return the bounding box of the coordinates in ``ldf`` as a WKT polygon.

    Parameters
    ----------
    ldf : pandas.DataFrame
        Either a frame with plain 'latitude' and 'longitude' columns, or the
        result of ``.agg(['min', 'max'])`` on such a frame, whose columns are
        ('latitude' | 'longitude', 'min' | 'max') pairs.

    Returns
    -------
    str
        "POLYGON((...))" with five "x y" corners (the first one repeated to
        close the ring), where x is the longitude and y the latitude.

    Raises
    ------
    ValueError
        If ``ldf`` has no rows.
    """
    if len(ldf.index) == 0:
        raise ValueError("cannot build a bounding box from an empty location frame")

    # Columns are addressed by label instead of by position, so the function
    # works for both layouts and does not depend on the column order.
    if isinstance(ldf.columns, pd.MultiIndex):
        min_x = ldf[('longitude', 'min')].min()
        max_x = ldf[('longitude', 'max')].max()
        min_y = ldf[('latitude', 'min')].min()
        max_y = ldf[('latitude', 'max')].max()
    else:
        min_x = ldf['longitude'].min()
        max_x = ldf['longitude'].max()
        min_y = ldf['latitude'].min()
        max_y = ldf['latitude'].max()

    return "POLYGON((%f %f, %f %f, %f %f, %f %f,%f %f))" % (min_x,min_y,   min_x,max_y,   max_x,max_y,   max_x,min_y,   min_x,min_y)

In [19]:
# Build, for every sensor and day, one summary row and one pair of JSON
# documents (measurements, locations), both keyed by "<day timestamp>_<sensor>".
sensor_summaries = {}
sensor_data = {}

for key in zephyr_id_list:

    # Either dictionary can drive the loop as they share the same days.
    # .get() lets a sensor without any data be skipped instead of raising.
    for timestampKey in location_dictionary.get(key, {}):

        timestamp_sensor_key = "%s_%s" % (timestampKey, key)

        # Start every day from empty values so that a failure below cannot
        # silently reuse the geometry or the frames of the previous day.
        geometry_string = ""
        ldf = pd.DataFrame()
        mdf = pd.DataFrame()

        # try to get the location dataframe and its bounding box
        try:
            ldf = location_dictionary[key][timestampKey]

            if calculateSensorMovement(ldf):
                # moving sensor: keep every location of the day
                print("Storing all measurements for this moving sensor")
                geometry_string = calculateBoundingBox(ldf)
            else:
                # stationary sensor: the box comes from the day's min and max,
                # and the stored location is reduced to the daily average
                temp_ldf = ldf.resample('D').agg(['min','max'])
                geometry_string = calculateBoundingBox(temp_ldf)

                ldf = ldf.astype(float)
                ldf = ldf.resample('D').mean()
                ldf = calculateTimestamps(ldf)

        except Exception as e:
            print('The dataframe is empty therefore no bounding box will be applied :{}'.format(e))

        # try to get the measurement dataframe
        try:
            mdf = measurement_dictionary[key][timestampKey]
        except KeyError as e:
            print('The measurement dataframe is empty. check csv files :{}'.format(e))

        # summaryArray = [timestamp_start, sensor_id, bounding_box, measurement_count]
        summaryArray = [timestampKey,int(key),geometry_string,len(mdf.index)]
        sensor_summaries[timestamp_sensor_key] = summaryArray

        # dataArray = [measurement_json, location_json]
        dataArray = [mdf.to_json(orient="columns"),ldf.to_json(orient="columns")]
        sensor_data[timestamp_sensor_key] = dataArray

# Exporting to PostgreSQL


# Writing records from a Dictionary of arrays to a SQL database
Loop over all the keys and execute an insert query for each one.

In [99]:
import psycopg2

In [100]:
import os

# Connecting to an existing database.
# Host, database and user can be overridden through the standard libpq
# environment variables. The password is deliberately NOT passed here: libpq
# takes it from PGPASSWORD or from the ~/.pgpass file, so it never has to be
# written into the notebook.
con = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "airQuality"),
    user=os.environ.get("PGUSER", "Riyad"),
    # attempt to connect for 3 seconds then raise exception
    connect_timeout = 3)

In [101]:
# Load the sensors table into a DataFrame indexed by its 'zephyr_id' column.
# pandas runs the query on the connection itself, so no cursor is needed here.
query = "SELECT * FROM sensor_network.sensors"
sensorsdf = pd.read_sql_query(query, con, index_col='zephyr_id')
sensorsdf = sensorsdf.convert_dtypes() #convert to correct types

In [102]:
# Preview the first rows of the sensors table loaded above.
sensorsdf.head()

Unnamed: 0_level_0,plume_id,sensor_serial_number,id,type_id
zephyr_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,17539,02:00:00:00:43:8b,1,1
,19115,02:00:00:00:49:b3,2,1
,17521,02:00:00:00:43:79,3,1
,16397,02:00:00:00:3f:15,4,1
,16701,02:00:00:00:40:45,5,1


In [103]:
# Sanity check: print the database id of the sensor behind the first summary.
# Summary keys look like "<day timestamp>_<zephyr number>".
first_key = next(iter(sensor_summaries), None)

if first_key is None:
    # nothing was summarised, so there is nothing to look up
    print("No sensor summaries were generated")
else:
    #split key to get only sensorid
    s = first_key.split('_')[1]
    print(sensorsdf.loc[int(s)]['id'])

31


In [104]:
# Store every day of every sensor: first the JSON documents, then the summary
# row that references them.
for key in sensor_summaries:

    # Keys look like "<day timestamp>_<zephyr number>"; keep only the sensor part
    s = key.split('_')[1]
    # translate the Zephyr number into the id used by the sensors table
    sensor_id = sensorsdf.loc[int(s)]['id']

    summary = sensor_summaries[key]  # [timestamp_start, zephyr number, b_box, measurement_count]
    measurements_json, locations_json = sensor_data[key]

    try:
        # the cursor is closed when the with-block is left, even on an error
        with con.cursor() as cursor:

            #inserting sensor data and return the id of new record
            cursor.execute("INSERT INTO sensor_data.archive_measurements (measurements,locations) VALUES(%s, %s) \n RETURNING id", (str(measurements_json),str(locations_json)) )
            sensor_data_id = cursor.fetchone()[0]

            #inserting sensor summary
            cursor.execute("INSERT INTO sensor_network.sensor_summaries (timestamp_start, sensor_id, b_box, sensor_data_id, measurement_count) VALUES(%s, %s, %s, %s, %s)", (int(summary[0]), int(sensor_id),str(summary[2]),int(sensor_data_id),int(summary[3])))

        # Both rows are committed together so that a failing summary insert
        # cannot leave an archive row without its summary.
        con.commit()

    #if table name does not exist exit loop
    except psycopg2.errors.UndefinedTable as error:
        con.rollback()  # leave the aborted transaction
        print('ERROR: ' + str(error))
        break
    except psycopg2.Error:
        # undo the partial work of this iteration, then report the failure
        con.rollback()
        raise


In [105]:
# Close the database connection opened for the sensors query and the inserts.
con.close()