# Extracting data from csvs

In [1]:
%store -r sensorIds
print(sensorIds)

['18699', '18720', '18749']


In [2]:
#temporary reassigning of paths (since plume dashboard not working)

# we need two dictionaries (one for measurements and one for location)
measure_dictionary = {}
location_dictionary = {}

sensorPaths = []

for snum in sensorIds:
    sensorPaths.append("..\\data\\flow\\temp\\sensor_"  + snum)
print(sensorPaths)


['..\\data\\flow\\temp\\sensor_18699', '..\\data\\flow\\temp\\sensor_18720', '..\\data\\flow\\temp\\sensor_18749']


In [3]:
import pandas as pd
import numpy as np
import psycopg2
from datetime import datetime
from datetime import timezone
import json
import os

In [80]:
csvMeasurementsPath = []
csvPostionsPath = []

for i in sensorPaths:
    with os.scandir(i) as listOfFiles:
        for currentFile in listOfFiles:
            # get all files that are csv
            if currentFile.is_file() and currentFile.name.endswith('csv'):
                 if "positions" not in currentFile.name:
                    csvMeasurementsPath.append(os.path.join(i,currentFile.name)) 
                 else:
                    csvPostionsPath.append(os.path.join(i,currentFile.name)) 

In [81]:
#map sensor id to filepaths
measure_dictionary = {k:v for k,v in zip(sensorIds,csvMeasurementsPath)}
location_dictionary = {k:v for k,v in zip(sensorIds,csvPostionsPath)}
print(measure_dictionary)
print(location_dictionary)

{'18699': '..\\data\\flow\\temp\\sensor_18699\\user_measures.csv', '18720': '..\\data\\flow\\temp\\sensor_18720\\user_measures.csv', '18749': '..\\data\\flow\\temp\\sensor_18749\\user_measures.csv'}
{'18699': '..\\data\\flow\\temp\\sensor_18699\\user_positions.csv', '18720': '..\\data\\flow\\temp\\sensor_18720\\user_positions.csv', '18749': '..\\data\\flow\\temp\\sensor_18749\\user_positions.csv'}


# Intialising the measurement dataframe

In [82]:
endConst= "00:01:00"    #intialising constant of midnight in UTC 
endConst= datetime.strptime(endConst,"%H:%M:%S").time()     #convert to time only (already in UTC)
# #MIDNIGHT TO MIDNIGHT SEARCH WILL NOT WORK ON LOCATIONS BECAUSE THEY ARE NOT TAKEN PER MINUTE

In [83]:
locations = {} #intialise empty dictionary to store each day of locations


#bringing the csv to pandas dataframes
#put the dataframes into a dictionary with the sensor id as the key
for key in sensorIds:
    location_dictionary[key] = pd.read_csv(location_dictionary[key],parse_dates=True, index_col="timestamp")

    df = location_dictionary[key]

    #dropping the date column 
    df.drop("date", axis=1, inplace=True)

    #get all the timestamps
    timestamps = df.index.values

    dayset = set() #holds each day

    for t in timestamps:
        t = int(t)
        dt = datetime.fromtimestamp(t, tz=timezone.utc).date()

        if dt not in dayset:
            dayset.add(dt)

   
    ################################################################

    #splitting dataframes into daily basis (UTC) midnight to midnight
    indexes = []
    
    #sort the days
    dayset = sorted(dayset)

    #convert and store timestamp ranges of each day
    for day in dayset:
        dt = datetime.combine(day, endConst)
        dt = int(dt.replace(tzinfo=timezone.utc).timestamp())
        indexes.append(dt)
        #indexes.append(day)

    timestamp_pair = [] # empty array to hold start and end timestamp
    days = [] #holds each timestamp pair as a day
    i = 0 #local counter

    while i < (len(indexes)-1):
        timestamp_pair.append(indexes[i]) 
        timestamp_pair.append(indexes[i+1])
        days.append(timestamp_pair)
        timestamp_pair = [] # empty the pair
        
        i += 1

        #if i is even then skip onto next pair
        if (i % 2 == 0):
          i += 1  
   

    #splitting the dataframe into separate days
    for pair in days:
        try:
            timestampKey = str(pair[0]) + "_" +  str(pair[1]) 
            #mask = (df.index > pair[0]) & (df.index <= pair[1])
            #df = df.loc[mask]
            df = df.loc[pair[0]:pair[1]]
            locations[timestampKey] = df
        except KeyError:
            print("find next time")
        #print(df)

    break


In [84]:
print(locations)

{'1626480060_1626566460':             latitude  longitude
timestamp                      
1626501291 -1.889811  52.452676
1626502670 -1.889395  52.452940
1626502734 -1.889774  52.452767
1626503572 -1.890229  52.452835
1626503578 -1.890589  52.453279
...              ...        ...
1626558816 -1.885437  52.450709
1626558945 -1.889935  52.452639
1626561645 -1.889937  52.452638
1626561651 -1.889754  52.452967
1626562826 -1.889937  52.452638

[1043 rows x 2 columns], '1626566460_1626652860': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1626739260_1626825660': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1626912060_1626998460': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627084860_1627171260': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627257660_1627344060': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627430460_1627516860': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627603260_1627689660': Empty Dat

In [79]:
df = location_dictionary[sensorIds[0]]
df.loc[int(1626566460):int(1626652860)]

Unnamed: 0_level_0,latitude,longitude
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1626585426,-1.885194,52.450762
1626585456,-1.889596,52.452820
1626585546,-1.889945,52.452646
1626585834,-1.889599,52.452829
1626587195,-1.889951,52.452635
...,...,...
1626645141,-1.885044,52.450716
1626645267,-1.889617,52.452723
1626645502,-1.885044,52.450716
1626645628,-1.889604,52.452718


In [72]:
df = location_dictionary[sensorIds[0]]
df.head()

Unnamed: 0_level_0,latitude,longitude
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1626501291,-1.889811,52.452676
1626502670,-1.889395,52.45294
1626502734,-1.889774,52.452767
1626503572,-1.890229,52.452835
1626503578,-1.890589,52.453279


In [193]:
#location_dictionary[key] = locations
    #print(locations)
    #locations = {} #clear dataframe after each iteration

    #break

{'1626476400_1626562800':             latitude  longitude
timestamp                      
1626476400 -1.836325  52.425768, '1626562800_1626649200': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1626735600_1626822000': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1626908400_1626994800': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627081200_1627167600': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627254000_1627340400': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627426800_1627513200': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627599600_1627686000': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627772400_1627858800': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1627945200_1628031600': Empty DataFrame
Columns: [latitude, longitude]
Index: [], '1626130800_1626390000':             latitude  longitude
timestamp                      
1626159288 -1.896519  52.482877, '1626822

In [21]:
endConst= "00:01:00"    #intialising constant of midnight in UTC 
endConst= datetime.strptime(endConst,"%H:%M:%S").time()     #convert to time only (already in UTC)

In [31]:
print(location_dictionary[sensorIds[2]])

{1626480094:             latitude  longitude
timestamp                      
1626480094 -1.836324  52.425765
1626481017 -1.836324  52.425765}


In [21]:
endConst= "00:01:00"    #intialising constant of midnight in UTC 
endConst= datetime.strptime(endConst,"%H:%M:%S").time()     #convert to time only (already in UTC)

In [22]:
measurements = {} #intialse empty dictioanry to store each day of measurements

#bringing the csv to pandas dataframes
#put the dataframes into a dictionary with the sensor id as the key
for key in sensorIds:
    measure_dictionary[key] = pd.read_csv(measure_dictionary[key],parse_dates=True, index_col="timestamp")

    #dropping the date column 
    df = measure_dictionary[key]
    df.drop("date (UTC)", axis=1, inplace=True)

    #splitting dataframes into daily basis (UTC) midnight to midnight
    indexes = []

    #get all the timestamps
    timestamps = df.index.values

    for t in timestamps:
        t = int(t)
        dt = datetime.fromtimestamp(t, tz=timezone.utc).time()
  
        if (endConst > dt):
            indexes.append(t)
    

    timestamp_pair = [] # empty array
    days = []
    i= 0 

    while i < (len(indexes)-1):
        timestamp_pair.append(indexes[i]) 
        timestamp_pair.append(indexes[i+1])
        days.append(timestamp_pair)
        i += 1

        #if i is even then skip onto next pair
        if (i % 2 == 0):
          i += 1  

    #splitting the dataframe into separate days
    for pair in days:
        timestamp = pair[0]
        measurements[timestamp] = df.loc[pair[0]:pair[1]]
        
    measure_dictionary[key] = measurements

    measurements = {} #clear dataframe

In [9]:
for key in sensorIds:
    for l_dictionary in location_dictionary[key]:
        for timestampKey in l_dictionary:
            df = l_dictionary[timestampKey]
            print(df.head(1))
        
# min_long= df['longitude'].min()
# max_long = df['longitude'].max()

# min_lat= df['latitude'].min()
# max_lat = df['latitude'].max()

# print("min_longitude: " + str(min_long) + "\n max_logitude: " + str(max_long))
# print("min_latitude: " + str(min_lat) + "\n max_latitude: " + str(max_lat))

# Preparing data for upload into PostGres

In [12]:
sensor_summaries = {}
sensor_data = {}

geometry_string = ""

ldf = df
mdf = df

for timestampKey in location_dictionary[sensorIds[0]]:
    #location dataframe
    ldf = locations[timestampKey]

    #create bounding box polygon
    min_y= ldf['longitude'].min()
    max_y = ldf['longitude'].max()

    min_x= ldf['latitude'].min()
    max_x = ldf['latitude'].max()
    #POLYGON(minx miny, minx Maxy, maxx Maxy, maxx miny, minx miny)
    geometry_string = "POLYGON(({} {}, {} {}, {} {}, {} {},{} {}))".format(min_x,min_y,   min_x,max_y,   max_x,max_y,   max_x,min_y,   min_x,min_y)

print(geometry_string)
        
    # for timestampKey in measure_dictionary[key]:
        
    #     #create new key for sensor summaries
    #     timestamp_sensor_key = str(df.index[0]) + "_" + str(df.index[-1]) + "_" +  str(int(key))

    #     #measurement dataframe
    #     mdf = measurements[timestampKey]

    #     #summaryArray = [timestamp_start,timestamp_end,sensor_id,bouding_box,measurement_count]
    #     summaryArray = [mdf.index[0],mdf.index[-1],int(key),geometry_string,len(mdf.index.values)] #inserting row into temp array
    #     sensor_summaries[timestamp_sensor_key] = summaryArray    #assign new dataframe to coressponding key
        
    #     ##########print(sensor_summaries[timestamp_sensor_key])

    #     #dataArray = [id, mesaurement_json,location_json]
    #     dataArray = [mdf.to_json(orient="columns"),ldf.to_json(orient="columns")]     #dataArray = [mdf.to_json('./{}.json'.format(key),orient="columns"),ldf.to_json(orient="columns")]
    #     sensor_data[timestamp_sensor_key] = dataArray    #assign new dataframe to coressponding key    




In [10]:
# min_long= df['longitude'].min()
# max_long = df['longitude'].max()

# min_lat= df['latitude'].min()
# max_lat = df['latitude'].max()

# print("min_longitude: " + str(min_long) + "\n max_logitude: " + str(max_long))
# print("min_latitude: " + str(min_lat) + "\n max_latitude: " + str(max_lat))

In [11]:
# sensor_summaries = {}
# sensor_data = {}

# geometry_string = ""

# ldf = df
# mdf = df

# for key in sensorIds:
#     for timestampKey in location_dictionary[sensorIds[0]]:
#         #location dataframe
#         ldf = locations[timestampKey]

#         #create bounding box polygon
#         min_y= ldf['longitude'].min()
#         max_y = ldf['longitude'].max()

#         min_x= ldf['latitude'].min()
#         max_x = ldf['latitude'].max()
#         #POLYGON(minx miny, minx Maxy, maxx Maxy, maxx miny, minx miny)
#         geometry_string = "POLYGON(({} {}, {} {}, {} {}, {} {},{} {}))".format(min_x,min_y,   min_x,max_y,   max_x,max_y,   max_x,min_y,   min_x,min_y)

# print(geometry_string)
        
#     # for timestampKey in measure_dictionary[key]:
        
#     #     #create new key for sensor summaries
#     #     timestamp_sensor_key = str(df.index[0]) + "_" + str(df.index[-1]) + "_" +  str(int(key))

#     #     #measurement dataframe
#     #     mdf = measurements[timestampKey]

#     #     #summaryArray = [timestamp_start,timestamp_end,sensor_id,bouding_box,measurement_count]
#     #     summaryArray = [mdf.index[0],mdf.index[-1],int(key),geometry_string,len(mdf.index.values)] #inserting row into temp array
#     #     sensor_summaries[timestamp_sensor_key] = summaryArray    #assign new dataframe to coressponding key
        
#     #     ##########print(sensor_summaries[timestamp_sensor_key])

#     #     #dataArray = [id, mesaurement_json,location_json]
#     #     dataArray = [mdf.to_json(orient="columns"),ldf.to_json(orient="columns")]     #dataArray = [mdf.to_json('./{}.json'.format(key),orient="columns"),ldf.to_json(orient="columns")]
#     #     sensor_data[timestamp_sensor_key] = dataArray    #assign new dataframe to coressponding key    




In [11]:
# # reading the JSON data using json.loads(json string)
# # converting json dataset from dictionary to dataframe
# dict_data = json.loads(sensor_data[sensorIds[0]][0])
# data = pd.DataFrame.from_dict(dict_data, orient='columns')
# data.index.rename('timestamp', inplace=True)
# data.head()

In [21]:
for key in sensor_summaries:
    mdf = sensor_summaries[key]
    print(mdf)


[1626480043.0, 1626566444.0, 18699, '', 1434]
[1626480043.0, 1626566444.0, 18720, '', 1434]
[1626480043.0, 1626566444.0, 18749, '', 1434]


# Exporting to PostgresSQL 


# Writing records from a Dictionary of arrays to a SQL database
loop over all the keys and execute insert query

In [12]:
#Connecting to an existing database
con = psycopg2.connect(
    host="localhost",
    database="airQuality",
    user="Riyad", 
    password="123",
    # attempt to connect for 3 seconds then raise exception
    connect_timeout = 3)

In [13]:
#Opening a cursor to execute database operations
cursor = con.cursor()
query = "SELECT * FROM sensor_network.sensors"
#change dataframe to csv and save file
sensorsdf = pd.read_sql_query(query, con, index_col='plume_id')

cursor.close()

In [14]:
sensorsdf.head()

Unnamed: 0_level_0,sensor_serial_number,id,type_id
plume_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17539,02:00:00:00:43:8b,1,1
19115,02:00:00:00:49:b3,2,1
17521,02:00:00:00:43:79,3,1
16397,02:00:00:00:3f:15,4,1
16701,02:00:00:00:40:45,5,1


In [15]:
for key in sensorIds:
    #get the new key from sensors table
    sensor_id = sensorsdf.loc[int(key)][1]

    mdf = sensor_summaries[key]
    ldf = sensor_data[key]

    try:
        #Opening a cursor to execute database operations
        cursor = con.cursor()

        #inserting sensor data and return the id of new record
        cursor.execute("INSERT INTO sensor_data.archive_measurements (measurements,locations) VALUES(%s, %s) \n RETURNING id", (str(ldf[0]),str(ldf[1])) )
        con.commit() 
        
        #set id of new record into local variable
        sensor_data_id = cursor.fetchone()[0]

        #inserting sensor summary
        cursor.execute("INSERT INTO sensor_network.sensor_summaries (timestamp_start,timestamp_end, sensor_id, b_box, sensor_data_id, measurement_count) VALUES(%s, %s, %s, %s, %s, %s)", (int(mdf[0]), int(mdf[1]), int(sensor_id),str(mdf[3]),int(sensor_data_id),int(mdf[4])))
        con.commit() 

        cursor.close()
    #if table name does not exist exit loop 
    except(psycopg2.errors.UndefinedTable) as error:
        print('ERROR: ' + error)
        break

In [16]:
#closing the connection
con.close()