# Extracting data from CSVs

In [1]:
# Restore the previously stored `sensorIds` variable with the %store magic
# (presumably saved by an earlier notebook/session - confirm where it is written).
%store -r sensorIds
# Show the restored ids so the reader can see which sensors are processed.
print(sensorIds)

['18699', '18720', '18749']


In [2]:
# Temporary override of the sensor folder paths (the Plume dashboard is not working).

# Two dictionaries are needed: one for measurements and one for locations.
measure_dictionary, location_dictionary = {}, {}

# One data folder per sensor id: ..\data\flow\temp\sensor_<id>
sensorPaths = ["..\\data\\flow\\temp\\sensor_" + snum for snum in sensorIds]
print(sensorPaths)


['..\\data\\flow\\temp\\sensor_18699', '..\\data\\flow\\temp\\sensor_18720', '..\\data\\flow\\temp\\sensor_18749']


In [3]:
import pandas as pd
import numpy as np
import psycopg2
from datetime import datetime
from datetime import timezone
import os

In [4]:
# Collect the CSV files found in each sensor folder, split into two lists:
# files whose name contains "positions" and all other CSV files (measurements).
csvMeasurementsPath = []
csvPostionsPath = []

for sensorPath in sensorPaths:
    with os.scandir(sensorPath) as listOfFiles:
        # os.scandir yields entries in arbitrary order; sorting by name keeps
        # the resulting lists identical from run to run.
        for currentFile in sorted(listOfFiles, key=lambda entry: entry.name):
            # Require the real ".csv" extension, not just a name that happens
            # to end in the letters "csv".
            if not (currentFile.is_file() and currentFile.name.endswith('.csv')):
                continue

            csvPath = os.path.join(sensorPath, currentFile.name)
            if "positions" in currentFile.name:
                csvPostionsPath.append(csvPath)
            else:
                csvMeasurementsPath.append(csvPath)

In [5]:
#map sensor id to filepaths
def _paths_by_sensor(sensor_ids, csv_paths):
    """Return {sensor id: csv path}, pairing each path with the sensor whose folder holds it.

    Paths are matched on their parent folder name, which is expected to be
    "sensor_<id>". Pairing by position (zip) would silently attach a file to the
    wrong sensor as soon as one folder contained zero or several matching CSVs.

    Raises:
        ValueError: if a sensor does not have exactly one matching CSV file.
    """
    paths_in_folder = {}
    for csv_path in csv_paths:
        folder_name = os.path.basename(os.path.dirname(csv_path))
        paths_in_folder.setdefault(folder_name, []).append(csv_path)

    mapping = {}
    for sensor_id in sensor_ids:
        matches = paths_in_folder.get("sensor_" + sensor_id, [])
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one csv for sensor {} but found {}: {}".format(
                    sensor_id, len(matches), matches))
        mapping[sensor_id] = matches[0]
    return mapping


measurement_dictionary = _paths_by_sensor(sensorIds, csvMeasurementsPath)
location_dictionary = _paths_by_sensor(sensorIds, csvPostionsPath)
print(measurement_dictionary)
print(location_dictionary)

{'18699': '..\\data\\flow\\temp\\sensor_18699\\user_measures.csv', '18720': '..\\data\\flow\\temp\\sensor_18720\\user_measures.csv', '18749': '..\\data\\flow\\temp\\sensor_18749\\user_measures.csv'}
{'18699': '..\\data\\flow\\temp\\sensor_18699\\user_positions.csv', '18720': '..\\data\\flow\\temp\\sensor_18720\\user_positions.csv', '18749': '..\\data\\flow\\temp\\sensor_18749\\user_positions.csv'}


# ....

In [6]:
# Load each sensor's positions CSV and split it into one dataframe per calendar day.
# location_dictionary[sensor id] is replaced in place:
#   csv path  ->  {midnight timestamp: dataframe of that day's records}
for key in sensorIds:
    # If this cell is run a second time the entry already holds the per-day
    # dictionary rather than a csv path; skip it instead of passing it to read_csv.
    if isinstance(location_dictionary[key], dict):
        continue

    locations = {}  # midnight timestamp -> dataframe with that day's locations

    df_temp = pd.read_csv(location_dictionary[key],parse_dates=True, index_col="timestamp")

    #convert index from float to int
    df_temp.index = df_temp.index.astype(int,copy=False)

    # Keep only the calendar date of each record (a datetime.date, no time part).
    # Values that cannot be parsed become NaT because of errors='coerce'.
    df_temp['day'] = pd.to_datetime(df_temp['date'], dayfirst=True, errors='coerce').dt.date

    # NaT cannot be converted to a timestamp, so rows without a usable date are
    # reported and left out rather than aborting the whole split.
    has_day = df_temp['day'].notna()
    if not has_day.all():
        print("sensor: " + str(key) + " skipped " + str(int((~has_day).sum())) + " rows with unparseable dates")

    #splitting the dataframe into separate days
    for day in df_temp.loc[has_day, 'day'].unique():
        # POSIX timestamp of midnight on this day, used as the dictionary key
        timestampKey = int(pd.to_datetime(day).timestamp())

        # this day's records, without the date columns (not needed any more)
        locations[timestampKey] = df_temp[df_temp['day'] == day].drop(columns=["date", "day"])

    #associate the per-day dictionary with its corresponding sensor id
    location_dictionary[key] = locations


In [7]:
# Sanity check: the per-day location frames of each sensor should add up to
# the number of rows in that sensor's original positions CSV.
for key, frames_by_day in location_dictionary.items():
    total = sum(len(day_frame.index.values) for day_frame in frames_by_day.values())
    print("sensor: "  + str(key) + "_total:" + str(total))

print("\n")

# Row counts read straight from the CSV files, for comparison with the above.
test = dict(zip(sensorIds, csvPostionsPath))
for key in sensorIds:
    df_temp = pd.read_csv(test[key], parse_dates=True, index_col="timestamp")
    row_count = len(df_temp.index.values)
    print("sensor: "  + str(key) + "_total:" + str(row_count))


sensor: 18699_total:5139
sensor: 18720_total:449
sensor: 18749_total:1113


sensor: 18699_total:5139
sensor: 18720_total:449
sensor: 18749_total:1113


In [8]:
# Load each sensor's measurements CSV and split it into one dataframe per calendar day.
# measurement_dictionary[sensor id] is replaced in place:
#   csv path  ->  {midnight timestamp: dataframe of that day's records}
for key in sensorIds:
    # If this cell is run a second time the entry already holds the per-day
    # dictionary rather than a csv path; skip it instead of passing it to read_csv.
    if isinstance(measurement_dictionary[key], dict):
        continue

    measurements = {}  # midnight timestamp -> dataframe with that day's measurements

    df_temp = pd.read_csv(measurement_dictionary[key],parse_dates=True, index_col="timestamp")

    #convert index from float to int
    df_temp.index = df_temp.index.astype(int,copy=False)

    # Keep only the calendar date of each record (a datetime.date, no time part).
    # Values that cannot be parsed become NaT because of errors='coerce'.
    df_temp['day'] = pd.to_datetime(df_temp['date (UTC)'], dayfirst=True, errors='coerce').dt.date

    # NaT cannot be converted to a timestamp, so rows without a usable date are
    # reported and left out rather than aborting the whole split.
    has_day = df_temp['day'].notna()
    if not has_day.all():
        print("sensor: " + str(key) + " skipped " + str(int((~has_day).sum())) + " rows with unparseable dates")

    #splitting the dataframe into separate days
    for day in df_temp.loc[has_day, 'day'].unique():
        # POSIX timestamp of midnight on this day, used as the dictionary key
        timestampKey = int(pd.to_datetime(day).timestamp())

        # this day's records, without the date columns (not needed any more)
        measurements[timestampKey] = df_temp[df_temp['day'] == day].drop(columns=["date (UTC)", "day"])

    #associate the per-day dictionary with its corresponding sensor id
    measurement_dictionary[key] = measurements


In [9]:
# Sanity check: the per-day measurement frames of each sensor should add up to
# the number of rows in that sensor's original measurements CSV.
for key, frames_by_day in measurement_dictionary.items():
    total = sum(len(day_frame.index.values) for day_frame in frames_by_day.values())
    print("sensor: "  + str(key) + "_total:" + str(total))

print("\n")

# Row counts read straight from the CSV files, for comparison with the above.
test = dict(zip(sensorIds, csvMeasurementsPath))
for key in sensorIds:
    df_temp = pd.read_csv(test[key], parse_dates=True, index_col="timestamp")
    row_count = len(df_temp.index.values)
    print("sensor: "  + str(key) + "_total:" + str(row_count))

sensor: 18699_total:25507
sensor: 18720_total:16520
sensor: 18749_total:27043


sensor: 18699_total:25507
sensor: 18720_total:16520
sensor: 18749_total:27043


# Preparing data for upload into PostgreSQL

In [10]:
# Build the rows to export, keyed by "<timestamp>_<sensor id>":
#   sensor_summaries[key] = [timestamp_start, sensor_id, bounding_box, measurement_count]
#   sensor_data[key]      = [measurement_json, location_json]
sensor_summaries = {}
sensor_data = {}

for key in sensorIds:

    # The days present in the location dictionary drive the loop.
    # NOTE(review): a day that exists only in measurement_dictionary is not exported - confirm this is intended.
    for timestampKey in location_dictionary[key]:

        # concatenating numbers into text:
        timestamp_sensor_key = "%s_%s" % (timestampKey, key)

        # Start every day from fresh values so a failed step below can never
        # reuse the bounding box or the measurements of the previous day.
        ldf = location_dictionary[key][timestampKey]
        mdf = pd.DataFrame()
        geometry_string = ""

        #create bounding box polygon
        try:
            # NOTE(review): x is read from the 'latitude' column and y from 'longitude'.
            # The saved output of this notebook shows x around -1.9 and y around 52.5,
            # so check the CSV column labels before "correcting" this assignment.
            min_y= ldf['longitude'].min()
            max_y = ldf['longitude'].max()

            min_x= ldf['latitude'].min()
            max_x = ldf['latitude'].max()
            #POLYGON(minx miny, minx Maxy, maxx Maxy, maxx miny, minx miny)
            geometry_string = "POLYGON(({} {}, {} {}, {} {}, {} {},{} {}))".format(min_x,min_y,   min_x,max_y,   max_x,max_y,   max_x,min_y,   min_x,min_y)
        except Exception as e:
            print('The dataframe is empty therefore no bounding box will be applied :{0}'.format(e))

        # A day without measurements is a missing key; keep the empty dataframe
        # in that case (count 0, empty JSON object).
        try:
            mdf = measurement_dictionary[key][timestampKey]
        except KeyError as e:
            print('The measurement dataframe is empty. check csv files :{0}'.format(e))

        #summaryArray = [timestamp_start,sensor_id,bouding_box,measurement_count]
        summaryArray = [timestampKey,int(key),geometry_string,len(mdf.index.values)]
        sensor_summaries[timestamp_sensor_key] = summaryArray

        #dataArray = [measurement_json,location_json]
        dataArray = [mdf.to_json(orient="columns"),ldf.to_json(orient="columns")]
        sensor_data[timestamp_sensor_key] = dataArray

In [11]:
# Show one summary row ([timestamp_start, sensor_id, bounding_box, measurement_count])
# as a quick check of the structure built above; only the first entry is printed.
for mdf in sensor_summaries.values():
    print(mdf)
    break

[1626480000, 18699, 'POLYGON((-1.908207 52.4491, -1.908207 52.586137, -1.839111 52.586137, -1.839111 52.4491,-1.908207 52.4491))', 1086]


In [12]:
# for key in sensor_data:
#     mdf = sensor_data[key]
#     print(type(mdf))
#     break

# Exporting to PostgreSQL


# Writing records from a Dictionary of arrays to a SQL database
Loop over all the keys and execute an insert query for each one.

In [13]:
#Connecting to an existing database
# The password is read from the PGPASSWORD environment variable so that no
# credential is stored in the notebook. If the variable is unset, None is
# passed, which psycopg2 leaves out of the connection string
# (NOTE(review): the server/driver must then accept the connection another way,
# e.g. a password file - confirm for this environment).
con = psycopg2.connect(
    host="localhost",
    database="airQuality",
    user="Riyad", 
    password=os.environ.get("PGPASSWORD"),
    # attempt to connect for 3 seconds then raise exception
    connect_timeout = 3)

In [14]:
# Load the sensor_network.sensors table into a dataframe indexed by plume_id.
# pandas runs the query on the connection itself, so no separate cursor is needed.
query = "SELECT * FROM sensor_network.sensors"
sensorsdf = pd.read_sql_query(query, con, index_col='plume_id')

In [15]:
# Preview the first rows of the sensors table loaded from the database.
sensorsdf.head()

Unnamed: 0_level_0,sensor_serial_number,id,type_id
plume_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17539,02:00:00:00:43:8b,1,1
19115,02:00:00:00:49:b3,2,1
17521,02:00:00:00:43:79,3,1
16397,02:00:00:00:3f:15,4,1
16701,02:00:00:00:40:45,5,1


In [17]:
# Insert every "<timestamp>_<sensor id>" entry into the database: first the JSON
# payload into sensor_data.archive_measurements, then the summary row in
# sensor_network.sensor_summaries that references the new archive record.
for key in sensor_summaries:

    #split key to get only sensorid
    plume_id = int(key.split('_')[1])
    #get the new key from sensors table (the 'id' column of the row indexed by plume_id)
    sensor_id = sensorsdf.loc[plume_id, 'id']

    summary_row = sensor_summaries[key]  # [timestamp_start, sensor_id, b_box, measurement_count]
    data_row = sensor_data[key]          # [measurement_json, location_json]

    try:
        # "with con" commits once when both inserts succeed and rolls back when
        # either fails, so an archive record is never stored without its summary.
        # "with con.cursor()" closes the cursor even when an error is raised.
        with con:
            with con.cursor() as cursor:
                #inserting sensor data and return the id of new record
                cursor.execute("INSERT INTO sensor_data.archive_measurements (measurements,locations) VALUES(%s, %s) \n RETURNING id", (str(data_row[0]),str(data_row[1])) )

                #set id of new record into local variable
                sensor_data_id = cursor.fetchone()[0]

                #inserting sensor summary
                cursor.execute("INSERT INTO sensor_network.sensor_summaries (timestamp_start, sensor_id, b_box, sensor_data_id, measurement_count) VALUES(%s, %s, %s, %s, %s)", (int(summary_row[0]), int(sensor_id),str(summary_row[2]),int(sensor_data_id),int(summary_row[3])))
    #if table name does not exist exit loop
    except psycopg2.errors.UndefinedTable as error:
        # the exception must be converted to text before it can be concatenated
        print('ERROR: ' + str(error))
        break

IndexError: tuple index out of range

In [None]:
# Close the database connection `con` now that all inserts are done.
con.close()