# Importing and locating csv files from a specified directory

In [30]:
# Read in variables stored by another notebook: `%store -r` restores
# `sensorPaths` from IPython's storage into this kernel's namespace.
%store -r sensorPaths
print(sensorPaths)

{'16397': '..\\..\\data\\flow\\task_206_1629981045\\sensor_16397\\sensor_measures_20210805_20210824_1.csv', '17539': '..\\..\\data\\flow\\task_206_1629981045\\sensor_17539\\sensor_measures_20210806_20210824_1.csv', '18699': '..\\..\\data\\flow\\task_206_1629981045\\sensor_18699\\sensor_measures_20210731_20210824_1.csv', '1239564': '..\\..\\data\\flow\\task_206_1629981045\\sensor_1239564\\user_measures_20210805_20210824_1.csv', '1239791': '..\\..\\data\\flow\\task_206_1629981045\\sensor_1239791\\user_measures_20210806_20210824_1.csv'}


# Initialising the measurement dataframes

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
import math

from datetime import timezone
import datetime

import plotly.graph_objects as go

In [32]:
def dataSplit(csvpath,dateString):

    data = {} #intialise empty dictionary to store each day of records

    df_temp = pd.read_csv(csvpath,parse_dates=True, index_col="timestamp")

    #dropping duplicates only used for individual exports which sometimes have duplicate values. 
    df_temp.drop_duplicates(keep='last', inplace= True)

    #convert index from float to int
    df_temp.index = df_temp.index.astype(int,copy=False)

    # using the dates which are already supplied. This strategy in the line below converts them and rounds down to date using 'd' flag
    # This strategy (line below) will keep just the date
    df_temp['day'] = pd.to_datetime(df_temp[dateString], dayfirst=True, errors='coerce').dt.date

    the_unique_dates = df_temp['day'].unique()
    #print('Unique dates:',the_unique_dates)
    # this gives the same result as the for loop below
  
    #splitting the dataframe into separate days
    #for each day in unique dates set:
    for day in the_unique_dates:
        try:
            # In my code below I assign the subset of records to a new dataframe called dft
            # create 'midnight' timestamps
            timestampKey = int((pd.to_datetime(day, errors='coerce')).timestamp())

            # select the records for this day
            dft = df_temp[df_temp['day']==day]

            # #drop the date column to save space (we don't need this anymore)
            #dft = dft.drop(dateString, axis=1)
            dft = dft.drop("day", axis=1)
            
            #resampling data requires datetime index, 
            #therefore we must cast the date column to DateTimeIndex type since it is dtype is currently object
            dft['date'] = pd.to_datetime(dft['date'])
            dft = dft.set_index([dateString])
            dft = dft.resample('1min').bfill() #resample data to every minute
            
            data[timestampKey] = dft

        except KeyError as e:
            print(e)

    return data

In [33]:
# Bring each sensor's CSV into pandas: one entry per sensor id, holding the
# per-day dataframes produced by dataSplit from that sensor's file.
measurement_dictionary = {
    sensor_id: dataSplit(csv_path, "date")
    for sensor_id, csv_path in sensorPaths.items()
}

# Extracting data we want for visualisation

Large datasets need a lot of RAM, and some machines struggle to process them. We therefore extract only a subset of the data (a time window), sized to what the machine can handle.

In [34]:
def get_combined_dataframe(startTime,endTime,measurements=None):
    """Combine every sensor's daily dataframes that fall inside a time window.

    Parameters
    ----------
    startTime, endTime : datetime.datetime
        Window bounds. Their tzinfo is replaced with UTC before conversion to
        a timestamp. Both bounds are inclusive and are compared against the
        per-day keys of ``measurements``.
    measurements : dict, optional
        ``{sensor_id: {day_timestamp: dataframe}}``. Defaults to the
        notebook-level ``measurement_dictionary``.

    Returns
    -------
    pandas.DataFrame
        All selected dataframes concatenated, with the sensor id as the outer
        index level and the index sorted. Sensors without any day in the
        window are left out; if no sensor has data an empty dataframe is
        returned.
    """
    if measurements is None:
        measurements = measurement_dictionary

    # Convert dates to timestamps.
    window_start = int(startTime.replace(tzinfo=timezone.utc).timestamp())
    window_end = int(endTime.replace(tzinfo=timezone.utc).timestamp())

    sensor_ids = []
    sensor_frames = []
    for sensor_id, daily_frames in measurements.items():
        # Get all of this sensor's days within the timeframe.
        frames_in_window = [
            frame
            for day_timestamp, frame in daily_frames.items()
            if window_start <= day_timestamp <= window_end
        ]
        # pd.concat raises on an empty list, so a sensor with nothing in the
        # window is skipped instead of aborting the whole extraction.
        if frames_in_window:
            sensor_ids.append(sensor_id)
            sensor_frames.append(pd.concat(frames_in_window))

    if not sensor_frames:
        return pd.DataFrame()

    # One dataframe for all sensors, keyed by sensor id on the outer level.
    df_concat = pd.concat(sensor_frames, keys=sensor_ids)

    # Sort the index so that the data can be plotted correctly.
    return df_concat.sort_index()

In [35]:
## first week of data
#timestampStart = datetime.datetime(2021, 8, 6)
#timestampEnd = datetime.datetime(2021, 8, 13)

# second week of data
# timestampStart = datetime.datetime(2021, 8, 14)
# timestampEnd = datetime.datetime(2021, 8, 22)

#mortimer forest 
timestampStart = datetime.datetime(2021, 8, 19)
timestampEnd = datetime.datetime(2021, 8, 20)

# Combined measurements of all sensors for the selected window. No
# placeholder assignment is needed beforehand: the call creates the dataframe.
df = get_combined_dataframe(timestampStart,timestampEnd)


# Data Visualisation NO2

# Time Series graphs for NO2

# Calculating the standard deviation in time series 

## Working out standard deviation and mean

In [36]:
def get_stats(dft,column):
    """Compute per-timestamp statistics of one measurement across sensors.

    Parameters
    ----------
    dft : pandas.DataFrame
        Combined measurements whose outer index level is the sensor id and
        whose inner level is the time of the record.
    column : str
        Name of the measurement column to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by time, with columns ``sd`` (standard deviation across
        sensors), ``mean`` and ``se`` (standard error: ``sd`` divided by the
        square root of the number of sensors that actually have a reading at
        that time).
    """
    # Take the sensors from the data itself, so the statistics always describe
    # exactly the frame that was passed in.
    sensor_ids = dft.index.get_level_values(0).unique()
    if len(sensor_ids) == 0:
        return pd.DataFrame(columns=['sd', 'mean', 'se'])

    # One column per sensor, aligned on time.
    per_sensor = pd.concat(
        [
            dft.loc[sensor_id][column].rename('%s_%s' % (column, str(sensor_id)))
            for sensor_id in sensor_ids
        ],
        axis=1,
    )

    # Standard deviation and mean for each row (timestamp); missing readings
    # are skipped by pandas.
    df_s = pd.DataFrame({
        'sd': per_sensor.std(axis=1),
        'mean': per_sensor.mean(axis=1),
    })

    # Standard error: divide by the square root of the sample number, i.e. the
    # count of sensors with a reading in that row, not the total sensor count.
    df_s['se'] = df_s['sd'] / (per_sensor.count(axis=1) ** 0.5)

    return df_s

In [37]:
## function for plotting different sensors and their statistics
# example of calling: create_figure(df_concat, "NO2 (ppb)", measurement_dictionary, False)
def create_figure(the_df, column_name, m_dict, save=False):
    """Plot one measurement for every sensor together with mean and sd bands.

    Parameters
    ----------
    the_df : pandas.DataFrame
        Combined measurements whose outer index level is the sensor id.
    column_name : str
        Measurement column to plot; also used in the title and, when saving,
        in the output file name.
    m_dict : dict
        Its keys are the sensor ids to draw. Sensors that have no rows in
        ``the_df`` are skipped.
    save : bool, optional
        If true the figure is written as HTML under ``..\\..\\data\\flow\\html\\Minute``
        (relative to the working directory); otherwise it is shown inline.
        Defaults to False.
    """
    # Create figure
    fig = go.Figure()

    df_stats = get_stats(the_df, column_name)

    # add the bands for sds and means
    # first get the lists of values into the right format
    x_list = list(df_stats.index)

    ycentre = list(df_stats["mean"])
    yupper1 = list(df_stats['mean'] + df_stats['sd'])
    yupper2 = list(df_stats['mean'] + (2* df_stats['sd']))
    ylower1 = list(df_stats['mean'] - df_stats['sd'])
    ylower2 = list(df_stats['mean'] - (2*df_stats['sd']))

    # Outer band (mean +/- 2 sd): an invisible upper line, then a lower line
    # filled up to the previous trace.
    fig.add_trace(go.Scatter(
        name='Upper Bound',
        x=x_list,
        y=yupper2,
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        hoverinfo="skip",
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        name='Lower Bound',
        x=x_list,
        y=ylower2,
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.1)',
        fill='tonexty',
        hoverinfo="skip",
        showlegend=False
    ))
    ## the above is one way of doing a filled error band.
    # Below is an alternative way, used for the inner band (mean +/- 1 sd):
    # a single closed outline filled on itself.
    fig.add_trace(go.Scatter(
            x=x_list+x_list[::-1], # x, then x reversed
            y=yupper1+ylower1[::-1], # upper, then lower reversed
            fill='toself',
            fillcolor='rgba(68,68,68,0.2)',
            line=dict(color='rgba(255,255,255,0)'),
            hoverinfo="skip",
            showlegend=False
        ))

    fig.add_trace(go.Scatter(x=x_list, y=ycentre, name= 'mean', line=dict(color='rgba(0,100,80,0.5)')))

    # look for each sensor's data by its key; a sensor without rows in the
    # frame has nothing to draw and would otherwise raise a KeyError
    sensors_in_frame = set(the_df.index.get_level_values(0))
    for key in m_dict:
        if str(key) not in sensors_in_frame:
            continue
        df_t = the_df.loc[str(key)]

        fig.add_trace(go.Scatter(x=list(df_t.index), y=list(df_t[column_name]), name= str(key)))

    # Set title
    fig.update_layout(
        title_text = "Time series for %s measurements for Lucy's sensors 1-5" % column_name
    )

    # Add range slider
    fig.update_layout(
        xaxis=dict(
            rangeselector=dict(
                buttons=list([
                    dict(count=1,
                         label="1d",
                         step="day",
                         stepmode="backward"),
                    dict(count=7,
                         label="1w",
                         step="day",
                         stepmode="backward"),
                    dict(step="all")
                ])
            ),
            rangeslider=dict(
                visible=True
            ),
            type="date"
        ),
        yaxis=dict(
           autorange = True,
           fixedrange= False
       )

    )

    if save:
        # A '/' or '\' inside the column name would be read as a directory
        # separator, so replace them in the file name.
        file_stem = column_name.replace("/", "_").replace("\\", "_")
        # Raw string: in a normal string literal the "\f" of "\flow" is a
        # form-feed character, which silently corrupts the path.
        directory = r"..\..\data\flow\html\Minute\%s.html" % (file_stem)
        fig.write_html(directory)
    else:
        fig.show()

In [38]:
# Show (rather than save) the NO2 time series for all sensors.
create_figure(df, "NO2 (ppb)", measurement_dictionary, save=False)

# Interpolation smoothing: to smooth out dropped 0s 

In [39]:
# Drop the AQI columns. The names are collected first so that columns are not
# removed while df.columns is being iterated.
aqi_columns = [col for col in df.columns if "Plume AQI" in col]
df = df.drop(columns=aqi_columns)

# Treat non-positive readings as dropped samples and fill them by linear
# interpolation.
# - df.where(df > 0) masks cell by cell; masking with a single column's
#   condition would blank the whole row in every column.
# - Interpolating per outer index level keeps one sensor's values from being
#   blended into the next sensor's rows (assumes the outer level is the sensor
#   id, as in the frame built by get_combined_dataframe).
df = (
    df.where(df > 0)
    .groupby(level=0)
    .transform(
        lambda readings: readings.interpolate(method='linear', limit_direction='forward', axis=0)
    )
)

# Data visualisation

In [40]:
# One interactive figure per remaining measurement column, shown inline.
for measurement_column in df.columns:
    create_figure(df, measurement_column, measurement_dictionary, save=False)

In [41]:
# create_figure(df,"NO2 (ppb)", measurement_dictionary,False)
# create_figure(df, "VOC (ppb)", measurement_dictionary,False)