# Importing and locating csv files from a specified directory

In [59]:
#conda install -c plotly plotly

In [60]:
# Read in variables stored by the other notebook
%store -r sensorPaths
print(sensorPaths)

{'1239564': '.\\data\\flow\\task_206_1629981045\\sensor_1239564\\user_measures_20210805_20210824_1.csv', '1239791': '.\\data\\flow\\task_206_1629981045\\sensor_1239791\\user_measures_20210806_20210824_1.csv', '16397': '.\\data\\flow\\task_206_1629981045\\sensor_16397\\sensor_measures_20210805_20210824_1.csv', '17539': '.\\data\\flow\\task_206_1629981045\\sensor_17539\\sensor_measures_20210806_20210824_1.csv', '18699': '.\\data\\flow\\task_206_1629981045\\sensor_18699\\sensor_measures_20210731_20210824_1.csv'}


# Initialising the measurement dataframes

In [61]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import sys
import numpy as np
import matplotlib.dates as mdates

In [62]:
def dataSplit(csvpath,dateString):
    """Split one sensor CSV export into per-day, minute-resampled DataFrames.

    Parameters
    ----------
    csvpath
        Path of the CSV file. It must contain a ``timestamp`` column (used
        as the initial index) and the column named by ``dateString``.
    dateString
        Name of the column holding each record's date/time text. It is
        parsed day-first; values that cannot be parsed are skipped.

    Returns
    -------
    dict
        Maps the epoch-seconds value of each day's midnight to that day's
        records, indexed by the parsed ``dateString`` values and back-filled
        onto a 1-minute grid.
    """
    df_temp = pd.read_csv(csvpath, parse_dates=True, index_col="timestamp")

    # dropping duplicates is only needed for individual exports, which
    # sometimes contain duplicate rows
    df_temp.drop_duplicates(keep='last', inplace=True)

    # convert index from float to int
    df_temp.index = df_temp.index.astype(int)

    # Parse the date column exactly once so that the day grouping and the
    # resampling index can never disagree (previously the index was re-parsed
    # from a hard-coded 'date' column and without dayfirst=True).
    parsed = pd.to_datetime(df_temp[dateString], dayfirst=True, errors='coerce')
    df_temp[dateString] = parsed

    data = {}  # one entry per calendar day, in order of first appearance

    # Grouping by a plain array avoids adding a helper column; rows whose
    # date could not be parsed (NaT) are left out by groupby.
    midnights = parsed.dt.normalize().to_numpy()
    for midnight, dft in df_temp.groupby(midnights, sort=False):
        # pandas computes the epoch of a naive Timestamp as if it were UTC
        timestampKey = int(midnight.timestamp())

        # resampling requires a datetime index, so index by the parsed dates
        dft = dft.set_index(dateString)
        data[timestampKey] = dft.resample('1min').bfill()

    return data

In [63]:
# Read every CSV listed in sensorPaths into per-day dataframes, stored under
# the same key the path was listed under.
measurement_dictionary = {
    sensor_key: dataSplit(csv_path, "date")
    for sensor_key, csv_path in sensorPaths.items()
}

# Visualising the first or second week of data

In [68]:
from datetime import timezone
import datetime

# Selection window, expressed as epoch seconds of UTC midnight.
# Alternative windows, kept for quick switching:
#   first week of data:  datetime.datetime(2021, 8, 6)  to datetime.datetime(2021, 8, 13)
#   second week of data: datetime.datetime(2021, 8, 14) to datetime.datetime(2021, 8, 22)

# mortimer forest
timestampStart = int(datetime.datetime(2021, 8, 10, tzinfo=timezone.utc).timestamp())
timestampEnd = int(datetime.datetime(2021, 8, 10, tzinfo=timezone.utc).timestamp())


In [69]:
# Build df_concat: every sensor's daily frames whose midnight key lies inside
# the inclusive [timestampStart, timestampEnd] window, stacked under the
# sensor key as the outer index level.
df_concat_List = []

for key in measurement_dictionary:
    df_measurement_List = []
    for timestampKey, df in measurement_dictionary[key].items():
        # Get all the data within the timeframe
        if timestampStart <= timestampKey <= timestampEnd:
            df_measurement_List.append(df)

    # pd.concat refuses an empty list with a message that does not say which
    # sensor was affected, so report it explicitly (same exception type).
    if not df_measurement_List:
        raise ValueError(
            "sensor %s has no data between %s and %s"
            % (key, timestampStart, timestampEnd)
        )

    # put each sensor's concatenated data into a list
    df_concat_List.append(pd.concat(df_measurement_List))

# put each sensor's data into one concatenated dataframe
df_concat = pd.concat(df_concat_List, keys=measurement_dictionary.keys())

# clear lists
df_measurement_List = []
df_concat_List = []

# Data Visualisation NO2

# Time Series graphs for no2

In [70]:
import plotly.graph_objects as go

# Calculating the standard deviation in time series 

## Working out standard deviation 

In [71]:
def standardDeviation(dft,column):
    """Return the row-wise standard deviation of ``column`` across sensors.

    For every key in ``measurement_dictionary`` the rows ``dft.loc[str(key)]``
    are selected and their ``column`` values are placed side by side, aligned
    on the index. The result is a one-column DataFrame named ``sd`` holding
    the standard deviation of each aligned row.
    """
    per_sensor = []
    for key in measurement_dictionary:
        label = '%s_%s' % (column, str(key))
        per_sensor.append(dft.loc[str(key)][column].rename(label))

    # one column per sensor, rows aligned on the shared index
    side_by_side = pd.concat(per_sensor, axis=1)

    # standard deviation for each row of data
    return side_by_side.std(axis=1).to_frame('sd')

In [72]:
#df_no2 = standardDeviation(df_concat,'NO2 (ppb)')

In [73]:
def visualise_column(the_df, variable, sd):
    """Show an interactive time-series plot of one column for every sensor.

    Parameters
    ----------
    the_df
        Frame from which each sensor's rows are taken with
        ``the_df.loc[str(key)]`` for every key in ``measurement_dictionary``.
    variable
        Name of the column to plot; it also appears in the figure title.
    sd
        When truthy, an extra trace with the row-wise standard deviation
        returned by ``standardDeviation`` is added.
    """
    figure = go.Figure()

    # one trace per sensor, named after its key
    for key in measurement_dictionary:
        sensor_rows = the_df.loc[str(key)]
        sensor_trace = go.Scatter(
            x=list(sensor_rows.index),
            y=list(sensor_rows[variable]),
            name=key,
        )
        figure.add_trace(sensor_trace)

    # plot std if requested
    if sd:
        spread = standardDeviation(the_df, variable)
        figure.add_trace(
            go.Scatter(
                x=list(spread.index),
                y=list(spread['sd']),
                name='standard deviation',
            )
        )

    # title
    figure.update_layout(
        title_text="Time series for %s measurements with range slider and selectors" % variable
    )

    # range selector buttons (1 day, 1 week, everything) and range slider
    selector_buttons = [
        {"count": 1, "label": "1d", "step": "day", "stepmode": "backward"},
        {"count": 7, "label": "1w", "step": "day", "stepmode": "backward"},
        {"step": "all"},
    ]
    figure.update_layout(
        xaxis={
            "rangeselector": {"buttons": selector_buttons},
            "rangeslider": {"visible": True},
            "type": "date",
        },
        yaxis={"autorange": True, "fixedrange": False},
    )

    figure.show()

visualise_column(df_concat, 'NO2 (ppb)', True)

# Interpolation smoothing: to smooth out dropped 0s 

In [13]:
# Replace non-positive (dropped) readings by linear interpolation.
#
# Each pollutant is masked on its own column: DataFrame.where with a Series
# condition blanks the whole row, which would overwrite valid NO2 readings
# wherever VOC is 0 (and every other column as well). Interpolation runs per
# sensor (outer index level) so that a gap at the start of one sensor is not
# filled from the end of the previous sensor in the concatenated frame.
df_temp = df_concat.copy()  # leave df_concat untouched for comparison

for column in ('NO2 (ppb)', 'VOC (ppb)'):
    masked = df_temp[column].where(df_temp[column] > 0)
    df_temp[column] = masked.groupby(level=0).transform(
        lambda readings: readings.interpolate(method='linear', limit_direction='forward')
    )

In [14]:
# Row-wise standard deviation across sensors, computed on df_temp for each
# of the two plotted columns.
df_no2, df_voc = (
    standardDeviation(df_temp, pollutant)
    for pollutant in ('NO2 (ppb)', 'VOC (ppb)')
)


In [15]:
# NO2 time series from df_temp: one trace per sensor plus the row-wise
# standard deviation held in df_no2.
fig = go.Figure()

# look for each sensor's data by its key
for key in measurement_dictionary:
    df_t = df_temp.loc[str(key)]
    fig.add_trace(
        go.Scatter(x=list(df_t.index), y=list(df_t["NO2 (ppb)"]), name=str(key))
    )

# plot std no2 (legend label spelling corrected: "deviation")
fig.add_trace(
    go.Scatter(
        x=list(df_no2.index),
        y=list(df_no2['sd']),
        name='standard deviation NO2',
    )
)

# Set title
fig.update_layout(
    title_text="Time series for NO2 (ppb) measurements for Lucy's sensors 1-5"
)

# Add range selector buttons (1 day, 1 week, everything) and a range slider
fig.update_layout(
    xaxis={
        "rangeselector": {
            "buttons": [
                {"count": 1, "label": "1d", "step": "day", "stepmode": "backward"},
                {"count": 7, "label": "1w", "step": "day", "stepmode": "backward"},
                {"step": "all"},
            ]
        },
        "rangeslider": {"visible": True},
        "type": "date",
    },
    yaxis={"autorange": True, "fixedrange": False},
)

fig.show()

# Data visualisation

In [16]:
# VOC time series from df_temp: one trace per sensor plus the row-wise
# standard deviation held in df_voc.
fig = go.Figure()

# look for each sensor's data by its key
for key in measurement_dictionary:
    df_t = df_temp.loc[str(key)]
    fig.add_trace(
        go.Scatter(x=list(df_t.index), y=list(df_t["VOC (ppb)"]), name=str(key))
    )

# plot std voc (legend label spelling corrected: "deviation")
fig.add_trace(
    go.Scatter(
        x=list(df_voc.index),
        y=list(df_voc['sd']),
        name='standard deviation VOC',
    )
)

# Set title
fig.update_layout(
    title_text="Time series for VOC (ppb) measurements for Lucy's sensors 1-5"
)

# Add range selector buttons (1 day, 1 week, everything) and a range slider
fig.update_layout(
    xaxis={
        "rangeselector": {
            "buttons": [
                {"count": 1, "label": "1d", "step": "day", "stepmode": "backward"},
                {"count": 7, "label": "1w", "step": "day", "stepmode": "backward"},
                {"step": "all"},
            ]
        },
        "rangeslider": {"visible": True},
        "type": "date",
    },
    yaxis={"autorange": True, "fixedrange": False},
)

fig.show()

In [17]:
# print(len(df_concat.index[df_concat['NO2 (ppb)'] == 0].tolist()))
# print(len(df_temp.index[df_temp['NO2 (ppb)'] == 0].tolist()))

# Changepoint detection 

In [18]:
# import ruptures as rpt # pip install ruptures
