# QA/QC for MODULAIR-PM Batch 3.1

The goal of this notebook is to determine whether any devices should be fixed or held back from shipment after a calibration/QA-QC period.

## About the QA/QC Process

Each batch contains ~50 MODULAIR-PM sensors that are co-located on our rooftop chamber at Greentown Labs. All sensors should be seeing the same air and thus should show the same result. The objective of this notebook is to identify those that do not.

In [None]:
import quantaq
import plotly.express as px
import pandas as pd
from quantaq.utils import to_dataframe
from tqdm.notebook import tqdm
from scipy.stats import variation
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np
import random
from scipy.stats import probplot
import plotly.graph_objects as go
from sklearn.neighbors import NearestNeighbors

# Define Variables

Here, we define the batch, start, and stop dates.

In [None]:
# Batch (QuantAQ team) under review
batch = "Batch 3.1"

# Co-location window and the resampling interval applied to every device
start = "2021-05-30 09:00:00"
end = "2021-05-31 10:00:00"
resample_length = "1min"
# Number of resampled rows a device reporting for the whole window would produce
n_samples = (pd.Timestamp(end) - pd.Timestamp(start)) / pd.Timedelta(resample_length)

# Sensor columns, kept in lexicographic order (so 'opc.bin10' sorts before 'opc.bin2')
sensor_cols = sorted(
    ['met.pressure', 'met.rh', 'met.temp']
    + ['neph.bin{}'.format(i) for i in range(6)]
    + ['neph.pm1', 'neph.pm10', 'neph.pm25']
    + ['opc.bin{}'.format(i) for i in range(24)]
    + ['opc.pm1', 'opc.pm10', 'opc.pm25', 'opc.rh', 'opc.temp']
)
# First ten OPC bins, in numeric order
opc_cols = ['opc.bin{}'.format(i) for i in range(10)]

# The API key is read from a local file; newlines are stripped out
with open('apikey.txt') as key_file:
    api_key = "".join(key_file.read().split("\n"))

# Extract the data

Use py-quantaq to pull all data. Each batch of sensors is assigned to a Team and can be referenced as such.

In [None]:
# Set up the API client
client = quantaq.QuantAQAPIClient(api_key=api_key)


# Retrieve the devices assigned to the team named after the batch.
# Uses the `batch` variable so the team name is defined in exactly one place.
devices = client.devices.list(team=batch)

## Use the API to get all data

In [None]:
frames = []

# Metadata columns; everything else is treated as a sensor reading when
# checking a device's download for missing values
nonsensor_cols = ['flag', 'sn', 'timestamp', 'timestamp_local', 'url', 'geo.lat', 'geo.lon']

with tqdm(total=len(devices), desc="API Download") as pbar:
    for each in devices:
        # Only query devices whose created/last_seen span overlaps the window
        seen_in_window = (
            (np.datetime64(each['last_seen']) > np.datetime64(start))
            and (np.datetime64(each['created']) < np.datetime64(end))
        )
        if not seen_in_window:
            print("WARNING: Unit " + str(each["sn"]) + " not connecting in timeframe")
        else:
            data = client.data.list(sn=each["sn"], start=start, stop=end, raw=True, per_page=500)
            frame = to_dataframe(data)
            if len(frame) == 0:
                print("WARNING: Unit " + str(each["sn"]) + " not recording in timeframe")
            elif frame.drop(nonsensor_cols, axis=1).isna().sum().sum() != 0:
                print("WARNING: Unit " + str(each["sn"]) + " returning unexpected missing values")
            else:
                # Average onto a regular time grid so devices can be compared row-for-row
                frame = frame.resample(resample_length, on='timestamp').mean().reset_index()
                # Re-attach the serial number after the resample
                frame["sn"] = each["sn"]
                frames.append(frame)
        pbar.update(1)

# Fail with an actionable message instead of pandas' "No objects to concatenate"
if not frames:
    raise RuntimeError(
        "No devices returned usable data between {} and {}".format(start, end)
    )

frames = pd.concat(frames)

# Set the datatype for the flag (left unchanged if the cast is not possible)
frames["flag"] = frames["flag"].astype("int8", errors='ignore')

# Drop empty columns
frames = frames.dropna(how='all', axis=1)

In [None]:
# Count the non-missing opc.bin0 rows per device and drop any device that
# reported fewer than 85% of the rows expected for the window
frames_nnas = frames.groupby("sn")["opc.bin0"].count()
too_sparse = frames_nnas < (n_samples * 0.85)
highna_sn = frames_nnas.index[too_sparse].tolist()
if highna_sn:
    print("WARNING: The following units have a lot of missing data: " + str(highna_sn))
    frames = frames[~frames["sn"].isin(highna_sn)]

# Index by timestamp; the "timestamp" column itself is kept as well
frames = frames.set_index(frames["timestamp"])

In [None]:
# Fraction of the expected rows reported by each remaining device, lowest first
frames_nnas = frames.groupby("sn")["opc.bin0"].count()
frames_nnas.div(n_samples).sort_values().head()

In [None]:
# Widen the notebook container so the wide figures below are not squeezed.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

## Export the data

Export the data in case we want to re-visit later since the API takes a few minutes depending on length of time downloaded.

In [None]:
# Export as feather format
# Export as feather format.
# `frames` is indexed by a copy of its own "timestamp" column, so the index is
# dropped rather than re-inserted: a plain reset_index() would try to add a
# second "timestamp" column and raise ValueError. Feather also requires the
# default integer index.
export_path = "{}.{}.{}.feather".format(batch.lower().replace(" ", "."), start, end)
frames.reset_index(drop=True).to_feather(export_path)

### Import the data

Import the data if we already have a batch saved.

In [None]:
# Rebuild the export file name (<batch>.<start>.<end>.feather) and load it
batch_slug = ".".join(batch.lower().split(" "))
path = "{}.{}.{}.feather".format(batch_slug, start, end)
frames = pd.read_feather(path)

## Munge the data a bit

In [None]:
# Raw readings of one variable for every device, coloured by serial number
var = "neph.bin1"
fig = px.scatter(
    frames,
    x="timestamp",
    y=var,
    color="sn",
    title=var + " Comparison",
    render_mode="webgl",
)

fig.show()

In [None]:
# NOTE(review): `rollingex` is first assigned in the cell that follows this one,
# so on a fresh kernel this cell raises NameError unless that cell is run first.
rollingex.head(10)

In [None]:
var = "neph.bin0"

# Per-device rolling statistics over time-based windows
by_time = frames.set_index('timestamp', inplace=False)[["sn", var]]
rollingex = by_time.groupby("sn", as_index=False).rolling("15min", win_type='gaussian').mean()
rollingstd = by_time.groupby("sn", as_index=False).rolling("5min", win_type='gaussian').std()

# The result carries a two-level index: level 0 is used for colour, level 1 for the x axis
group_level = rollingex.index.get_level_values(0)
time_level = rollingex.index.get_level_values(1)

fig = px.line(
    rollingex,
    x=time_level,
    y=var,
    color=group_level,
    title=var + " Comparison",
    render_mode="webgl",
)

fig.show()

In [None]:
# NOTE(review): `temp` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel; it presumably relied on a cell that
# has since been removed. `df` and `var` are assigned here but the plot call
# reads `temp` and the literal "neph.bin0" directly.
df = temp
var = "neph.bin0"

# Line plot of neph.bin0, using index level 1 as x and index level 0 as colour
fig = px.line(temp, x=temp.index.get_level_values(1), y="neph.bin0", color=temp.index.get_level_values(0), title="neph_steady Comparison", render_mode="webgl")

fig.show()

In [None]:
# NOTE(review): `neph_norm` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel.
# Line plot of neph.bin0, using index level 1 as x and index level 0 as colour
fig = px.line(neph_norm, x=neph_norm.index.get_level_values(1), y="neph.bin0", color=neph_norm.index.get_level_values(0), title="neph_norm Comparison", render_mode="webgl")

fig.show()

We can use a QQ plot to test for normality

In [None]:
# QQ plot of neph.bin0 (all devices, all times) against a normal distribution.
# To inspect a single moment instead, filter `frames` on one timestamp before
# selecting the column.
data_points = frames['neph.bin0'].dropna().sort_values()

qq = probplot(data_points, dist='norm')
(theoretical_q, ordered_values), fit = qq
slope, intercept = fit[0], fit[1]

# End points of the fitted reference line
line_x = np.array([theoretical_q[0], theoretical_q[-1]])

fig = go.Figure()
fig.add_scatter(x=theoretical_q, y=ordered_values, mode='markers')
fig.add_scatter(x=line_x, y=intercept + slope*line_x, mode='lines')
fig.layout.update(showlegend=False)
fig.show()

In [None]:
# QQ plot of opc.bin0 (all devices, all times) against a uniform distribution.
# To inspect a single moment instead, filter `frames` on one timestamp before
# selecting the column.
data_points = frames['opc.bin0'].dropna().sort_values()

qq = probplot(data_points, dist='uniform')
(theoretical_q, ordered_values), fit = qq
slope, intercept = fit[0], fit[1]

# End points of the fitted reference line
line_x = np.array([theoretical_q[0], theoretical_q[-1]])

fig = go.Figure()
fig.add_scatter(x=theoretical_q, y=ordered_values, mode='markers')
fig.add_scatter(x=line_x, y=intercept + slope*line_x, mode='lines')
fig.layout.update(showlegend=False)
fig.show()

# Statistical Analysis

Here, we make some figures and compute some statistics to better understand which sensors are good to ship and which need more attention. This is an active area of research.

## Outliers by Device

Here, we show simple box plots for each variable of interest (`met.rh`, `met.temp`, `opc.bin0`, `neph.bin0`). This approach will indicate sensors that are, on average, outliers relative to the rest of the group.

In [None]:
fig = make_subplots(rows=2, cols=2)

# (variable, subplot row, subplot column) for each panel
panels = [
    ("met.rh", 1, 1),      # Humidity
    ("met.temp", 1, 2),    # Temperature
    ("opc.bin0", 2, 1),    # OPC Bin0
    ("neph.bin0", 2, 2),   # Neph Bin0
]

# Pivot once: rows are timestamps, columns are (variable, sn)
pivoted = frames.set_index(["timestamp", "sn"]).unstack()

for panel_var, panel_row, panel_col in panels:
    # One column per device for this variable
    wide = pivoted.xs(panel_var, axis=1)

    # Each point is one device's mean over the window; hovering shows its serial number
    fig.add_trace(
        go.Box(
            y=wide.mean(),
            name=panel_var,
            jitter=0.05,
            pointpos=-1.5,
            marker_size=5,
            boxpoints='all',
            hovertext=wide.columns
        ),
        row=panel_row, col=panel_col,
    )

fig.update_layout(height=600, width=800, title_text="{} Comparison".format(batch))

fig.show()

## Coefficient of Variation

Here, we compute the CV for each of these variables and output as a table.

In [None]:
# Pivot once: rows are timestamps, columns are (variable, sn)
pivoted = frames.set_index(["timestamp", "sn"]).unstack()

cv_rows = []
for var in ["met.temp", "met.rh", "opc.bin0", "neph.bin0"]:
    # One column per device for this variable
    wide = pivoted.xs(var, axis=1)

    # Coefficient of variation across devices at each timestamp, NaNs ignored
    cv = wide.apply(variation, nan_policy='omit', axis=1)

    # Summarise the per-timestamp CVs and tag the row with the variable name
    summary = cv.describe().to_dict()
    summary["var"] = var
    cv_rows.append(summary)

rv = pd.DataFrame(cv_rows)

rv

## Outlier Detection

This is an area that needs much more robust analysis from a mathematical perspective. We are most interested in identifying the sensors that are 'noisy', not necessarily those that read high or low. We may want to identify those as well, but it is not as important in the immediate term.

### Method 1 | Mean + StDev

The first approach will find the sensors that have the most individual points that are more than 1x and 2x standard deviations from the mean of the population of sensors. Thus, for each point in time, we compute the mean and standard deviation for the entire group of sensors. We then sum the total number of events where this occurs for each sensor. We can then eliminate sensors based on how frequently they fall outside of this range.

In [None]:
outliers = []

# Lower/upper cutoff frames, one per variable; concatenated after the loop
m1mins = []
m1maxs = []

for var in ["met.temp", "met.rh", "opc.bin0", "neph.bin0"]:
    # Build the frame: rows are timestamps, one column per device
    wide = frames.set_index(["timestamp", "sn"]).unstack().xs(var, axis=1)

    # Centre and spread of the fleet at each timestamp.
    # NOTE(review): the centre is the per-timestamp *median*, although this
    # method is titled "Mean + StDev" -- confirm which is intended.
    center = wide.median(axis=1)
    spread = wide.std(axis=1)

    # Cutoffs are computed once per variable rather than once per device
    lower = center - 2*spread
    upper = center + 2*spread

    m1mins.append(lower.to_frame(name=var))
    m1maxs.append(upper.to_frame(name=var))

    # Per device: readings outside the band, and non-NaN readings
    outside = wide.lt(lower, axis=0) | wide.gt(upper, axis=0)
    noutliers = outside.sum()
    nobs = wide.count()

    outliers.append(pd.DataFrame({
        "sn": wide.columns,
        "nobs": nobs.to_numpy(),
        "outliers": noutliers.to_numpy(),
        "outliers.pct": (100. * noutliers / nobs).round(1).to_numpy(),
        "var": var,
    }))

outliers = pd.concat(outliers, ignore_index=True)

# Cutoff tables with one column per variable plus an explicit timestamp column
m1min = pd.concat(m1mins, axis=1)
m1min["timestamp"] = m1min.index
m1max = pd.concat(m1maxs, axis=1)
m1max["timestamp"] = m1max.index

# Output the Figure
fig = px.scatter(outliers, 
         x="outliers.pct", y="var", color="sn", 
         log_x=True, height=500, width=700,
         title="Determining Outliers for {} using a Mean + StDev Approach".format(batch),
)

fig.show()

In [None]:
# Pool of named colours; the bounds-overlay figure further down draws a random
# colour per device from this list.
colorlist = ['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'grey', 'green', 'greenyellow', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgrey', 'lightgreen', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen']

### Method 2 | 1.5*IQR

Here, we do the same thing as above, but instead of using the mean + 2*stdev to determine outliers, we use 1.5 * IQR.

In [None]:
outliers = []

# Lower/upper cutoff frames, one per variable; concatenated after the loop
m2mins = []
m2maxs = []

for var in ["met.temp", "met.rh", "opc.bin0", "neph.bin0"]:
    # Build the frame: rows are timestamps, one column per device
    wide = frames.set_index(["timestamp", "sn"]).unstack().xs(var, axis=1)

    # Quartiles of the fleet at each timestamp and the resulting IQR
    q25 = wide.quantile(0.25, axis=1)
    q75 = wide.quantile(0.75, axis=1)
    iqr = q75 - q25

    # Fences are computed once per variable rather than once per device
    lower = q25 - 1.5*iqr
    upper = q75 + 1.5*iqr

    m2mins.append(lower.to_frame(name=var))
    m2maxs.append(upper.to_frame(name=var))

    # Per device: readings outside the fences, and non-NaN readings
    outside = wide.lt(lower, axis=0) | wide.gt(upper, axis=0)
    noutliers = outside.sum()
    nobs = wide.count()

    outliers.append(pd.DataFrame({
        "sn": wide.columns,
        "nobs": nobs.to_numpy(),
        "outliers": noutliers.to_numpy(),
        "outliers.pct": (100. * noutliers / nobs).round(1).to_numpy(),
        "var": var,
    }))

outliers = pd.concat(outliers, ignore_index=True)

# Fence tables with one column per variable plus an explicit timestamp column
m2min = pd.concat(m2mins, axis=1)
m2min["timestamp"] = m2min.index
m2max = pd.concat(m2maxs, axis=1)
m2max["timestamp"] = m2max.index

# Devices ordered by neph.bin0 outlier percentage, worst first
sns = outliers[outliers["var"]=="neph.bin0"].sort_values(by = "outliers.pct", ascending = False)["sn"].reset_index(drop = True)

# Output the Figure
fig = px.scatter(outliers, 
         x="outliers.pct", y="var", color="sn", 
         log_x=True, height=500, width=700,
         title="Determining Outliers for {} using a 1.5*IQR Approach".format(batch),
)

fig.show()



In [None]:
fig = go.Figure()

# Keep every 10th timestamp to thin the figure
times = m1min.index[::10]

# One randomly drawn colour per device; zip pairs each serial number with
# exactly one draw
models = frames["sn"].drop_duplicates()
colorsIdx = dict(zip(models, random.choices(colorlist, k=len(models))))

# Readings at the retained timestamps (filtered once and reused below)
shown = frames[frames["timestamp"].isin(times)]
cols = shown["sn"].map(colorsIdx)

# Add traces
fig.add_trace(go.Scattergl(x=shown["timestamp"],
                    y=shown["neph.bin0"],
                    text=shown["sn"],
                    hoverinfo='text',
                    mode='markers',
                    name='markers',
                    marker=dict(color=cols)))

# Cutoff lines: method 1 bounds in green, method 2 bounds in red
bound_traces = [
    (m1min, 'green', 'min1'),
    (m1max, 'green', 'max1'),
    (m2min, 'red', 'min2'),
    (m2max, 'red', 'max2'),
]
for bound, line_color, trace_name in bound_traces:
    fig.add_trace(go.Scattergl(x=times,
                        y=bound.loc[times, 'neph.bin0'],
                        line_color=line_color,
                        mode='lines',
                        name=trace_name))

fig.show()

## Method 3 | Neighbors
Here we look at how far each point is from its nearest neighbor.

In [None]:
def nn(x):
    """Distance from each reading in a group to its nearest neighbouring reading.

    Parameters
    ----------
    x : pandas.DataFrame
        One group of rows containing "timestamp", "sn" and a single value
        column (assumes exactly one value column -- the values are flattened
        and matched to rows by position).

    Returns
    -------
    pandas.DataFrame
        Columns "timestamp", "sn" and "dist", one row per input row. "dist" is
        NaN for rows whose value is NaN, and for every row when fewer than two
        non-NaN values are available (there is no neighbour to measure to).
        The same columns are returned in every case so the result can always
        be merged back on ["timestamp", "sn"].
    """
    values = x.drop(['timestamp', 'sn'], axis=1).to_numpy(dtype=float).ravel()
    valid = ~np.isnan(values)

    dist = np.full(len(values), np.nan)
    if valid.sum() >= 2:
        points = values[valid].reshape(-1, 1)
        # Each point is counted as its own neighbour (distance 0), so
        # n_neighbors=2 gives the single nearest other point; summing the two
        # distances therefore yields that nearest-neighbour distance.
        nbrs = NearestNeighbors(n_neighbors=2,
                                algorithm='auto',
                                metric='euclidean'
                               ).fit(points)
        distances, _ = nbrs.kneighbors(points)
        dist[valid] = distances.sum(axis=1)

    # Copy so that adding "dist" never writes into a slice of the caller's frame
    out = x[["timestamp", "sn"]].copy()
    out["dist"] = dist
    return out

In [None]:
# Compute nearest-neighbour distances within each timestamp, then join them
# back onto the original readings
readings = frames[['timestamp', 'sn', 'neph.bin0']]
dist_by_timestamp = frames[['timestamp', 'neph.bin0', 'sn']].groupby('timestamp').apply(nn)
frames_dist = readings.merge(
    dist_by_timestamp,
    how='left',
    on=['timestamp', 'sn'],
).sort_values(by=["timestamp", "sn"])

In [None]:
# Take powers of the distances
powers = (2, 3, 4)
for p in powers:
    frames_dist['d{}'.format(p)] = frames_dist['dist']**p

# Average within each device: the plain mean distance, plus the p-th root of
# the mean p-th power for p = 2, 3, 4. All columns are read from `frames_dist`.
by_sn = frames_dist.groupby('sn')
sn_df = by_sn[['dist']].mean()
for p in powers:
    moment_col = 'd{}'.format(p)
    sn_df[moment_col] = by_sn[moment_col].mean()**(1/p)

In [None]:
# Serial numbers ranked by the d2 column, largest (most suspect) first
ranked = sn_df.sort_values(by='d2', ascending=False)
sns = pd.Series(ranked.index)

## Method 4 | Difference
Here we look at the squared difference between successive values, averaged over each device.

In [None]:
# Sensor columns to difference, in lexicographic order
sensor_list = sorted(
    ['met.pressure', 'met.rh', 'met.temp']
    + ['neph.bin{}'.format(i) for i in range(6)]
    + ['neph.pm1', 'neph.pm10', 'neph.pm25']
    + ['opc.bin{}'.format(i) for i in range(24)]
    + ['opc.pm1', 'opc.pm10', 'opc.pm25', 'opc.rh', 'opc.temp']
)

# Differences must be taken in time order
frames = frames.sort_values(by = ["timestamp"])
# Within each device, for each measurement: squared difference from the
# previous measurement
for col in sensor_list:
    frames[col+".d"] = frames.groupby("sn")[col].diff()**2

# Re-sort by device then time. sort_values returns a new frame, so the result
# has to be assigned for the re-sort to take effect.
frames = frames.sort_values(by = ["sn","timestamp"])

# Average within each device and measurement
sn_df = frames[["sn"] + [col + ".d" for col in sensor_list]].groupby("sn").mean()
# Ordered list of devices with most suspect first
sns = pd.Series(sn_df.sort_values(by=["neph.bin0.d"], ascending=False).index) 

## Tool
A graphical tool to identify the problem sensors

In [None]:
# Create figure
fig = go.Figure()
# To browse devices in arbitrary order instead of the ranked `sns`, use:
# sns = frames["sn"].drop_duplicates().reset_index(drop=True)
var = "neph.bin0"

# Trace 0: every reading from every device, drawn faintly as background
fleet_trace = go.Scattergl(
    visible=True,
    mode='markers',
    marker=dict(opacity=0.2),
    name="fleet",
    x=frames["timestamp"],
    y=frames[var],
)
fig.add_trace(fleet_trace)

# Traces 1..N: one hidden trace per device, in the order given by `sns`
for unit_sn in sns:
    unit_rows = frames[frames["sn"] == unit_sn]
    fig.add_trace(
        go.Scattergl(
            visible=False,
            name=str(unit_sn),
            mode='markers',
            x=unit_rows["timestamp"],
            y=unit_rows[var]))

# Start with the first device visible on top of the fleet
fig.data[1].visible = True

# One menu entry per device: show the fleet trace plus that device's trace
n_traces = len(fig.data)
steps = []
for i in range(n_traces - 1):
    showing = [(j == 0) or (j == i + 1) for j in range(n_traces)]
    steps.append(dict(
        label=sns[i],
        method="update",
        args=[{"visible": showing},
              {"title": "Unit: " + str(sns[i])}],  # layout attribute
    ))

# Passed to the layout as an update menu (dropdown of buttons)
sliders = [dict(
    active=0,
    font={"size":10},
    buttons=steps
)]

fig.update_layout(
    updatemenus=sliders
)

fig.show()

## OPC Data

In [None]:
# First ten OPC bins, indexed by (sn, timestamp)
bin_cols = ['opc.bin{}'.format(i) for i in range(10)]
opc_full = frames[bin_cols + ['sn', "timestamp"]].copy().set_index(["sn", "timestamp"])

# Per-device total of the mean bin values, and the factor that rescales each
# device's total to the fleet-average total
device_totals = opc_full.groupby(level=0).mean().sum(axis=1)
sns_corrections = device_totals.mean() / device_totals

In [None]:
def _scale_by_device(column):
    """Multiply one bin column by each device's correction factor (matched on index level 0)."""
    return column.mul(sns_corrections, level=0)

# Rescaled readings, then the per-device mean of each rescaled bin
opc_full_reg = opc_full.apply(_scale_by_device)
opc_sns_reg = opc_full_reg.groupby(level="sn").mean()

In [None]:
# Number of rows, across all devices, inside the time window.
# .loc is indexed with square brackets; the row key is (all sn, time slice).
# sort_index() is needed because slicing a MultiIndex requires it to be lexsorted.
# NOTE(review): this window (2021-05-08) lies outside the start/end configured
# at the top of the notebook -- confirm the intended dates.
len(opc_full.sort_index().loc[(slice(None), slice('2021-05-08 14:30:00', '2021-05-08 15:30:00')), :])

In [None]:
# All devices, restricted to a one-hour window.
# sort_index() first: slicing a MultiIndex by label range requires a lexsorted
# index, and opc_full inherits whatever row order `frames` had.
# NOTE(review): this window (2021-05-08) lies outside the start/end configured
# at the top of the notebook -- confirm the intended dates.
opc_sub = opc_full.sort_index().loc[(slice(None),slice('2021-05-08 14:30:00','2021-05-08 15:30:00')),:]

In [None]:
# Scratch check: how the window start string parses
pd.Timestamp('2021-05-08 14:30:00')

In [None]:
# Scratch check (repeat of the previous cell): how the window start string parses
pd.Timestamp('2021-05-08 14:30:00')

In [None]:
# Total number of (sn, timestamp) rows
opc_full.shape[0]

In [None]:
df = opc_sub
# opc.bin0 against opc.bin1 for every reading in the window, coloured by device.
# The title names the variables actually plotted.
fig = px.scatter(df, x="opc.bin0", y="opc.bin1", color=df.index.get_level_values(0), title="OPC Bin0 vs Bin1 Comparison", render_mode="webgl")

# Two grey reference lines through the origin: bin1/bin0 = 0.5 and 0.5625
for ref_x, ref_y in ([[100, 180], [50, 90]], [[100, 160], [56.25, 90]]):
    fig.add_trace(
        go.Scatter(
            x=ref_x,
            y=ref_y,
            mode="lines",
            line=go.scatter.Line(color="gray"),
            showlegend=False)
    )

# NOTE(review): height=20 is forwarded to the renderer; confirm it is intended.
fig.show(height=20)

In [None]:
# Per-device means of the rescaled bins: opc.bin1 against opc.bin2, one point
# per device. The title names the variables actually plotted.
fig = px.scatter(opc_sns_reg, x="opc.bin1", y="opc.bin2", color=opc_sns_reg.index.get_level_values(0), title="OPC Bin1 vs Bin2 Comparison", render_mode="webgl")
fig.show()

In [None]:
# Display the per-device means of the rescaled OPC bins
opc_sns_reg

In [None]:


import dash
import dash_core_components as dcc
import dash_html_components as html

In [None]:
# Serve the most recently built figure (`fig`) from a minimal Dash app
app = dash.Dash(__name__)

graph = dcc.Graph(id="graph", figure=fig)
app.layout = html.Div([graph])

app.run_server(debug=True)

In [None]:
# NOTE(review): leftover debugging magic -- re-prints the last traceback.
%tb

In [None]:
# First ten OPC bins plus the serial number (this rebinds `opc_full`, now
# without the timestamp), then the per-device mean of each bin
bin_cols = ['opc.bin{}'.format(i) for i in range(10)]
opc_full = frames[bin_cols + ['sn']].copy()
opc_sns = opc_full.groupby("sn").mean()

In [None]:
# `opc_sns` holds only the per-device bin means, so the PM columns plotted here
# are averaged per device directly from `frames`.
opc_pm_sns = frames[["sn", "opc.pm1", "opc.pm10", "opc.pm25"]].groupby("sn").mean()

fig = px.scatter(opc_pm_sns, x="opc.pm1", y="opc.pm10", color="opc.pm25", title="OPC Sorting", render_mode="webgl")

fig.show()

In [None]:
# Attach each row's device-mean opc.bin0 as its "correction" value
bin0_by_sn = opc_sns["opc.bin0"].to_dict()
opc_full["correction"] = opc_full["sn"].map(bin0_by_sn)

In [None]:
# Attach each row's device-mean opc.bin0 as its "correction" value (repeat of the previous cell)
bin0_by_sn = opc_sns["opc.bin0"].to_dict()
opc_full["correction"] = opc_full["sn"].map(bin0_by_sn)

In [None]:
bin_cols = ['opc.bin{}'.format(i) for i in range(10)]

# Per-device total over the raw bins only. It is computed once, before any
# ".n1" column exists, so every bin is divided by the same denominator and
# re-running the cell gives the same result.
bin_total = opc_sns[bin_cols].sum(axis=1)

# Each bin as a fraction of the device's total
for col in bin_cols:
    opc_sns[col + ".n1"] = opc_sns[col] / bin_total

In [None]:
# Row-wise sum over all current columns of opc_sns, one value per device
opc_sns.sum(axis="columns")

In [None]:
# List the columns currently present in opc_sns
opc_sns.columns

In [None]:
# Correlation between the normalized bins 1-9 across devices
n1_cols = ['opc.bin{}.n1'.format(i) for i in range(1, 10)]
opc_sns[n1_cols].corr()

In [None]:
# Expose the serial number (the index) as a regular column for melting
opc_sns = opc_sns.assign(sn=opc_sns.index)

In [None]:
# Long format: one row per (device, normalized bin) for bins 1-9
n1_cols = ['opc.bin{}.n1'.format(i) for i in range(1, 10)]
opc_sns_long = opc_sns.melt(
    id_vars=["sn"],
    value_vars=n1_cols,
    var_name='bin',
)

In [None]:
# Map each normalized-bin column name to its integer bin number (1-9)
renameing = {'opc.bin{}.n1'.format(i): i for i in range(1, 10)}

In [None]:
# Replace the column-name labels in "bin" with their integer bin numbers
bin_numbers = opc_sns_long["bin"].map(renameing)
opc_sns_long["bin"] = bin_numbers

In [None]:
# One line per device: normalized bin fraction against bin number, log y axis
fig = px.line(
    opc_sns_long,
    x="bin",
    y="value",
    color="sn",
    line_group="sn",
    title="OPC Sorting",
    render_mode="webgl",
    log_y=True,
)

fig.show()

In [None]:
# Scratch check of DataFrame.corr() on a tiny table: columns 0 and 1 rise
# together, column 2 falls as they rise
toy_rows = [[1, 3, 3], [2, 4, 2], [3, 5, 1]]
pd.DataFrame(toy_rows).corr()