## Analyse (simulated) sensor data

### Setup

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import pandera as pa

In [None]:
# Directory the data files are read from: the "data" folder located next to
# (i.e. sharing a parent with) the current working directory.
DATA_PATH = Path.cwd().parent.joinpath("data")

In [None]:
# IPython shell escape: run `ls` on the data directory to list its contents.
!ls $DATA_PATH

## Custom functions

In [None]:
def plot_data_histogram(df, plot_column_name, normalise=True):
    """Plot a 10-bin histogram of a single column of ``df``.

    Args:
        df: Container indexed by column name (e.g. a pandas DataFrame).
        plot_column_name: Name of the column to plot; also used in the
            figure title and as the x-axis label.
        normalise: Forwarded to ``plt.hist`` as ``density``. When true the
            bar heights form a probability density instead of raw counts.

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(
        df[plot_column_name],
        bins=10,
        color="blue",
        edgecolor="black",
        density=normalise,
    )
    plt.title(f"Histogram of {plot_column_name}")
    plt.xlabel(f"{plot_column_name}")
    # With density=True the bar heights are densities, not counts, so the
    # axis label has to follow the flag to avoid mislabelling the plot.
    plt.ylabel("Density" if normalise else "Frequency")
    plt.show()
    return None

In [None]:
def calc_summary_stats(df):
    """Display descriptive statistics for the two measurement columns.

    Runs ``describe()`` on the ``"time_diff"`` and ``"Sensor value"``
    columns of ``df`` and shows the result transposed, so each of those
    columns becomes one row of the summary table.

    Nothing is returned; the table is only passed to ``display``.
    NOTE(review): ``display`` is not imported in this file - presumably it
    is supplied by the notebook environment; confirm before using this
    function from a plain script.
    """
    columns_of_interest = ["time_diff", "Sensor value"]
    summary = df[columns_of_interest].describe()
    display(summary.T)

## Data Validation

In [None]:
def validate_data(df):
    """Validate ``df`` against the expected sensor-data schema.

    The schema requires two float columns:

    * ``"time_diff"``    - every value must be >= 0
    * ``"Sensor value"`` - every value must be > 0

    Args:
        df: DataFrame to check.

    Returns:
        Whatever pandera's schema validation returns for ``df``.

    NOTE(review): per the pandera documentation a failed check raises
    ``pandera.errors.SchemaError`` rather than returning - callers should
    be prepared for that.
    """
    column_rules = {
        "time_diff": pa.Column(float, checks=pa.Check.ge(0)),
        "Sensor value": pa.Column(float, checks=pa.Check.gt(0)),
    }
    sensor_schema = pa.DataFrameSchema(column_rules)
    return sensor_schema.validate(df)

## Analyse Sensor Data

- Look at pulling the data from the parquet files into DuckDB for analysis
- And/or use Polars for analysis
- And/or use Pandas for analysis

Also take a look at `pandera` for data validation, to get early warning of data issues

https://pandera.readthedocs.io/en/stable/

Example: make sure that all of the sensor data is within a certain range

In [None]:
# Read every CSV file in DATA_PATH into its own DataFrame and collect them
# in `data_df` (a list of DataFrames, one per file).
data_df = []
# sorted() makes the load order deterministic; the order produced by glob()
# depends on the filesystem.
for csv_file in sorted(DATA_PATH.glob("*.csv")):
    print(f"Reading: {csv_file}")
    df = pd.read_csv(
        csv_file,
        date_format="%Y-%m-%d %H:%M:%S.%f",
        parse_dates=["Collection time"],
        index_col=0,
    )
    # Only the first row's time_diff is expected to be missing (presumably
    # because it has no preceding reading), so zero-fill exactly that cell.
    # A blanket fillna(limit=1) would also overwrite the first NaN in every
    # other column and hide genuinely missing readings from validation.
    if not df.empty and pd.isna(df["time_diff"].iloc[0]):
        df.iloc[0, df.columns.get_loc("time_diff")] = 0
    data_df.append(df)

In [None]:
# Validate each loaded DataFrame; summarise and plot only the ones that pass.
for df in data_df:
    # pandera signals a failed check by raising SchemaError, not by
    # returning a different frame, so the failure has to be caught here.
    # Otherwise the error message is never printed and the first invalid
    # file aborts the loop.
    try:
        validate_data(df)
    except pa.errors.SchemaError as err:
        print("ERROR: Data is invalid")
        print(err)  # show which check failed
        continue

    print("INFO: Data passes validation checks")
    calc_summary_stats(df)
    plot_data_histogram(df, "time_diff")

In [None]:
# Evaluate the installed pandas version string (shown as the cell output).
pd.__version__