## imports

In [1]:
from os import environ as ENV
from datetime import datetime, timezone, timedelta
from dotenv import load_dotenv
from pymssql import connect
import pandas as pd
from boto3 import client

## load data in

In [2]:
load_dotenv()

True

In [3]:
def get_db_connection(config: dict) -> connect:
    """Returns database connection."""

    return connect(
        server=config["DB_HOST"],
        port=config["DB_PORT"],
        user=config["DB_USER"],
        database=config["DB_NAME"],
        password=config["DB_PASSWORD"],
        as_dict=True
    )

In [4]:
connection = get_db_connection(ENV)

In [5]:
def get_df(conn: connect) -> pd.DataFrame:
    """Returns a Dataframe of method data from database."""

    query = """ 
            SELECT *
            FROM s_beta.recording AS r
            FULL JOIN s_beta.plant AS p
                ON r.plant_id = p.plant_id
            """
    
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    df = pd.DataFrame(rows)[
        ["plant_id", "plant_name", "scientific_name", "recording_taken", "soil_moisture", "temperature"]]
    
    return df

In [6]:
data = get_df(connection)

In [7]:
data = data.astype({"soil_moisture":"float64",
                    "temperature":"float64"})

In [8]:
data['recording_taken'] = pd.to_datetime(data['recording_taken'], utc=True)

In [9]:
data.dtypes

plant_id                         int64
plant_name                      object
scientific_name                 object
recording_taken    datetime64[ns, UTC]
soil_moisture                  float64
temperature                    float64
dtype: object

In [10]:
data

Unnamed: 0,plant_id,plant_name,scientific_name,recording_taken,soil_moisture,temperature
0,0,Epipremnum Aureum,Epipremnum aureum,2024-04-17 09:44:01+00:00,31.4063,13.1995
1,1,Venus flytrap,,2024-04-17 09:44:01+00:00,29.7548,12.0470
2,2,Corpse flower,,2024-04-17 09:44:00+00:00,35.9540,9.1711
3,3,Rafflesia arnoldii,,2024-04-17 09:44:04+00:00,34.6505,10.0300
4,4,Black bat flower,,2024-04-17 09:44:10+00:00,26.7741,11.3474
...,...,...,...,...,...,...
1969,45,Begonia,Begonia 'Art Hodes',2024-04-17 10:42:41+00:00,28.1450,9.0428
1970,46,Medinilla Magnifica,Medinilla magnifica,2024-04-17 10:42:43+00:00,32.0404,13.3503
1971,47,Calliandra Haematocephala,Calliandra haematocephala,2024-04-17 10:42:42+00:00,-9.0637,13.3597
1972,48,Zamioculcas Zamiifolia,Zamioculcas zamiifolia,2024-04-17 10:42:41+00:00,29.0108,14.9089


## transform data

### summary

In [11]:
def get_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Gets 1 mean per parameter per plant.
    Returns pd.DF."""
    
    df = df.drop(columns=["recording_taken"])
    
    df = df.groupby(["plant_id", "plant_name", "scientific_name"],as_index=False
                    ).agg(["mean", "std", "min", "max"]
                          ).droplevel(1, axis=1)
    
    df.columns = ['plant_id', 'plant_name', 'scientific_name'] + \
        [param+"_"+metric
         for param in ['soil_moisture', 'temperature']
         for metric in ["mean", "std", "min", "max"]]
    
    return df

In [12]:
summary = get_summary(data)

In [13]:
summary

Unnamed: 0,plant_id,plant_name,scientific_name,soil_moisture_mean,soil_moisture_std,soil_moisture_min,soil_moisture_max,temperature_mean,temperature_std,temperature_min,temperature_max
0,0,Epipremnum Aureum,Epipremnum aureum,29.252351,0.784021,28.0125,31.4063,13.718085,2.129164,13.133,24.783
1,5,Pitcher plant,Sarracenia catesbaei,33.861864,0.764146,32.6604,35.9741,11.183536,0.020549,11.1562,11.2455
2,6,Wollemi pine,Wollemia nobilis,30.736815,0.751684,29.5543,32.793,11.3002,2.155324,10.9444,24.7641
3,8,Bird of paradise,Heliconia schiedeana 'Fire and Ice',24.968307,0.800493,23.685,27.1754,11.519598,0.027352,11.4838,11.6025
4,9,Cactus,Pereskia grandifolia,29.92502,0.790625,28.7012,32.0963,10.75922,1.045374,10.5765,17.289
5,11,Asclepias Curassavica,Asclepias curassavica,27.892567,14.322608,-62.5927,32.2118,11.45101,10.619034,9.4339,77.8178
6,14,Colocasia Esculenta,Colocasia esculenta,32.059973,0.768736,30.8823,34.1746,14.919751,9.912366,12.7846,75.9186
7,16,Euphorbia Cotinifolia,Euphorbia cotinifolia,-5.49695,1.155034,-7.2978,-2.2859,13.34364,0.494299,12.6945,16.4157
8,17,Ipomoea Batatas,Ipomoea batatas,29.056537,0.787044,27.8402,31.2443,9.943915,1.100707,9.7386,16.8184
9,19,Musa Basjoo,Musa basjoo,27.385027,14.777252,-64.8078,31.7957,16.1584,12.745316,13.8426,94.6848


### anomalies

In [14]:
anomalies = data.copy(deep=True)

In [15]:
def get_std(row: dict, df: pd.DataFrame, col: str) -> int:
    """Compare minutely value to mean of past hour;
    Returns std."""
    
    last_hour = pd.Timestamp(datetime.now(timezone.utc)-timedelta(hours=1))
    # df['recording_taken'] = pd.to_datetime(df['recording_taken'], utc=True)
    last_hour_vals = df[(df["plant_id"] == row["plant_id"]) &
                        (df["recording_taken"] >= last_hour)][col]
    
    mean = last_hour_vals.mean()
    std = last_hour_vals.std()
    
    nstd = (row[col] - mean) / std
    
    return nstd

In [16]:
anomalies["soil_moisture_nstd"] = data.apply(get_std,
                                             args=(data, "soil_moisture"),
                                             axis=1)

In [17]:
anomalies["temperature_nstd"] = data.apply(get_std,
                                           args=(data, "temperature"),
                                           axis=1)

In [18]:
anomalies = anomalies[(anomalies["soil_moisture_nstd"] <= -2.5) |
                      (anomalies["soil_moisture_nstd"] >= 2.5) |
                      (anomalies["temperature_nstd"] <= -2.5) |
                      (anomalies["temperature_nstd"] >= 2.5)]

In [19]:
anomalies

Unnamed: 0,plant_id,plant_name,scientific_name,recording_taken,soil_moisture,temperature,soil_moisture_nstd,temperature_nstd
0,0,Epipremnum Aureum,Epipremnum aureum,2024-04-17 09:44:01+00:00,31.4063,13.1995,2.747309,-0.243563
1,1,Venus flytrap,,2024-04-17 09:44:01+00:00,29.7548,12.0470,2.774694,1.573818
2,2,Corpse flower,,2024-04-17 09:44:00+00:00,35.9540,9.1711,2.775231,-0.200689
3,3,Rafflesia arnoldii,,2024-04-17 09:44:04+00:00,34.6505,10.0300,2.824044,-0.153768
4,4,Black bat flower,,2024-04-17 09:44:10+00:00,26.7741,11.3474,2.905211,-0.132663
...,...,...,...,...,...,...,...,...
1867,36,Tacca Integrifolia,Tacca integrifolia,2024-04-17 10:40:46+00:00,30.5906,27.7299,-1.420957,6.312333
1878,48,Zamioculcas Zamiifolia,Zamioculcas zamiifolia,2024-04-17 10:40:46+00:00,29.1240,15.8822,-1.394196,6.230200
1903,24,Ficus,Ficus carica,2024-04-17 10:41:31+00:00,28.5065,25.3012,-1.473534,6.246752
1927,0,Epipremnum Aureum,Epipremnum aureum,2024-04-17 10:42:27+00:00,28.0125,24.7830,-1.581400,5.196836


## load data

### save to CSVs

### upload to S3

In [None]:
S3 = client('s3',
            aws_access_key_id=ENV["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=ENV["AWS_SECRET_ACCESS_KEY"])

In [None]:
def upload_object(client: client, file: str, bucket: str, key: str) -> None:
    """Upload file to S3 bucket.
    Returns nothing."""
    
    client.upload_file(file, bucket, key)

In [None]:
# upload_object(mean.csv, "late-ordovician", )

## clear database