## imports

In [1]:
from os import environ as ENV
from datetime import datetime, timezone, timedelta
from dotenv import load_dotenv
from pymssql import connect
import pandas as pd
from boto3 import client

## load data in

In [2]:
load_dotenv()

True

In [3]:
def get_db_connection(config: dict) -> connect:
    """Returns database connection."""

    return connect(
        server=config["DB_HOST"],
        port=config["DB_PORT"],
        user=config["DB_USER"],
        database=config["DB_NAME"],
        password=config["DB_PASSWORD"],
        as_dict=True
    )

In [4]:
connection = get_db_connection(ENV)

In [5]:
def get_df(conn: connect) -> pd.DataFrame:
    """Returns a Dataframe of method data from database."""

    query = """ 
            SELECT *
            FROM s_beta.recording AS r
            FULL JOIN s_beta.plant AS p
                ON r.plant_id = p.plant_id
            """
    
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    df = pd.DataFrame(rows)[
        ["plant_id", "plant_name", "scientific_name", "recording_taken", "soil_moisture", "temperature"]]
    
    return df

In [6]:
data = get_df(connection)

In [7]:
data = data.astype({"soil_moisture":"float64",
                    "temperature":"float64"})

In [8]:
data['recording_taken'] = pd.to_datetime(data['recording_taken'], utc=True)

In [9]:
data.dtypes

plant_id                         int64
plant_name                      object
scientific_name                 object
recording_taken    datetime64[ns, UTC]
soil_moisture                  float64
temperature                    float64
dtype: object

In [10]:
data

Unnamed: 0,plant_id,plant_name,scientific_name,recording_taken,soil_moisture,temperature
0,0,Epipremnum Aureum,Epipremnum aureum,2024-04-17 09:44:01+00:00,31.4063,13.1995
1,1,Venus flytrap,,2024-04-17 09:44:01+00:00,29.7548,12.0470
2,2,Corpse flower,,2024-04-17 09:44:00+00:00,35.9540,9.1711
3,3,Rafflesia arnoldii,,2024-04-17 09:44:04+00:00,34.6505,10.0300
4,4,Black bat flower,,2024-04-17 09:44:10+00:00,26.7741,11.3474
...,...,...,...,...,...,...
1404,45,Begonia,Begonia 'Art Hodes',2024-04-17 10:30:45+00:00,28.8367,9.0107
1405,46,Medinilla Magnifica,Medinilla magnifica,2024-04-17 10:30:44+00:00,32.7024,13.3235
1406,47,Calliandra Haematocephala,Calliandra haematocephala,2024-04-17 10:30:39+00:00,-8.0308,13.3452
1407,48,Zamioculcas Zamiifolia,Zamioculcas zamiifolia,2024-04-17 10:30:43+00:00,29.7128,14.8957


## transform data

### summary

In [11]:
def get_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Gets 1 mean per parameter per plant.
    Returns pd.DF."""
    
    df = df.drop(columns=["recording_taken"])
    
    df = df.groupby(["plant_id", "plant_name", "scientific_name"],as_index=False
                    ).agg(["mean", "std", "min", "max"]
                          ).droplevel(1, axis=1)
    
    df.columns = ['plant_id', 'plant_name', 'scientific_name'] + \
        [param+"_"+metric
         for param in ['soil_moisture', 'temperature']
         for metric in ["mean", "std", "min", "max"]]
    
    return df

In [12]:
summary = get_summary(data)

In [13]:
summary

Unnamed: 0,plant_id,plant_name,scientific_name,soil_moisture_mean,soil_moisture_std,soil_moisture_min,soil_moisture_max,temperature_mean,temperature_std,temperature_min,temperature_max
0,0,Epipremnum Aureum,Epipremnum aureum,29.596043,0.613463,28.7117,31.4063,13.51618,1.378186,13.133,18.9927
1,5,Pitcher plant,Sarracenia catesbaei,34.21482,0.598728,33.3443,35.9741,11.17557,0.018648,11.1562,11.2455
2,6,Wollemi pine,Wollemia nobilis,31.067663,0.586784,30.2229,32.793,11.41829,2.520655,10.9444,24.7641
3,8,Bird of paradise,Heliconia schiedeana 'Fire and Ice',25.31638,0.631664,24.4029,27.1754,11.509633,0.024789,11.4838,11.6025
4,9,Cactus,Pereskia grandifolia,30.297279,0.620766,29.3999,32.0963,10.821162,1.244012,10.5765,17.289
5,11,Asclepias Curassavica,Asclepias curassavica,27.355143,16.998831,-62.5927,32.2118,12.240097,12.53676,9.4339,77.8178
6,14,Colocasia Esculenta,Colocasia esculenta,32.420624,0.606287,31.5578,34.1746,15.285662,11.661404,13.0901,75.9186
7,16,Euphorbia Cotinifolia,Euphorbia cotinifolia,-4.96404,0.906478,-6.2762,-2.2859,13.257107,0.10739,12.6945,13.3348
8,17,Ipomoea Batatas,Ipomoea batatas,29.424393,0.623308,28.5476,31.2443,10.005914,1.310406,9.7386,16.8184
9,19,Musa Basjoo,Musa basjoo,26.782776,17.625377,-64.8078,31.7957,17.105276,15.129073,13.8426,94.6848


### anomalies

In [14]:
anomalies = data.copy(deep=True)

In [15]:
def get_std(row: dict, df: pd.DataFrame, col: str) -> int:
    """Compare minutely value to mean of past hour;
    Returns std."""
    
    last_hour = pd.Timestamp(datetime.now(timezone.utc)-timedelta(hours=1))
    df['recording_taken'] = pd.to_datetime(df['recording_taken'], utc=True)
    last_hour_vals = df[df["recording_taken"] >= last_hour][col]
    
    mean = last_hour_vals.mean()
    std = last_hour_vals.std()
    
    nstd = (row[col] - mean) / std
    
    return nstd

In [16]:
anomalies["soil_moisture_nstd"] = data.apply(get_std,
                                             args=(data, "soil_moisture"),
                                             axis=1)

In [17]:
anomalies["temperature_nstd"] = data.apply(get_std,
                                           args=(data, "temperature"),
                                           axis=1)

In [20]:
anomalies = anomalies[(anomalies["soil_moisture_nstd"] <= -2.5) |
                      (anomalies["soil_moisture_nstd"] >= 2.5) |
                      (anomalies["temperature_nstd"] <= -2.5) |
                      (anomalies["temperature_nstd"] >= 2.5)]

Unnamed: 0,plant_id,plant_name,scientific_name,recording_taken,soil_moisture,temperature,soil_moisture_nstd,temperature_nstd
15,16,Euphorbia Cotinifolia,Euphorbia cotinifolia,2024-04-17 09:44:00+00:00,-2.2859,13.3101,-3.545474,0.161872
44,47,Calliandra Haematocephala,Calliandra haematocephala,2024-04-17 09:44:16+00:00,-4.0557,13.3605,-3.748973,0.169840
55,10,"Dragon tree,",,2024-04-17 09:56:16+00:00,27.5994,115.6278,-0.109139,16.336808
61,16,Euphorbia Cotinifolia,Euphorbia cotinifolia,2024-04-17 09:56:22+00:00,-3.3466,13.3348,-3.667437,0.165777
64,19,Musa Basjoo,Musa basjoo,2024-04-17 09:56:27+00:00,31.1110,94.6848,0.294639,13.026025
...,...,...,...,...,...,...,...,...
1329,16,Euphorbia Cotinifolia,Euphorbia cotinifolia,2024-04-17 10:29:29+00:00,-6.1853,13.2880,-3.993843,0.158378
1343,30,Ficus Elastica,Ficus elastica,2024-04-17 10:29:34+00:00,31.8087,64.3420,0.374864,8.229271
1359,47,Calliandra Haematocephala,Calliandra haematocephala,2024-04-17 10:29:38+00:00,-7.9430,13.3441,-4.195951,0.167247
1377,16,Euphorbia Cotinifolia,Euphorbia cotinifolia,2024-04-17 10:30:33+00:00,-6.2762,13.2895,-4.004295,0.158616


## load data

In [None]:
S3 = client('s3',
            aws_access_key_id=ENV["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=ENV["AWS_SECRET_ACCESS_KEY"])

In [None]:
def upload_object(client: client, file: str, bucket: str, key: str) -> None:
    """Upload file to S3 bucket.
    Returns nothing."""
    
    client.upload_file(file, bucket, key)

In [None]:
# upload_object(mean.csv, "late-ordovician", )

## clear database