In [2]:
import boto3
from os import path, makedirs
from datetime import datetime
import logging
from logging import Logger
import pandas as pd
from tqdm import tqdm

In [3]:
# S3 bucket and paths
BUCKET_NAME = "oedi-data-lake"

PARENT_PREFIX = "pvdaq/parquet/"
SITE_PREFIX = PARENT_PREFIX + "site/"
MOUNT_PREFIX =  PARENT_PREFIX + "mount/"

METRICS_PREFIX = PARENT_PREFIX + "metrics/metrics__system_{ss_id}"
PV_PREFIX = PARENT_PREFIX + "pvdata/system_id={ss_id}/year={year}/month={month}/day={day}/"

In [4]:
class PVExtract:

    def __init__(
            self,
            aws_access_key_id: str,
            aws_secret_access_key: str,
            region_name: str,
            staging_area: str,
            logger: Logger
            ):
        """
            Initializes the Extract step of the data pipeline
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name

        self.s3 = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name
        )

        self.staging_area = staging_area
        self.logger = logger


    def s3_download(self, key: str, filename: str):
        """
            Given an AWS S3 file key, downloads it.
            File is named filename.
            Assumes filename has valid file path. (director already exists)
        """
        self.s3.download_file(BUCKET_NAME, key, filename)


    def extract_metadata(self) -> None:
        """
            Extracts PV system metadata
        """
        site_object = self.s3.list_objects(Bucket=BUCKET_NAME, Prefix=SITE_PREFIX, Delimiter="/")
        makedirs(self.staging_area, exist_ok=True)

        try:
            site_key = site_object["Contents"][0]["Key"]
            self.s3_download(site_key, path.join(self.staging_area, f"metadata.parquet"))
        except Exception as error:
            self.logger.error(f"Error while extracting metadata: \n{error}")

        mount_object = self.s3.list_objects(Bucket=BUCKET_NAME, Prefix=MOUNT_PREFIX, Delimiter="/")
        try:
            mount_key = mount_object["Contents"][0]["Key"]
            self.s3_download(mount_key, path.join(self.staging_area, f"mount.parquet"))
        except Exception as error:
            self.logger.error(f"Error while extracting mount data: \n{error}")


    def extract_metrics(self, ss_id: int) -> None:
        """
            Extracts Metrics given an ss_id
        """
        metrics_aws_path = METRICS_PREFIX.replace("{ss_id}", str(ss_id))
        metrics_object = self.s3.list_objects(Bucket=BUCKET_NAME, Prefix=metrics_aws_path, Delimiter="/")
        try:
            metrics_key = metrics_object["Contents"][0]["Key"]
            local_dir = path.join(self.staging_area, f"system_{ss_id}")
            makedirs(local_dir, exist_ok=True)
            self.s3_download(metrics_key, path.join(local_dir, f"metrics_system{ss_id}.parquet"))
        except Exception as error:
            self.logger.error(f"Error while extracting metrics for Site {ss_id}: \n{error}")


    def extract_pv_data(self, ss_id: int, date: datetime) -> None:
        """
            Extracts PV data given an ss_id and date
        """
        pv_aws_path = PV_PREFIX.replace("{ss_id}", str(ss_id)).replace("{year}", str(date.year)).replace("{month}", str(date.month)).replace("{day}", str(date.day))
        pv_object = self.s3.list_objects(Bucket=BUCKET_NAME, Prefix=pv_aws_path, Delimiter="/")
        try:
            pv_data_key = pv_object["Contents"][0]["Key"]
            local_dir = path.join(self.staging_area, f"system_{ss_id}", "pv_data")
            makedirs(local_dir, exist_ok=True)
            self.s3_download(pv_data_key, path.join(local_dir, f"pv_data_system{ss_id}_{date.strftime('%Y-%m-%d')}.parquet"))
        except Exception as error:
            self.logger.error(f"Error while extracting PV data for Site {ss_id} on {date}: \n {error}")


    def extract(self, ss_id: int, start_date: datetime, end_date: datetime) -> None:
        """
            Extracts PV data and associated metrics, metadata and mount data for a given ss_id and date
        """
        # create staging area if it does not exist
        makedirs(self.staging_area, exist_ok=True)

        # check if system metadata exists, if not extract
        if not path.isfile(path.join(self.staging_area, "metadata.parquet")):
            self.logger.info("Metadata is not available. Extracting from source...")
            self.extract_metadata()

        # check if metadata exists, if not extract
        if not path.isfile(path.join(self.staging_area, f"system_{ss_id}", f"metrics_system{ss_id}.parquet")):
            self.logger.info(f"Metrics for System {ss_id} are not available. Extracting from source...")
            self.extract_metrics(ss_id)
        
        # extract pv
        self.logger.info(f"Extracting PV data for System {ss_id} for dates: {start_date} to {end_date}")
        for date in tqdm(pd.date_range(start=start_date, end=end_date)):
            self.extract_pv_data(ss_id, date)

In [5]:
log = logging.getLogger(__name__)

extractor = PVExtract(
        aws_access_key_id="AKIA4MTWG33OOIEEML5D",
        aws_secret_access_key="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",
        region_name="us-west-2",
        staging_area="./data/",
        logger=log
    )

In [6]:
extractor.extract_metadata()

In [7]:
extractor.extract(10, datetime(2010, 1, 1), datetime(2010, 1, 2))

  0%|          | 0/2 [00:00<?, ?it/s]Error while extracting PV data for Site 10 on 2010-01-01 00:00:00: 
 'Contents'
Error while extracting PV data for Site 10 on 2010-01-02 00:00:00: 
 'Contents'
100%|██████████| 2/2 [00:00<00:00, 11.72it/s]


In [57]:
metadata_cols = ["system_id", "latitude", "longitude", "elevation", "av_pressure", "av_temp", "climate_type"]
metadata = pd.read_parquet("./staging_data/metadata.parquet", columns=metadata_cols)

mount_cols = ["system_id", "azimuth", "tilt"]
mount_data = pd.read_parquet("./staging_data/mount.parquet", columns=mount_cols)

In [58]:
metadata.head()

Unnamed: 0,system_id,latitude,longitude,elevation,av_pressure,av_temp,climate_type
0,1339,29.9745,-90.1563,10.0,1020.0,26.0,Cfa
1,1349,30.0017,-90.0926,10.0,1020.0,26.0,Cfa
2,1351,29.9809,-90.0023,10.0,1020.0,26.0,Cfa
3,1352,29.9809,-90.0023,10.0,1020.0,26.0,Cfa
4,1274,30.0017,-90.0926,10.0,1020.0,26.0,Cfa


In [59]:
mount_data.head()

Unnamed: 0,system_id,azimuth,tilt
0,1261,195.0,15.0
1,1328,,
2,1262,195.0,30.0
3,1260,195.0,30.0
4,1267,190.0,30.0


Unnamed: 0,system_id,latitude,longitude,elevation,av_pressure,av_temp,climate_type,azimuth,tilt
0,1339,29.9745,-90.1563,10.0,1020.0,26.0,Cfa,199.0,27.0
1,1349,30.0017,-90.0926,10.0,1020.0,26.0,Cfa,,
2,1351,29.9809,-90.0023,10.0,1020.0,26.0,Cfa,195.0,15.0
3,1352,29.9809,-90.0023,10.0,1020.0,26.0,Cfa,195.0,30.0
4,1274,30.0017,-90.0926,10.0,1020.0,26.0,Cfa,180.0,15.0
...,...,...,...,...,...,...,...,...,...
153,1218,40.995,-73.668,75.0,1020.0,11.0,Cfa,180.0,5.0
154,1224,40.995,-73.668,75.0,1020.0,11.0,Cfa,180.0,10.0
155,1429,35.0549,-106.5433,1658.0,820.0,12.5,Bsk,180.0,35.0
156,1403,28.405,-80.7709,12.0,820.0,22.0,Cfa,180.0,35.0
