# featureEngErgot.ipynb

Feature Engineering for ergot and agg_ergot by creating additional columns and processing them.

In [None]:
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os, sys, calendar
from aggregateErgot import calcUIDs  # type: ignore

sys.path.append("../")
from Shared.DataService import DataService
from Shared.GenericQueryBuilder import GenericQueryBuilder

Pseudocode:  
- Load the environment database variables
- Create the ergot data SQL query
- Create the agg_ergot data SQL query
- [Load the data from the database directly into a DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html) 

In [None]:
# Populate the process environment via load_dotenv() (python-dotenv), then
# read the PostgreSQL connection settings from it.
load_dotenv()
PG_DB = os.environ.get("POSTGRES_DB")
PG_ADDR = os.environ.get("POSTGRES_ADDR")
PG_PORT = os.environ.get("POSTGRES_PORT")
PG_USER = os.environ.get("POSTGRES_USER")
PG_PW = os.environ.get("POSTGRES_PW")

# Fail fast: every setting is required, so stop here if any of them is missing
# rather than failing later when the database connection is attempted.
if any(
    setting is None for setting in (PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)
):
    raise ValueError("Environment variables not set")

In [None]:
def pullIndividualErgotSampleData(conn: sq.engine.Connection) -> pd.DataFrame:
    """Load every row of ``public.ergot_sample`` into a DataFrame.

    Args:
        conn: open SQLAlchemy connection the query is executed on.

    Returns:
        The result of ``pd.read_sql`` for a ``SELECT *`` on the table.
    """
    # individual ergot samples, all columns, no filtering
    ergotSampleQuery = sq.text(
        """
        SELECT * FROM public.ergot_sample
        """
    )

    ergotSampleDf = pd.read_sql(ergotSampleQuery, conn)
    return ergotSampleDf

In [None]:
def pullAggErgotData(conn: sq.engine.Connection) -> pd.DataFrame:
    """Load every row of ``public.agg_ergot_sample_v2`` into a DataFrame.

    Args:
        conn: open SQLAlchemy connection the query is executed on.

    Returns:
        The result of ``pd.read_sql`` for a ``SELECT *`` on the table.
    """
    # aggregated ergot samples, all columns, no filtering
    aggErgotQuery = sq.text(
        """
        SELECT * FROM public.agg_ergot_sample_v2
        """
    )

    aggErgotDf = pd.read_sql(aggErgotQuery, conn)
    return aggErgotDf

Purpose:
- Create a table containing the relevant attributes.

In [None]:
def createErgotFeatEngTableV1(db, tablename: str):
    """Create the feature-engineered ergot table and commit the DDL.

    The table holds the sample columns plus the three engineered columns
    (``downgrade``, ``severity_bin_quan``, ``severity_bin_arb``) and is keyed
    on ``(year, district)``.

    Args:
        db: object exposing ``execute(query)``; the statement is sent through it.
        tablename: name of the table to create. It is interpolated directly
            into the SQL text (table and constraint name), so it must be a
            trusted identifier, never user input.

    Returns:
        None.
    """
    createTableStmt = sq.text(
        f"""
        CREATE TABLE {tablename} (
            year                        INT,
            province                    VARCHAR(2),
            crop_district               INT,
            incidence                   BOOL,
            severity                    FLOAT,
            district                    INT,
            downgrade                   BOOL,
            severity_bin_quan           INT,
            severity_bin_arb            INT,

            CONSTRAINT PK_{tablename} PRIMARY KEY(year, district)
        );
        COMMIT;
        """
    )

    db.execute(createTableStmt)

Purpose:
- The purpose of this function is to add a new column called "downgrade" to the input DataFrame, indicating whether each district is considered a "downgrade" district based on its severity level

In [None]:
def calculateDowngradeColumn(df: pd.DataFrame) -> pd.DataFrame:
    """Flag rows whose severity reaches the downgrade threshold.

    Adds (or overwrites) a boolean ``downgrade`` column on ``df`` in place:
    True where ``severity >= 0.04``, False everywhere else.

    Args:
        df: frame with a numeric ``severity`` column; it is modified in place.

    Returns:
        The same ``df`` object, with the ``downgrade`` column set.
    """
    DOWNGRADE_THRESHOLD = 0.04

    meetsThreshold = df["severity"] >= DOWNGRADE_THRESHOLD

    # default everything to False, then switch on the rows at/above threshold
    df["downgrade"] = False
    df.loc[meetsThreshold, "downgrade"] = True
    return df

Purpose:
- The purpose of this function is to add a new column named "severity_bin_quan" to the input DataFrame, which represents the quantile bin numbers of severity levels for each district.

In [None]:
def calculateSeverityBinQuan(df: pd.DataFrame) -> pd.DataFrame:
    """Add a quartile bin number for each row's severity.

    Adds (or overwrites) an integer ``severity_bin_quan`` column on ``df`` in
    place. Rows whose severity is not ``> 0`` keep the default bin 0. Rows with
    positive severity are split into 4 quantile bins computed over the
    positive severities only, labelled 0..3 by ``pd.qcut(..., labels=False)``.

    NOTE(review): because the qcut labels are 0-based, the lowest positive
    quartile shares bin 0 with the non-positive rows - confirm this is intended.

    Args:
        df: frame with a numeric ``severity`` column; it is modified in place.

    Returns:
        The same ``df`` object, with the ``severity_bin_quan`` column set.

    Raises:
        ValueError: propagated from ``pd.qcut`` when the positive severities
            do not yield unique quantile edges.
    """
    NUM_QUANTILES = 4

    df["severity_bin_quan"] = 0
    isPositive = df["severity"] > 0

    # With no positive severities there is nothing to bin, and pd.qcut fails on
    # an empty selection (no quantile edges can be computed). Leave every row
    # in the default bin 0 in that case.
    if isPositive.any():
        df.loc[isPositive, "severity_bin_quan"] = pd.qcut(
            df.loc[isPositive, "severity"], NUM_QUANTILES, labels=False
        )
    return df

Purpose:
- The purpose of this function is to add a new column named "severity_bin_arb" to the input DataFrame df, which represents the bin numbers of severity levels for each district based on arbitrary threshold values. 

In [None]:
def calculateSeverityBinArbitrary(df: pd.DataFrame) -> pd.DataFrame:
    """Add a severity bin number based on fixed thresholds.

    Adds (or overwrites) an integer ``severity_bin_arb`` column on ``df`` in
    place:

    - 0 for severity below 0.02 (the default for every row)
    - 1 for severity >= 0.02
    - 2 for severity >= 0.04
    - 3 for severity >= 0.08

    Args:
        df: frame with a numeric ``severity`` column; it is modified in place.

    Returns:
        The same ``df`` object, with the ``severity_bin_arb`` column set.
    """
    df["severity_bin_arb"] = 0

    # thresholds are applied in ascending order so a higher bin overwrites a
    # lower one for rows that clear several thresholds
    for binNumber, lowerBound in enumerate((0.02, 0.04, 0.08), start=1):
        df.loc[df["severity"] >= lowerBound, "severity_bin_arb"] = binNumber

    return df

Purpose:
- The purpose of the code snippet is to process ergot sample data, calculate additional columns ("downgrade," "severity_bin_quan," "severity_bin_arb"), and store the processed data in a database table. 

In [None]:
def main():
    """Build the engineered ergot features and store them in the database.

    Steps:
    1. Connect to PostgreSQL using the ``PG_*`` settings.
    2. Pull the individual ergot samples and pass them through ``calcUIDs``
       (imported from ``aggregateErgot``; its output is not visible here).
    3. Add the ``downgrade``, ``severity_bin_quan`` and ``severity_bin_arb``
       columns.
    4. Create the destination table if it does not exist, then write the frame.

    The database resources are released through ``db.cleanup()`` whether or
    not an error occurs.

    Raises:
        Exception: any error raised while writing is printed and re-raised.
    """
    TABLENAME = "ergot_sample_feat_eng"

    db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
    conn = db.connect()

    try:
        ergotDf = pullIndividualErgotSampleData(conn)
        ergotDf = calcUIDs(ergotDf)
        ergotDf = calculateDowngradeColumn(ergotDf)
        ergotDf = calculateSeverityBinQuan(ergotDf)
        ergotDf = calculateSeverityBinArbitrary(ergotDf)

        try:
            queryBuilder = GenericQueryBuilder()
            request = sq.text(queryBuilder.tableExistsReq(TABLENAME))
            tableExists = queryBuilder.readTableExists(db.execute(request))

            if not tableExists:
                # the table name is a required argument of the create helper
                createErgotFeatEngTableV1(db, TABLENAME)

            # NOTE(review): if_exists="replace" makes pandas drop and recreate
            # the table, so the schema/primary key created above is not kept -
            # confirm whether "append" (or no pre-creation) is intended.
            ergotDf.to_sql(
                TABLENAME, conn, schema="public", if_exists="replace", index=False
            )
        except Exception as e:
            print("An error occurred while writing to the database {}".format(e))
            raise
    finally:
        # always release the connection, including when a step above failed
        db.cleanup()

In [None]:
# Entry point: run the feature-engineering pipeline only when this code is
# executed as the main module, not when it is imported.
if __name__ == "__main__":
    main()