In [None]:
import os
import sys
from dotenv import load_dotenv

# sklearn
from sklearn.linear_model import LogisticRegression                                 # type: ignore
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier     # type: ignore
from sklearn.tree import DecisionTreeClassifier                                     # type: ignore                            
from sklearn.svm import SVC, LinearSVC                                              # type: ignore
from sklearn.model_selection import KFold, StratifiedKFold                          # type: ignore
from sklearn.metrics import f1_score                                                # type: ignore

import numpy as np
import pandas as pd
import sqlalchemy as sq

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# helper for appending to the run log
def updateLog(fileName: str, message: str) -> None:
    """Append ``message`` plus a trailing newline to the file ``fileName``.

    Args:
        fileName: Path of the log file, opened in append mode. When it is
            ``None`` the call does nothing at all.
        message: Text to record.

    Returns:
        None. Logging is best-effort: if anything goes wrong while opening
        or writing the file, the message is printed to stdout instead and
        no exception is propagated.
    """
    # Nothing to do without a destination (and nothing is printed either).
    if fileName is None:
        return

    try:
        with open(fileName, "a") as log:
            log.write(message + "\n")
    except Exception:
        # Fall back to stdout so the message is not lost.
        print(message)


# Destination of the messages written through updateLog().
LOG_FILE = "/data/pull_moisture.log"

# Database settings are read from environment variables after load_dotenv()
# has run; os.getenv() yields None for any variable that is not set.
load_dotenv()
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")

# Fail fast (after logging) when any of the five settings is absent.
if PG_USER is None or PG_PW is None or PG_DB is None or PG_ADDR is None or PG_PORT is None:
    updateLog(LOG_FILE, "Missing database credentials")
    raise ValueError("Environment variables are not set")

# All settings are present: open the database connection.
# The port arrives as a string and is converted to int for DataService.
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [None]:
# Registry mapping a short model name to the estimator class it stands for.
# model_initializer() looks classes up here, and the cross-validation loops
# iterate over its keys. The two SVM entries are currently disabled.
model_dict = dict(
    logistic_regression=LogisticRegression,
    random_forest=RandomForestClassifier,
    decision_tree=DecisionTreeClassifier,
    gradient_boost=GradientBoostingClassifier,
    # svc=SVC,
    # linear_svc=LinearSVC,
)


def model_initializer(model_type: str, random_state: int = 42, max_depth: int = 2):
    """Create a fresh, unfitted estimator for the given model name.

    Args:
        model_type: Key into ``model_dict`` selecting the estimator class.
        random_state: Seed forwarded to every estimator's constructor.
        max_depth: Forwarded to the constructor only when ``model_type`` is
            ``"random_forest"``; ignored for every other model.

    Returns:
        A new instance of the class registered under ``model_type``.

    Raises:
        KeyError: If ``model_type`` is not a key of ``model_dict``.
    """
    # NOTE: neural-network models would need a different construction path
    # (passing X_train, X_test, y_test, ...); none are registered so far.
    init_kwargs = {"random_state": random_state}
    if model_type == "random_forest":
        init_kwargs["max_depth"] = max_depth
    return model_dict[model_type](**init_kwargs)


# def evaluate_model(dataset, arg, model_type):
#     f1_score = 0

## Pulling data (only ergot and soil moisture for now)

In [None]:
# Load the aggregated ergot samples.
# Steps, in order: discard the sample identifier, remove rows that are then
# exact duplicates, and finally keep only the four columns used downstream.
query = sq.text("select * FROM public.agg_ergot_samples")
ergot_df = (
    pd.read_sql(sql=query, con=conn)
    .drop(columns="sample_id")
    .drop_duplicates()
    .loc[:, ["year", "province", "district", "incidence"]]
)
ergot_df

In [None]:
# Load the aggregated soil-moisture table.
# The row index and the month/day columns are removed first, so rows that
# differ only in those columns collapse into one when duplicates are dropped.
query = sq.text("select * FROM public.agg_soil_moisture")
sm_df = (
    pd.read_sql(sql=query, con=conn)
    .drop(labels=["index", "month", "day"], axis="columns")
    .drop_duplicates()
)
sm_df

In [None]:
# Join soil moisture with ergot incidence; the inner join keeps only the
# (year, district) pairs that appear in both tables.
df = pd.merge(sm_df, ergot_df, how="inner", on=["year", "district"])

# Columns used as model inputs.
features = ["soil_moisture_min", "soil_moisture_max", "soil_moisture_mean"]

# Convert to NumPy without .squeeze(): squeeze() drops every length-1 axis,
# so a single merged row would turn X into shape (3,) and y into a 0-d array.
# to_numpy() always yields X of shape (n_rows, 3) and y of shape (n_rows,).
X = df[features].to_numpy()
y = df["incidence"].to_numpy()
print("X data: ", X)
print("X shape: {} \n".format(X.shape))
print("y data: ", y)
print("y shape: ", y.shape)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)

## K-Fold Cross Validation

In [None]:
# Plain 5-fold cross-validation over every model registered in model_dict.
# TODO: the splitter still needs changing so that outliers do not all end up
# in a single fold.
kf = KFold(n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}: ")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for model_type in model_dict:
        # A fresh estimator is built and fitted for every fold.
        model = model_initializer(model_type).fit(X_train, y_train.squeeze())
        y_pred = model.predict(X_test)
        # For classifiers, .score() returns mean accuracy (not R^2), so the
        # printed label now says "accuracy".
        accuracy = model.score(X_test, y_test)
        # NOTE(review): f1_score is called with its default averaging, which
        # assumes `incidence` is a binary target -- confirm.
        f1 = f1_score(y_test, y_pred)
        print(
            f"model type: {model_type}, and its accuracy is : {accuracy} and f1_score is: {f1}"
        )
    print(f"End the fold {i}\n")

## Leave one out cross validation

## Stratified KFold

Plain KFold validation does not preserve the class balance of the output variable when it splits the data. For example, if we have ten samples where 5 of them have incidence = True and 5 of them have incidence = False, KFold can end up putting all the positive samples (incidence = True) in one bin and all the negative samples in another. To avoid that, we can use Stratified KFold, which preserves the class proportions of the original dataset in every training and test split.

In [None]:
# Stratified 5-fold cross-validation over every model in model_dict.
# Splits are generated from both X and y; shuffling is off, so the folds are
# the same on every run.
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for model_type in model_dict:
        # A fresh estimator is built and fitted for every fold.
        model = model_initializer(model_type).fit(X_train, y_train.squeeze())
        y_pred = model.predict(X_test)
        # For classifiers, .score() returns mean accuracy (not R^2), so the
        # printed label now says "accuracy".
        accuracy = model.score(X_test, y_test)
        # NOTE(review): f1_score is called with its default averaging, which
        # assumes `incidence` is a binary target -- confirm.
        f1 = f1_score(y_test, y_pred)
        print(
            f"   model type: {model_type}, and its accuracy is : {accuracy} and f1_score is: {f1}"
        )
    print(f"End the fold {i} \n")