In [14]:
import os
import sys
from dotenv import load_dotenv

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score

import numpy as np
import pandas as pd
import sqlalchemy as sq

sys.path.append("../")
from Shared.DataService import DataService


In [2]:
# function to update logs
def updateLog(fileName: str, message: str) -> None:
    """Append ``message`` as one line to the log file at ``fileName``.

    Logging is best-effort: if the file cannot be opened or written, the
    message is printed to stdout instead of raising.

    Args:
        fileName: Path of the log file to append to. When ``None``, the call
            does nothing.
        message: Text to record; a trailing newline is added.
    """
    # No log target configured: nothing to do.
    if fileName is None:
        return

    try:
        with open(fileName, "a") as log:
            log.write(message + "\n")
    except Exception:
        # Fall back to stdout so the message is not lost.
        print(message)

# Log file that updateLog appends to in this cell.
LOG_FILE = "/data/pull_moisture.log"

# Read the Postgres connection settings from the environment
# (load_dotenv is called first; presumably it loads a local .env file).
load_dotenv()
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")

# Fail fast if any setting is absent: log the problem, then abort the cell.
if any(setting is None for setting in (PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)):
    updateLog(LOG_FILE, "Missing database credentials")
    raise ValueError("Environment variables are not set")

# connecting to database (the port arrives as a string and is converted to int)
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [3]:
# Registry mapping a short model name to the estimator class that
# model_initializer instantiates for it. The two SVM entries are disabled.
model_dict = dict(
    logistic_regression=LogisticRegression,
    random_forest=RandomForestClassifier,
    decision_tree=DecisionTreeClassifier,
    gradient_boost=GradientBoostingClassifier,
    # svc=SVC,
    # linear_svc=LinearSVC,
)

def model_initializer(model_type:str, random_state:int=42 , max_depth:int=2):
    """Build an unfitted estimator for the given model name.

    Args:
        model_type: Key into ``model_dict`` selecting the estimator class.
        random_state: Seed forwarded to the estimator's constructor.
        max_depth: Depth limit; forwarded only when ``model_type`` is
            ``'random_forest'`` and ignored for every other model.

    Returns:
        A new instance of the class registered under ``model_type``.

    Raises:
        KeyError: If ``model_type`` is not a key of ``model_dict``.
    """
    # If a network model is added later it will need a different initialisation
    # path (passing X_train, X_test, y_test, ...); none exist yet.
    init_kwargs = {"random_state": random_state}
    if model_type == 'random_forest':
        init_kwargs["max_depth"] = max_depth
    return model_dict[model_type](**init_kwargs)

# def evaluate_model(dataset, arg, model_type):
#     f1_score = 0

## Pulling data (currently only ergot and soil moisture)

In [4]:
# pulling ergot data
query = sq.text("select * FROM public.agg_ergot_samples")

# Drop the per-sample id, remove exact duplicate rows, then keep the four
# columns used downstream.
# NOTE(review): de-duplication runs before the column selection, so rows that
# differ only in columns discarded afterwards would remain as duplicates -
# confirm against the table schema.
ergot_df = (
    pd.read_sql(query, conn)
    .drop(columns=["sample_id"])
    .drop_duplicates()
    .loc[:, ["year", "province", "district", "incidence"]]
)
ergot_df

Unnamed: 0,year,province,district,incidence
0,1995,AB,4810,False
48,1995,AB,4820,False
190,1995,AB,4830,False
230,1995,AB,4840,False
280,1995,AB,4840,True
...,...,...,...,...
158540,2022,AB,4830,True
158636,2022,SK,4771,True
158655,2022,MB,4603,True
158660,2022,SK,4740,True


In [5]:
# pulling soil moisture data
query = sq.text("select * FROM public.agg_soil_moisture")

# Discard the stored index and the month/day columns, then remove rows that
# became exact duplicates.
sm_df = (
    pd.read_sql(query, conn)
    .drop(columns=["index", "month", "day"])
    .drop_duplicates()
)
sm_df

Unnamed: 0,year,cr_num,district,soil_moisture_min,soil_moisture_max,soil_moisture_mean
0,1978,0,4612,0.165900,0.418710,0.237053
1,1978,5,4740,0.127140,0.207248,0.163722
2,1978,5,4741,0.153398,0.215304,0.170989
3,1978,7,4770,0.140772,0.186390,0.152211
4,1978,7,4771,0.116810,0.116810,0.116810
...,...,...,...,...,...,...
117216,2017,9,4840,0.191339,0.271052,0.220148
117217,2017,9,4840,0.251481,0.251481,0.251481
117218,2017,9,4840,0.193165,0.267507,0.223290
117219,2017,9,4840,0.191507,0.243024,0.208214


In [6]:
# Join soil-moisture rows to ergot rows that share the same year and district.
# NOTE(review): the displayed frames show several rows per (year, district) on
# both sides, so this inner join is many-to-many and repeats feature rows -
# confirm that this is intended before trusting cross-validation scores.
df = pd.merge(sm_df, ergot_df, how="inner", on=["year", "district"])

features = ['soil_moisture_min', 'soil_moisture_max', 'soil_moisture_mean']

# to_numpy() (rather than np.array(...).squeeze()) keeps X two-dimensional
# (n_samples, n_features) and y one-dimensional (n_samples,) even when the
# merge yields a single row; squeeze() would collapse that case to a 1-D X and
# a 0-d y, which estimators reject.
X = df[features].to_numpy()
y = df['incidence'].to_numpy()
print('X data: ', X)
print('X shape: {} \n'.format(X.shape))
print('y data: ', y)
print('y shape: ', y.shape)


X data:  [[0.15443027 0.15443027 0.15443027]
 [0.16073954 0.20681489 0.18377721]
 [0.16988319 0.23791865 0.21258988]
 ...
 [0.22753887 0.26947325 0.24850606]
 [0.22753887 0.26947325 0.24850606]
 [0.22753887 0.26947325 0.24850606]]
X shape: (353329, 3) 

y data:  [False False False ...  True  True  True]
y shape:  (353329,)


In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)

## K-Fold Cross Validation

In [18]:
# Evaluate every registered model with plain 5-fold cross-validation.
kf = KFold(n_splits=5) # we need to modify it to make sure the outliers dont fall into 1 bin

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f'Fold {i}: ')
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for model_type in model_dict:
        # y is already one-dimensional, so it is passed to fit() as-is.
        model = model_initializer(model_type).fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # score() on a scikit-learn classifier is mean accuracy, not R^2,
        # so the printed label says "accuracy".
        accuracy = model.score(X_test, y_test)
        f1 = f1_score(y_test, y_pred)
        print(f'model type: {model_type}, and its accuracy is : {accuracy} and f1_score is: {f1}')
    print(f'End the fold {i}\n')


Fold 0: 
model type: logistic_regression, and its r^2 score is : 0.6834970141227747 and f1_score is: 0.8119967049409075
model type: random_forest, and its r^2 score is : 0.6834970141227747 and f1_score is: 0.8119967049409075
model type: decision_tree, and its r^2 score is : 0.6668694987688563 and f1_score is: 0.7941878458834946
model type: gradient_boost, and its r^2 score is : 0.6835253162765687 and f1_score is: 0.8120103560741064
End the fold 0

Fold 1: 
model type: logistic_regression, and its r^2 score is : 0.8544278719610562 and f1_score is: 0.9214906623724519
model type: random_forest, and its r^2 score is : 0.8548382531910679 and f1_score is: 0.9217388650685873
model type: decision_tree, and its r^2 score is : 0.6842045679676223 and f1_score is: 0.8043794596679466
model type: gradient_boost, and its r^2 score is : 0.8535788073472391 and f1_score is: 0.9209507001902317
End the fold 1

Fold 2: 
model type: logistic_regression, and its r^2 score is : 0.8394277304502873 and f1_score

## Leave one out cross validation

## Stratified KFold

K-fold cross-validation does not preserve the class distribution of the target variable when it splits the data. For example, given ten samples of which five have incidence = True and five have incidence = False, plain KFold may place all the positive samples (incidence = True) in one fold and all the negative samples in another. Stratified K-fold avoids this by preserving the class proportions of the original dataset in each fold's training and test splits.

In [17]:
# Evaluate every registered model with stratified 5-fold cross-validation;
# the split is driven by y as well as X.
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for model_type in model_dict:
        # y is already one-dimensional, so it is passed to fit() as-is.
        model = model_initializer(model_type).fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # score() on a scikit-learn classifier is mean accuracy, not R^2,
        # so the printed label says "accuracy".
        accuracy = model.score(X_test, y_test)
        f1 = f1_score(y_test, y_pred)
        print(f'   model type: {model_type}, and its accuracy is : {accuracy} and f1_score is: {f1}')
    print(f'End the fold {i} \n')


Fold 0:
  Train: index=[ 37686  37687  37688 ... 353326 353327 353328]
  Test:  index=[    0     1     2 ... 82284 82285 82286]
model type: logistic_regression, and its r^2 score is : 0.8201398126397419 and f1_score is: 0.9011833123415901
model type: random_forest, and its r^2 score is : 0.8201398126397419 and f1_score is: 0.9011833123415901
model type: decision_tree, and its r^2 score is : 0.2918659609996321 and f1_score is: 0.3847392817183677
model type: gradient_boost, and its r^2 score is : 0.8196303738714517 and f1_score is: 0.9008216875719754
End the fold 0 

Fold 1:
  Train: index=[     0      1      2 ... 353326 353327 353328]
  Test:  index=[ 37686  37687  37688 ... 149073 149074 149075]
model type: logistic_regression, and its r^2 score is : 0.8198426400249059 and f1_score is: 0.9010038802186608
model type: random_forest, and its r^2 score is : 0.8201539637166388 and f1_score is: 0.9011918552669429
model type: decision_tree, and its r^2 score is : 0.08949141029632356 and f1_s