In [None]:
import sys
import os
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import sqlalchemy as sq
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

sys.path.append("../Shared/")
from DataService import DataService

# fetch data from db

In [None]:
# function to update logs
def updateLog(fileName: str, message: str) -> None:
    """Append ``message`` as a single line to the log file ``fileName``.

    Logging is best-effort: a logging failure never propagates to the caller.
    If no file name is given, or the file cannot be opened or written, the
    message is printed to standard output instead so that it is not lost.

    Args:
        fileName: Path of the log file to append to. ``None`` means there is
            no log file; the message is then printed.
        message: Text to record. A trailing newline is appended.
    """
    # Without a target file the only place left for the message is stdout.
    if fileName is None:
        print(message)
        return

    try:
        with open(fileName, "a") as log:
            log.write(message + "\n")
    except Exception:
        # Same fallback as for a missing file name: keep the message visible.
        print(message)

In [None]:
# File that updateLog appends its messages to.
LOG_FILE = "/data/pull_moisture.log"

# Pull the PostgreSQL connection settings from the environment.
# load_dotenv() runs first so that values from a local .env file are visible.
# Any setting that is not defined comes back as None.
load_dotenv()
PG_USER, PG_PW, PG_DB, PG_ADDR, PG_PORT = (
    os.environ.get(env_name)
    for env_name in (
        "POSTGRES_USER",
        "POSTGRES_PW",
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
    )
)

In [None]:
# Fail fast, naming every missing setting, before attempting to connect.
# Only the variable names are reported, never their values, so the password
# cannot end up in the log file or in the traceback.
required_settings = {
    "POSTGRES_DB": PG_DB,
    "POSTGRES_ADDR": PG_ADDR,
    "POSTGRES_PORT": PG_PORT,
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PW": PG_PW,
}
missing_settings = [
    name for name, value in required_settings.items() if value is None
]
if missing_settings:
    missing_list = ", ".join(missing_settings)
    updateLog(LOG_FILE, "Missing database credentials: " + missing_list)
    raise ValueError("Environment variables are not set: " + missing_list)

# connecting to database
# The port is read from the environment as text, so it is converted to int.
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [None]:
# Load the whole aggregated soil moisture table into a DataFrame.
query = sq.text("select * FROM public.agg_soil_moisture")
sm_df = pd.read_sql(sql=query, con=conn)

In [None]:
# Preview the first rows of the soil moisture frame to check the query result.
sm_df.head()

In [None]:
# Load the whole aggregated ergot samples table into a DataFrame.
query = sq.text("select * FROM public.agg_ergot_samples")
ergot_df = pd.read_sql(sql=query, con=conn)

In [None]:
# (rows, columns) of the ergot samples frame.
ergot_df.shape

In [None]:
# Remove the sample_id column.
# Rebinding the result (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: an in-place drop raises
# KeyError on a second run because the column is already gone.
ergot_df = ergot_df.drop(columns=["sample_id"], errors="ignore")

In [None]:
# Keep only the first occurrence of each fully identical row.
ergot_df = ergot_df.drop_duplicates()

In [None]:
# Display the ergot frame (by default pandas truncates long frames in its repr).
ergot_df

In [None]:
# pull weather data

In [None]:
# Join soil moisture with the ergot samples.
# No join keys are passed, so pandas performs an inner join on every column
# name the two frames have in common.
# NOTE(review): a disabled variant of this line used explicit keys,
# pd.merge(sm_df, ergot_df, how="inner", on=["year", "district"]);
# confirm which key set is intended.
df = sm_df.merge(ergot_df, how="inner")

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Discard repeated rows from the merged frame, keeping the first occurrence.
df = df.drop_duplicates()

In [None]:
# (rows, columns) of the merged frame, for comparison with the earlier shape.
df.shape

In [None]:
# Column names, dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# One-hot encode the categorical "province" column into numerical indicator
# columns. drop_first=True omits the first category, which the remaining
# indicators already imply.
# The guard keeps this cell re-runnable: after the first run "province" no
# longer exists, and encoding it again would raise a KeyError.
if "province" in df.columns:
    df = pd.get_dummies(df, columns=["province"], drop_first=True)

In [None]:
# Display the frame (by default pandas truncates long frames in its repr).
df

# split data

In [None]:
# Target is the has_ergot column; every other column is used as a feature.
y = df["has_ergot"]
X = df.drop(columns="has_ergot")

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# StandardScaler is used because the upper bounds of the features are unknown.
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Optional: uncomment the two lines below to hide the GPU and run TensorFlow on the CPU only.
# tensorflow.config.set_visible_devices([], 'GPU')  # Hide GPU devices
# tensorflow.config.set_visible_devices(tensorflow.config.list_physical_devices('CPU'), 'CPU')  # Show CPU devices


# creating model

In [None]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable from run to run.
keras.utils.set_random_seed(1)

# Take the input width from the scaled training matrix instead of hard-coding
# it: the number of feature columns depends on how many indicator columns the
# one-hot encoding produced.
n_features = X_train_scaled.shape[1]

# One hidden layer of 14 ReLU units feeding a single sigmoid output unit.
model = Sequential(
    [
        keras.Input(shape=(n_features,)),
        Dense(14, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Print the layer-by-layer summary of the model (output shapes, parameter counts).
model.summary()

In [None]:
# Configure training: binary cross-entropy loss, the Adam optimizer, and
# accuracy reported as the tracked metric.
model.compile(
    optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 10 epochs; 20% of the training data is held back for validation.
# The returned History object keeps the per-epoch metrics.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    validation_split=0.2,
    epochs=10,
)

In [None]:
# Run the trained model on the scaled test features.
y_pred = model.predict(X_test_scaled)

In [None]:
# Turn the predicted scores into class labels: True above 0.5, False otherwise.
y_pred = y_pred > 0.5

In [None]:
# Fraction of test rows whose thresholded prediction matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Training and validation loss for each epoch, with a title and axis labels
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history["loss"], label="Training Loss")
ax.plot(history.history["val_loss"], label="Val Loss")
ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
# The trailing semicolon stops the notebook from echoing the Legend repr.
ax.legend();

In [None]:
# Training and validation accuracy for each epoch, with a title and axis
# labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history["accuracy"], label="Training Accuracy")
ax.plot(history.history["val_accuracy"], label="Val Accuracy")
ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
# The trailing semicolon stops the notebook from echoing the Legend repr.
ax.legend();