In [None]:
import sys
import os
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import sqlalchemy as sq
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc

import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

sys.path.append("../Shared/")
from DataService import DataService

# fetch data from db

In [None]:
# function to update logs
def updateLog(fileName: str, message: str) -> None:
    """Append ``message`` as a single line to the log file at ``fileName``.

    Logging is best-effort and never raises: when ``fileName`` is None nothing
    is written, and when the file cannot be written the message is printed to
    stdout instead, followed by a line explaining why the write failed.

    Args:
        fileName: Path of the log file (opened in append mode), or None.
        message: Text to record; a trailing newline is added.
    """
    if fileName is None:
        return

    try:
        with open(fileName, "a") as log:
            log.write(message + "\n")
    except Exception as e:
        # Fall back to stdout so the message is not lost, and surface the
        # reason for the failure instead of silently discarding it.
        print(message)
        print(f"[updateLog] could not write to {fileName!r}: {e}")

In [None]:
# Destination file for messages written through updateLog().
LOG_FILE = "/data/pull_moisture.log"

# Load variables from a .env file (if any) into the process environment, then
# read the PostgreSQL connection settings. Each value is None when unset.
load_dotenv()
PG_DB = os.environ.get("POSTGRES_DB")
PG_ADDR = os.environ.get("POSTGRES_ADDR")
PG_PORT = os.environ.get("POSTGRES_PORT")
PG_USER = os.environ.get("POSTGRES_USER")
PG_PW = os.environ.get("POSTGRES_PW")

In [None]:
# Stop immediately (and log why) if any connection setting is missing.
if any(
    setting is None
    for setting in (PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)
):
    updateLog(LOG_FILE, "Missing database credentials")
    raise ValueError("Environment variables are not set")

# connecting to database (the port is read from the environment as a string,
# hence the int() conversion)
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [None]:
# Read the whole aggregated soil-moisture table into a DataFrame.
query = sq.text("select * FROM public.agg_soil_moisture")
sm_df = pd.read_sql(sql=query, con=conn)

In [None]:
# Remove the index / cr_num / month / day columns from the soil-moisture data.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
sm_df = sm_df.drop(columns=["index", "cr_num", "month", "day"], errors="ignore")
sm_df.head()
# sm_df.duplicated().sum() # 0
# print(sm_df.shape) #117221

In [None]:
# sm_df = sm_df.drop_duplicates()

In [None]:
# (rows, columns) of the soil-moisture frame at this point in the notebook.
sm_df.shape

In [None]:
# Read the whole aggregated ergot-samples table into a DataFrame.
query = sq.text("select * FROM public.agg_ergot_samples")
ergot_df = pd.read_sql(sql=query, con=conn)

In [None]:
# (rows, columns) of the ergot table as read from the database.
ergot_df.shape

In [None]:
# Five random rows, as a quick sanity check of the loaded values.
ergot_df.sample(5)

In [None]:
# Class balance of has_ergot, the column later used as the prediction target.
ergot_df["has_ergot"].value_counts()

In [None]:
# Remove the severity / neighbour / percentage columns from the ergot data.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run without a KeyError on the second execution.
ergot_df = ergot_df.drop(
    columns=["sum_severity", "present_in_neighbor", "severity_in_neighbor", "percnt_true"],
    errors="ignore",
)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
ergot_df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row.
ergot_df = ergot_df.drop_duplicates()

In [None]:
# Five random rows after de-duplication.
ergot_df.sample(5)

In [None]:
# (rows, columns) after de-duplication.
ergot_df.shape

In [None]:
# pull weather data
# Unlike the two tables above, the weather aggregates are read from a CSV
# file; the path is relative to the notebook's working directory.
weather_df = pd.read_csv("data/aggregatedDly.csv")
weather_df.shape

In [None]:
# Display the weather frame as loaded.
weather_df

In [None]:
# Drop the leading column of the CSV (presumably the index column written
# when the file was saved - TODO confirm).
# The guard makes the cell safe to re-run: the unconditional in-place drop
# removed one more column on every execution, eventually deleting the
# "year" / "district" keys that the later cells rely on.
if weather_df.columns[0] not in ("year", "district"):
    weather_df = weather_df.drop(columns=weather_df.columns[0])

In [None]:
# Collect the distinct attribute names found in the weather columns.
# The first two columns are skipped. For every remaining column whose first
# "_"-separated token contains "mean", the text between the first and second
# ":" of the column name is recorded.
all_col = weather_df.columns.tolist()
uni_col = {
    column.split(":")[1]
    for column in all_col[2:]
    if "mean" in column.split("_")[0]
}

In [None]:
# Show the set of attribute names extracted from the weather column headers.
uni_col

In [None]:
# Build a condensed weather frame: the year/district keys plus one column per
# attribute in uni_col, holding the row-wise mean of every weather column
# whose name contains that attribute.
# NOTE(review): filter(like=...) is a substring match, so an attribute name
# that is contained in another attribute's column names would pull those
# columns in as well - confirm the names cannot overlap.
new_weather_df = weather_df[["year", "district"]].copy()
for col_name in uni_col:
    new_weather_df[col_name] = weather_df.filter(like=col_name).mean(axis=1)


In [None]:
# First rows of the full-resolution weather frame, for comparison.
weather_df.head()

In [None]:
# The condensed frame built above (keys plus one averaged column per attribute).
new_weather_df

In [None]:
# Attach the weather columns to every ergot row; the left join keeps ergot
# rows that have no weather match (their weather columns become NaN).
# NOTE(review): this joins the full weather_df, not the condensed
# new_weather_df built earlier - confirm which one is intended.
df = ergot_df.merge(weather_df, how="left", on=["year", "district"])

In [None]:
# Missing values per column after the left join with the weather data.
df.isna().sum()

In [None]:
# Fill the gaps in place. method="linear" interpolates between neighbouring
# rows treating them as equally spaced, and limit_direction="both" allows
# filling in both directions so leading and trailing NaNs are covered too.
# NOTE(review): neighbouring rows here are whatever order the merge produced,
# not an ordered series - confirm this is the intended imputation.
df.interpolate(method="linear", limit_direction="both", inplace=True)

In [None]:
# Attach the soil-moisture columns, again with a left join on year/district.
# NOTE(review): if sm_df holds several rows per (year, district), this join
# repeats each ergot row once per match - verify key uniqueness if the row
# count is expected to stay unchanged.
df = df.merge(sm_df, how="left", on=["year", "district"])

In [None]:
# Missing values per column after the left join with the soil-moisture data.
df.isna().sum()

In [None]:
# Same in-place linear interpolation as after the weather merge, filling in
# both directions so no leading/trailing NaNs remain in numeric columns.
df.interpolate(method="linear", limit_direction="both", inplace=True)

In [None]:
# (rows, columns) of the fully merged frame.
df.shape

In [None]:
# # joining tables
# # df = pd.merge(sm_df, ergot_df, how="inner", on=["year", "district"])
# df = pd.merge(sm_df, ergot_df)

In [None]:
# df.shape

In [None]:
# df = pd.merge(df, weather_df, on=["year", "district"])

In [None]:
# df.shape

In [None]:
# Column dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Display the merged frame.
df

In [None]:
# temp = pd.merge(ergot_df, weather_df)
# temp.columns

In [None]:
# g = sns.pairplot(temp.sample(1000))

In [None]:
# Pairwise correlations of the merged frame, drawn as an annotated heatmap.
correlation_matrix = df.corr()

# The large canvas keeps the per-cell annotations readable.
fig, ax = plt.subplots(figsize=(30, 25))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, ax=ax)
ax.set_title('Correlation Plot')
plt.show()

In [None]:
# Helper for spotting highly correlated (redundant) features.
# For every pair of columns whose absolute correlation exceeds the threshold,
# the later column of the pair (in column order) is reported.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Args:
        dataset: pandas DataFrame to analyse.
        threshold: Absolute correlation above which two columns count as
            correlated (strictly greater than).

    Returns:
        A set of column names. For each correlated pair, the column that
        appears later in the frame is included; the earlier one is not
        (unless it is itself correlated with a column before it).
    """
    # Only numeric/bool columns can be correlated. Selecting them explicitly
    # keeps the helper working on pandas >= 2.0, where DataFrame.corr()
    # raises on text columns instead of silently skipping them.
    numeric = dataset.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric.corr()
    columns = corr_matrix.columns

    return {
        columns[i]
        for i in range(len(columns))
        for j in range(i)  # strictly-lower triangle: each pair is visited once
        # absolute value, so strong negative correlations count as well;
        # NaN correlations compare False and are ignored
        if abs(corr_matrix.iloc[i, j]) > threshold
    }

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.8.
corr_features = correlation(df, 0.8)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated and discarded, so the count was never shown.
print(len(corr_features))
corr_features

In [None]:
# df.drop(columns=corr_features, inplace=True)

In [None]:
# using one-hot encoding to convert the categorical variable to numerical
# df = pd.get_dummies(df, columns=["province"], drop_first=True)

In [None]:
# The join keys are no longer needed once the tables are merged, so drop them
# before building the feature matrix. Re-assigning with errors="ignore"
# (instead of inplace=True) keeps the cell safe to re-run without a KeyError.
df = df.drop(columns=["year", "district"], errors="ignore")

In [None]:
# Columns remaining in the frame: has_ergot plus the candidate features.
df.columns

# split data

In [None]:
# Target is has_ergot; every other remaining column is a feature.
y = df['has_ergot']
X = df.drop(columns=["has_ergot"])
# y = df['has_ergot'].astype(int)

# Oversampler is prepared here but only used by the commented-out line below.
oversampler = RandomOverSampler(random_state=42)

# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# print(y_train.value_count())
# X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
# Feature names fed to the model (same order as the scaled arrays' columns).
X_train.columns

In [None]:
# use StandardScaler when we don't know upper bounds
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test data reach the model.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# (samples, features) of the scaled training array.
X_train_scaled.shape

In [None]:
# Run TensorFlow on CPU only: make no GPU visible, then expose every
# physical CPU device.
tensorflow.config.set_visible_devices([], 'GPU')
cpu_devices = tensorflow.config.list_physical_devices('CPU')
tensorflow.config.set_visible_devices(cpu_devices, 'CPU')


# creating model

In [None]:
# Small feed-forward binary classifier: one hidden ReLU layer of 15 units
# over all input features, followed by a single sigmoid output unit.
model = Sequential(
    [
        Dense(15, activation="relu", input_dim=X.shape[1]),
        Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Layer-by-layer overview of the network and its parameter counts.
model.summary()

In [None]:
# compile model
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as an additional metric during training.
model.compile(loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])

In [None]:
# Train for 50 epochs in batches of 32, holding out 20% of the training data
# for validation. The returned History object feeds the curves plotted below.
history = model.fit(X_train_scaled, y_train, epochs=50, validation_split=0.2, shuffle=True, batch_size=32)

In [None]:
# Raw weight tensors of the trained model.
model.weights

In [None]:
# Model outputs (sigmoid activations) for the scaled test features.
y_pred = model.predict(X_test_scaled)

In [None]:
# Turn the sigmoid outputs into boolean labels: a sample is predicted
# positive only when its score is strictly above the cut-off.
threshhold = 0.8
y_pred = y_pred > threshhold

In [None]:
# Test-set accuracy of the thresholded predictions.
accuracy_score(y_test, y_pred)

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Val Loss")
plt.legend()

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history.history["accuracy"], label="Training Accuracy")
plt.plot(history.history["val_accuracy"], label="Val Accuracy")
plt.legend()

In [None]:
# Per-class precision, recall and F1 for the thresholded predictions.
print(classification_report(y_test, y_pred))

In [None]:
# ROC/AUC must be computed from the continuous model scores. By this point
# y_pred has been replaced with thresholded True/False labels, which would
# give roc_curve a single operating point and an uninformative AUC, so the
# scores are taken from the model again.
y_score = model.predict(X_test_scaled).ravel()

fpr, tpr, t = roc_curve(y_test, y_score)
auc_ = auc(fpr, tpr)
plt.plot(fpr, tpr, marker=".")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve")
print(auc_)

In [None]:
# Tabulate the first layer's weights per input attribute.
attributes = X_train.columns.tolist()
weights = model.layers[0].get_weights()[0]
bias = model.layers[0].get_weights()[1]

# Pair each attribute with the matching row of the weight matrix (assumes row
# k of the layer's kernel belongs to input feature k). Building the frame via
# dict(zip(...)) avoids the previous `for x, y in ...` loop, whose loop
# variable overwrote the notebook-level target `y`.
result = pd.DataFrame(dict(zip(attributes, weights)))

# Largest weight per attribute across the hidden units.
print(result.max())

# One row per attribute, one column per hidden unit; placed last so the
# notebook actually displays it.
result.transpose()