In [59]:
!pip install pandas matplotlib numpy scikit-learn tensorflow keras


# Import libraries
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
import IPython
import IPython.display
import tensorflow as tf
from keras.models import Model
import keras.layers as kl
import keras.activations as ka
import gc
import warnings
warnings.filterwarnings('ignore')


# Intel specific imports
from openvino.inference_engine import IECore
from daal4py import linear_regression_training, linear_regression_prediction
import mkl
import oneapi
import vtune





ModuleNotFoundError: No module named 'openvino'

In [4]:
random_seed = 42
SKIP_TIMESTEPS = 20
FORECAST_WINDOW = 20
FORECAST_SHIFT = 10
CONV_WIDTH = 5
TARGET_LABELS = ["ph", "temperature", "disolved_oxg"]


In [5]:
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [6]:
base_path = "D:/Aquaponics dataset"
dataset_folder = "Cleaned pond dataset"
f_path = os.path.join(base_path, dataset_folder)
ponds = os.listdir(f_path)[:]
print(ponds)

['IoTpond1.csv', 'IoTPond2.csv', 'IoTPond3.csv', 'IoTPond4.csv']


In [34]:
import pandas as pd
import IPython

def load_correct_data(ponds, skip_timesteps):
    data = []
    date_times = []
    used_ponds = []
    unused_ponds = []

    # Loading data
    for pond in ponds:
        try:
            df = pd.read_csv(f_path + pond)
            df = df[::skip_timesteps]
            df["created_at"] = pd.to_datetime(df["created_at"], format="%Y-%m-%d %H:%M:%S")
            date_time1 = df.pop("created_at")
            df.drop(columns=['population', 'entry_id'], inplace=True)
            df.fillna(df.mean(), inplace=True)
            df.drop(columns=[col for col in df.columns if 'Unnamed' in col], inplace=True)

            data.append(df)
            date_times.append(date_time1)
            used_ponds.append(pond)
            IPython.display.clear_output()
        except Exception as e:
            print(f"ERROR at POND: {pond} - {e}")
            unused_ponds.append(pond)

    # Correcting data
    for df in data:
        df['temperature'] = df['temperature'].clip(lower=20)
        df['ph'] = df['ph'].clip(lower=5, upper=12)
        df['ammonia'] = df['ammonia'].clip(upper=10)
        df['nitrate'] = df['nitrate'].clip(upper=2000)
        IPython.display.clear_output()

    return data, date_times, used_ponds, unused_ponds


In [36]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def standardize_normalize(df, method='standardize'):
    df_cols = df.columns
    
    if method == 'standardize':
        scaler = StandardScaler()
    elif method == 'normalize':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Method must be either 'standardize' or 'normalize'")
    
    df_scaled = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled, columns=df_cols)
    
    return df_scaled, scaler

In [38]:
def compute_metrics(df):
    """
    Compute mean, standard deviation, maximum, and minimum for each column in the DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    tuple: A tuple containing four DataFrames (mean, std, max, min).
    """
    df_mean = df.mean()
    df_std = df.std()
    df_max = df.max()
    df_min = df.min()
    
    return df_mean, df_std, df_max, df_min

In [40]:

def destandardize_denormalize(df, transformations):
    """
    Apply inverse transformations to de-standardize and de-normalize the DataFrame.

    Parameters:
    df (pd.DataFrame): The standardized and normalized DataFrame.
    transformations (tuple): A tuple containing the normalizer and standardizer.

    Returns:
    pd.DataFrame: The original DataFrame after inverse transformations.
    """
    normalizer, standardizer = transformations
    df_cols = df.columns
    
    # Apply inverse transformations in the correct order
    df = standardizer.inverse_transform(df)
    df = normalizer.inverse_transform(df)
    
    # Convert back to DataFrame
    df = pd.DataFrame(df, columns=df_cols)

    return df

In [42]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_feature(feature_idx, data, date_times, used_ponds):
    """
    Visualize a specific feature across multiple ponds with percentile thresholds.

    Parameters:
    feature_idx (int): Index of the feature to visualize.
    data (list of pd.DataFrame): List of DataFrames containing pond data.
    date_times (list of pd.Series): List of date-time Series corresponding to each pond.
    used_ponds (list of str): List of pond names.

    Returns:
    None
    """
    num_ponds = len(data)
    nrows = (num_ponds + 1) // 2
    fig, axs = plt.subplots(nrows=nrows, ncols=2, figsize=(16, 5 * nrows))

    for i, df in enumerate(data):
        row_idx = i // 2
        col_idx = i % 2

        test_feature = df.columns[feature_idx]
        thresholds = [np.percentile(df[test_feature], p) for p in [20, 50, 75, 90]]
        colors = ["black", "green", "yellow", "red"]

        axs[row_idx, col_idx].plot(date_times[i], df[test_feature])
        for threshold, color in zip(thresholds, colors):
            axs[row_idx, col_idx].axhline(threshold, color=color)
        axs[row_idx, col_idx].set_title(f"{used_ponds[i]} [{thresholds[0]:.2f} - {thresholds[3]:.2f}]")

    fig.suptitle(test_feature)
    plt.tight_layout()
    plt.show()

In [44]:
def visualize_df(df_idx, data, date_times):
    df = data[df_idx]
    date_time = date_times[df_idx]
    plot_cols = df.columns

    colors = ['blue', 'red', 'green', 'yellow', 'purple', 'orange', 'cyan', 'magenta']

    plot_features = df[plot_cols]
    plot_features.index = date_time

    fig, axs = plt.subplots(ncols=2, nrows=4, figsize=(15, 10))
    fig.subplots_adjust(wspace=0.1, hspace=0.2)
    for i, col in enumerate(plot_features):
        row_idx = i // 2
        col_idx = i % 2
        axs[row_idx, col_idx].plot(plot_features[col], color=colors[i], label=col)
        axs[row_idx, col_idx].tick_params(axis='x', labelsize=7)
        axs[row_idx, col_idx].legend()

    plt.show()

In [46]:
def plot_history(history):
    """
    Plot the training and evaluation loss history.

    Parameters:
    history (dict): Dictionary containing 'loss' and 'val_loss' keys with their respective values.

    Returns:
    None
    """
    if 'loss' not in history or 'val_loss' not in history:
        raise ValueError("The history dictionary must contain 'loss' and 'val_loss' keys.")
    
    plt.figure(figsize=(10, 6))
    plt.plot(history['loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Evaluation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title("Training and Evaluation Loss Over Epochs")
    plt.grid(True)
    plt.show()

In [50]:
def split_data(data, train_ratio=0.8, val_ratio=0.9):
    """
    Split data into training, validation, and test sets.

    Parameters:
    data (list of pd.DataFrame): List of DataFrames to be split.
    train_ratio (float): Proportion of data to be used for training.
    val_ratio (float): Proportion of data to be used for validation.

    Returns:
    tuple: Three lists containing the training, validation, and test DataFrames.
    """
    if not (0 < train_ratio < 1) or not (0 < val_ratio < 1):
        raise ValueError("train_ratio and val_ratio must be between 0 and 1.")
    if train_ratio >= val_ratio:
        raise ValueError("train_ratio must be less than val_ratio.")

    train_data = []
    val_data = []
    test_data = []
    
    for df in data:
        n = len(df)
        train_end = int(n * train_ratio)
        val_end = int(n * val_ratio)
        
        train_data.append(df[:train_end])
        val_data.append(df[train_end:val_end])
        test_data.append(df[val_end:])
    
    return train_data, val_data, test_data


In [61]:
#from TensorFlow
class WindowGenerator():
    def __init__(self, input_width, label_width, shift,
                 train_data, val_data, test_data,
                 label_columns=None):
        # Store the raw data.
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        self.column_indices = {name: i for i, name in enumerate(train_data[0].columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes manually.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def plot(self, model=None, plot_col=None, max_subplots=3):
        if plot_col is None:
            raise ValueError("plot_col must be specified.")
        inputs, labels = self.example
        plt.figure(figsize=(12, 8))
        plot_col_index = self.column_indices[plot_col]
        max_n = min(max_subplots, len(inputs))
        for n in range(max_n):
            plt.subplot(max_n, 1, n+1)
            plt.ylabel(f'{plot_col} [normed]')
            plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                     label='Inputs', marker='.', zorder=-10)

            if self.label_columns:
                label_col_index = self.label_columns_indices.get(plot_col, None)
            else:
                label_col_index = plot_col_index

            if label_col_index is None:
                continue

            if model is not None:
                predictions = model(inputs)
                plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                            marker='X', edgecolors='k', label='Predictions',
                            c='#ff7f0e', s=64)
            plt.scatter(self.label_indices, labels[n, :, label_col_index],
                        edgecolors='k', label='Labels', c='#2ca02c', s=64)

            if n == 0:
                plt.legend()

        plt.xlabel(plot_col)
        plt.show()

    def make_dataset(self, data):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=32)

        ds = ds.map(self.split_window)

        return ds

    @property
    def train(self):
        full_dataset = self.make_dataset(self.train_data[0])
        for i in range(1, len(self.train_data)):
            full_dataset = full_dataset.concatenate(self.make_dataset(self.train_data[i]))

        return full_dataset

    @property
    def test(self):
        """
        Create and return the test dataset.
        """
        full_dataset = self.make_dataset(self.test_data[0])
        for i in range(1, len(self.test_data)):
            full_dataset = full_dataset.concatenate(self.make_dataset(self.test_data[i]))

        return full_dataset

    @property
    def example(self):
        """
        Get and cache an example batch of `inputs, labels` for plotting.
        """
        result = getattr(self, '_example', None)
        if result is None:
            # No example batch was found, so get one from the `.train` dataset
            result = next(iter(self.train))
      # And cache it for next time
            self._example = result
        return result

In [75]:
data, date_times, used_ponds, unused_ponds = load_correct_data()

TypeError: load_correct_data() missing 2 required positional arguments: 'ponds' and 'skip_timesteps'

In [73]:
print(used_ponds)

[]


In [None]:
data[2].head()

In [None]:
#normalizing and standarizing the data based on whole dataset
full_data = pd.concat(data)
full_data.describe().transpose()

In [None]:
full_data, normalizer, standarizer = standarize_normalize(full_data)
full_data.describe().transpose()

In [None]:
for i in range(len(data)):
    df_cols = data[i].columns
    data[i] = normalizer.transform(data[i])
    data[i] = standarizer.transform(data[i])

    data[i] = pd.DataFrame(data[i], columns=df_cols)

In [None]:
data[0].head()

In [None]:
#visualizing a dataframe
visualize_df(3, data, date_times)

In [None]:
#splitting the dataset into train, val, test sets
train_data, val_data, test_data = split_data(data)

In [None]:
column_indices = {name: i for i, name in enumerate(data[0].columns)}
column_indices

In [None]:
del data
del date_times
gc.collect()