# Pre-trained CNN for behaviour detection

## Setup and constants

In [1]:
import tensorflow as tf

print(tf.__version__)

# Device selection: "cpu" (case-insensitive) hides every GPU from TensorFlow;
# any other value requires at least one visible GPU.
training_device: str = "cpu"

if training_device.lower() == "cpu":
    # An empty device list makes all GPUs invisible to this process.
    tf.config.set_visible_devices(
        devices=[],
        device_type="GPU",
    )
else:
    physical_devices = tf.config.list_physical_devices("GPU")
    assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
    # Enable memory growth on the first GPU only. The call returns None, so
    # there is no result worth keeping.
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

2023-12-08 01:07:09.278104: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-08 01:07:09.304528: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-08 01:07:09.304571: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-08 01:07:09.304602: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-08 01:07:09.310360: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-08 01:07:09.310902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

2.14.0


In [2]:
# Folder containing the input data files. Paths are built with plain string
# formatting, so the trailing slash is required.
INPUT_PATH: str = "/home/python/host/data/"

# Folder holding the helper scripts (`custom_model.py`, `inference.py`,
# `utils.py`) and the saved model used in Google Colab mode.
SCRIPT_PATH: str = (
    "drive/MyDrive/Ellinbank/video_observation/custom_train_test_loop/"
    "data-labelling_conv1d-seq2seq-model/"
)

# Base folder under which the `out` directory is created.
OUTPUT_PATH: str = "./"

## Data pre-processing

### Load the Dataset

In [3]:
import sqlite3

import pandas as pd


def load_data_sqlite(database: str) -> pd.DataFrame:
    """
    Read a sqlite file and return the labelled data as a pandas DataFrame.

    The table is read one `serial_number` at a time and the per-serial frames
    are concatenated, so rows of the same serial number end up adjacent.

    Parameters
    ----------
        database: str
            The path to the `.db` sqlite file.

    Returns
    -------
    pandas.DataFrame
        Every row of the `LebeledDataTable` table whose `serial_number` is not
        NULL. An empty DataFrame (with the table's columns) is returned when
        the table holds no rows.
    """
    # NOTE(review): the table name looks like a misspelling of "Labeled", but
    # it has to match the database schema -- confirm before renaming it.
    TABLE_NAME: str = "LebeledDataTable"

    connection: sqlite3.Connection = sqlite3.connect(database=database)
    try:
        serial_query: str = "select distinct serial_number from {}".format(TABLE_NAME)
        serial_numbers: list = [row[0] for row in connection.execute(serial_query)]

        # The serial number is passed as a bound parameter so values containing
        # quotes cannot break (or inject into) the statement.
        rows_query: str = "select * from {} where serial_number = ?".format(TABLE_NAME)
        dataframes: list = [
            pd.read_sql_query(
                sql=rows_query,
                con=connection,
                params=(serial_number,),
            )
            for serial_number in serial_numbers
        ]

        if not dataframes:
            # `pd.concat` raises on an empty list; return an empty frame that
            # still carries the table's column names instead.
            return pd.read_sql_query(
                sql="select * from {} limit 0".format(TABLE_NAME),
                con=connection,
            )
    finally:
        # Always release the database handle, even when a query fails.
        connection.close()

    return pd.concat(dataframes)

In [4]:
import os

import shutil

# Ask the user where the notebook runs and which kind of data file to use.
notebook_mode: int = int(
    input(
        """
    Select notebook mode: 
    1. Google Colab  2. Local
    """
    )
)
data_source: int = int(
    input(
        """
    Select data file type:
    1. SQLite   2. CSV
    """
    )
)

# Fail fast on an unsupported choice instead of hitting an undefined
# `full_data` or an unloaded model in a later cell.
if notebook_mode not in (1, 2):
    raise ValueError(
        "notebook_mode must be 1 (Google Colab) or 2 (Local), got {}".format(
            notebook_mode
        )
    )
if data_source not in (1, 2):
    raise ValueError(
        "data_source must be 1 (SQLite) or 2 (CSV), got {}".format(data_source)
    )

if notebook_mode == 1:
    # Copy the helper modules next to the notebook so they can be imported.
    # A failed copy is reported but does not abort the cell (best effort).
    for script_name in ("custom_model.py", "inference.py", "utils.py"):
        try:
            shutil.copy("{}{}".format(SCRIPT_PATH, script_name), ".")
        except OSError as error:
            print("Could not copy {}: {}".format(script_name, error))

# NOTE: loading from SQLite is not wired up; both data source choices read the
# first 1000 rows of the same CSV file.
full_data: pd.DataFrame = pd.read_csv(
    filepath_or_buffer="{}MOS2E03230475_30Hz_proc.csv".format(INPUT_PATH),
    nrows=1000,
)

### Check for duplicated rows and null values

In [5]:
# Flag rows that are exact repeats of an earlier row and keep only the flagged
# entries so that their number can be reported.
duplicated_rows: pd.Series = full_data.duplicated()
duplicated_rows = duplicated_rows[duplicated_rows]
# Report the count itself rather than the shape tuple (which printed "(0,)").
print("{} rows duplicated".format(len(duplicated_rows)))

full_data = full_data.drop_duplicates()

# Number of missing values per column. `isnull` is an alias of `isna`, so a
# single report is enough.
print(full_data.isna().sum())

(0,) rows duplicated
serial_number             0
nickname               1000
animalID               1000
sample_rate               0
timestamps                0
acc_axis1                 0
acc_axis2                 0
acc_axis3                 0
acc_mag                   0
lpf_axis1                 0
lpf_axis2                 0
lpf_axis3                 0
lpf_mag                   0
hpf_axis1                 0
hpf_axis2                 0
hpf_axis3                 0
hpf_mag                   0
spd_axis1                 0
spd_axis2                 0
spd_axis3                 0
spd_mag                   0
spd_axis1_denoised        0
spd_axis2_denoised        0
spd_axis3_denoised        0
spd_mag_denoised          0
dis_axis1                 0
dis_axis2                 0
dis_axis3                 0
dis_mag                   0
dis_axis1_denoised        0
dis_axis2_denoised        0
dis_axis3_denoised        0
dis_mag_denoised          0
tilt_axis1                0
tilt_axis2                0

In [6]:
# Display the first row of the loaded data.
full_data.head(1)

Unnamed: 0,serial_number,nickname,animalID,sample_rate,timestamps,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,...,dis_axis1_denoised,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised
0,MOS2E03230475,,,30,2023-04-14 18:01:01.000,0.645,0.309,0.715,1.011302,0.636648,...,1874.156185,1694.906462,192.005856,2534.173547,39.120814,18.021817,44.646077,39.105397,18.027755,44.634478


### Sort data based on date
This ensures that activities are read in consecutive order; the index of the readings (rows) is then reset.

In [7]:
# Order the readings by the `timestamps` column (ascending) and renumber the
# rows from zero, discarding the old index.
# NOTE(review): the values are sorted as they were loaded; this assumes the
# timestamp strings sort chronologically -- confirm for other input files.
full_data = full_data.sort_values(by="timestamps").reset_index(drop=True)

### Convert numerical values (input data) to float type

In [8]:
import gc

# Keep a copy of the sorted data before any further processing; the predicted
# labels are attached to this copy for export.
full_data_export: pd.DataFrame = full_data.copy()

# Cast every column from position 5 onwards to float. Assigning through
# `.iloc[:, 5:] = ...` writes into the existing columns and can leave their
# dtypes unchanged, so the cast is done with `astype` on the column names.
numeric_cols: pd.Index = full_data.columns[5:]
full_data = full_data.astype({col: "float" for col in numeric_cols})

gc.collect()

0

### Keep only the timestamp and the data columns.

In [9]:
# Position of the `timestamps` column; it marks the first column to keep.
timestamp_col_index: int = full_data.columns.get_loc("timestamps")
print(timestamp_col_index)
# Slicing the column index keeps `timestamps` and every column to its right.
kept_cols: pd.Index = full_data.columns[timestamp_col_index:]
full_data = full_data[kept_cols]
full_data.head()

4


Unnamed: 0,timestamps,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,lpf_axis2,lpf_axis3,lpf_mag,hpf_axis1,...,dis_axis1_denoised,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised
0,2023-04-14 18:01:01.000,0.645,0.309,0.715,1.011302,0.636648,0.312169,0.717877,1.009018,0.009425,...,1874.156185,1694.906462,192.005856,2534.173547,39.120814,18.021817,44.646077,39.105397,18.027755,44.634478
1,2023-04-14 18:01:01.034,0.633,0.309,0.719,1.006544,0.636649,0.312171,0.717878,1.009019,-0.002577,...,1875.197894,1695.848197,192.113417,2535.581948,39.120815,18.021889,44.64612,39.105401,18.027766,44.634488
2,2023-04-14 18:01:01.067,0.629,0.313,0.715,1.002415,0.63665,0.312172,0.717878,1.009021,-0.00658,...,1876.239602,1696.789933,192.220978,2536.990348,39.120819,18.021961,44.646166,39.105405,18.027776,44.634498
3,2023-04-14 18:01:01.100,0.633,0.309,0.715,1.003691,0.636651,0.312174,0.717879,1.009022,-0.002581,...,1877.281309,1697.731668,192.328538,2538.398747,39.12082,18.022032,44.64621,39.105409,18.027787,44.634509
4,2023-04-14 18:01:01.134,0.633,0.309,0.719,1.006544,0.636651,0.312176,0.717879,1.009024,-0.002583,...,1878.323016,1698.673403,192.436099,2539.807146,39.120823,18.022104,44.646254,39.105413,18.027798,44.634519


### Standardize data.

In [10]:
from utils import standardize_dataframe

# Standardize every column except the first one (`timestamps`).
# NOTE(review): `standardize_dataframe` lives in the project's `utils` module;
# presumably it applies a per-column z-score -- confirm against its source.
full_data = standardize_dataframe(
    data=full_data,
    std_cols=full_data.columns[1:],
)

# Run a garbage collection pass; the returned count is shown as cell output.
gc.collect()

0

## Train-test setup.

### Hyper-parameters

In [11]:
import random

# Seed Python's built-in random number generator.
# Notes from earlier runs: 715 looks good; 785 makes "other" look bad,
# otherwise is good.
random.seed(785)

# Number of readings (rows) per model input window; 300 readings correspond
# to 10 seconds.
window_size: int = 600
window_per_epoch: int = 200
epoch: int = 1
# Batch size used when batching the prediction datasets.
batch_size: int = 64

### Output folder
Output files will be stored in a folder named `out` inside `OUTPUT_PATH`.

In [12]:
# Create the `out` folder under OUTPUT_PATH when it is missing. `exist_ok`
# makes the call a no-op if the folder is already there, so the cell can be
# re-run safely.
os.makedirs("{}out".format(OUTPUT_PATH), exist_ok=True)

## Train and validate the model.

### Train-test loop with k-fold cross validation.
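Each fold will be selected as the validation set and an entire train-test loop will be run using that setup. After each run, metrics are collected and the confusion matrix is plotted. This is repeated until all the folds have been used. Note: the cell below does not train a model; it loads the pre-trained model from disk and only predicts labels for the loaded data.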

In [13]:
import numpy as np
from inference import Inference
from utils import get_sequential_input

# Seed notes from earlier runs: 715 looks good; 785 makes "other" look bad,
# otherwise is good.
random.seed(785)


def _flatten_predicted_classes(predictions: np.ndarray) -> tf.Tensor:
    """
    Convert raw model output into a flat vector of predicted class ids.

    Parameters
    ----------
        predictions: np.ndarray
            Output of `model.predict`; the arg-max is taken over axis 2.

    Returns
    -------
    tf.Tensor
        1-D int64 tensor holding the arg-max class id of every entry along
        the first two axes, flattened in order.
    """
    class_ids: tf.Tensor = tf.math.argmax(
        predictions,
        axis=2,
        output_type=tf.int64,
    )
    return tf.reshape(
        tensor=class_ids,
        shape=(-1,),
    )


# Load the pre-trained model from the location that matches the notebook mode.
if notebook_mode == 1:
    model: tf.keras.Model = tf.keras.models.load_model(
        filepath="{}cow_model.keras".format(SCRIPT_PATH)
    )
elif notebook_mode == 2:
    model = tf.keras.models.load_model(filepath="./out/cow_model.keras")
else:
    # Without this guard an unsupported mode would leave `model` unset and
    # only fail later, at `model.predict`.
    raise ValueError(
        "notebook_mode must be 1 (Google Colab) or 2 (Local), got {}".format(
            notebook_mode
        )
    )

# NOTE(review): `engine` is not used again in the visible cells; it is kept
# in case constructing `Inference` matters elsewhere -- confirm.
engine: Inference = Inference(
    model=model,
    loss_function=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    loss_metric=tf.keras.metrics.Mean(),
    optimizer=tf.keras.optimizers.Adam(),
    accuracy=tf.keras.metrics.SparseCategoricalAccuracy(),
    batch_size=32,
)

# Build the unlabelled model inputs from the whole dataset and predict.
x_test: np.ndarray = get_sequential_input(
    df=full_data,
    window_size=window_size,
    labelled=False,
)
test_ds = tf.data.Dataset.from_tensor_slices(x_test).batch(batch_size)
y_pred: np.ndarray = model.predict(test_ds)

# There are some rows that are not included in the windows (some final rows).
# They are covered by predicting one extra window made of the last
# `window_size` rows of the data.
trimmed_rows: int = len(full_data) % window_size
missing_data: pd.DataFrame = full_data.iloc[-window_size:]
padding_data: pd.DataFrame = full_data.iloc[1:2]
# Append a padding row to the dataframe because `get_sequential_input`
# automatically rejects dataframes with 600 or fewer rows.
trimmed_data: pd.DataFrame = pd.concat(
    objs=[missing_data, padding_data],
)
x_test_trimmed: np.ndarray = get_sequential_input(
    df=trimmed_data,
    window_size=window_size,
    labelled=False,
)
trimmed_test_ds = tf.data.Dataset.from_tensor_slices(x_test_trimmed).batch(batch_size)
trimmed_y_pred: np.ndarray = model.predict(trimmed_test_ds)

# Reduce both predictions to flat vectors of class ids.
y_pred_cm: tf.Tensor = _flatten_predicted_classes(y_pred)
trimmed_y_pred_cm: tf.Tensor = _flatten_predicted_classes(trimmed_y_pred)
gc.collect()



4

In [15]:
# Labels for the rows covered by the full windows.
predicted_labels: np.ndarray = np.array(
    y_pred_cm,
    dtype=np.int32,
)

# Labels for the final rows that no full window reached. They come from the
# extra window predicted over the last `window_size` rows (`trimmed_y_pred_cm`,
# not `y_pred_cm`), and the uncovered rows are the last `trimmed_rows` entries
# of that window. Slicing from `len - trimmed_rows` also yields an empty array
# when `trimmed_rows` is 0.
# NOTE(review): assumes `get_sequential_input` builds non-overlapping windows
# starting at the first row -- confirm against `utils`.
trimmed_window_labels: np.ndarray = np.array(
    trimmed_y_pred_cm,
    dtype=np.int32,
)
trimmed_predicted_labels: np.ndarray = trimmed_window_labels[
    len(trimmed_window_labels) - trimmed_rows :
]

# One label per row of the data, in row order.
labelled_classes: np.ndarray = np.concatenate(
    (predicted_labels, trimmed_predicted_labels),
    axis=0,
    dtype=np.int32,
)
labelled_classes[:10]

array([2, 1, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [16]:
# Class id -> single-letter label written to the exported file.
label_map: dict[int, str] = {0: "g", 1: "i", 2: "o", 3: "r"}

# Translate the predicted class ids and attach them as the `label` column.
# The assignment aligns on the row index.
label_col: pd.Series = pd.Series(labelled_classes).map(label_map)
full_data_export["label"] = label_col

In [17]:
# Display the first rows of the standardized data that was fed to the model.
full_data.head()

Unnamed: 0,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,lpf_axis2,lpf_axis3,lpf_mag,hpf_axis1,hpf_axis2,...,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised,timestamps
0,2.910643,-0.888178,-0.363978,1.416297,1.155464,-3.026575,-1.524282,-0.948874,2.727407,-0.679437,...,-1.730424,-1.730604,-1.730428,2.440463,-3.024906,1.575285,-1.793828,-3.099962,-2.447057,2023-04-14 18:01:01.000
1,-0.531529,-0.888178,0.833318,-0.008239,1.163191,-3.009148,-1.520301,-0.938773,-0.730806,-0.684524,...,-1.726959,-1.727138,-1.726964,2.440649,-3.010061,1.584154,-1.785593,-3.069552,-2.430242,2023-04-14 18:01:01.034
2,-1.678919,0.424239,-0.363978,-1.244722,1.171363,-2.991729,-1.516404,-0.92849,-1.883928,0.62751,...,-1.723494,-1.723672,-1.723499,2.441238,-2.995326,1.593507,-1.777358,-3.039141,-2.413427,2023-04-14 18:01:01.067
3,-0.531529,-0.888178,-0.363978,-0.862622,1.179175,-2.974356,-1.51231,-0.918285,-0.731955,-0.694926,...,-1.72003,-1.720206,-1.720034,2.441433,-2.980603,1.602317,-1.769122,-3.00873,-2.396611,2023-04-14 18:01:01.100
4,-0.531529,-0.888178,0.833318,-0.008239,1.187032,-2.9569,-1.50841,-0.90816,-0.732451,-0.700226,...,-1.716565,-1.716741,-1.716569,2.441765,-2.965737,1.611396,-1.760884,-2.978324,-2.379795,2023-04-14 18:01:01.134


In [18]:
# Destination files inside the `out` folder under OUTPUT_PATH.
labels_txt_path: str = "{}out/predicted_labels.txt".format(OUTPUT_PATH)
labelled_csv_path: str = "{}out/MOS2E03230475_30Hz_proc_labelled.csv".format(
    OUTPUT_PATH
)

# Predicted class ids, written as integers.
np.savetxt(labels_txt_path, labelled_classes, fmt="%d")

# The copy taken before standardization, now with the `label` column, written
# without the row index.
full_data_export.to_csv(labelled_csv_path, index=False)