# Pre-trained CNN for behaviour detection

In [1]:
# Mount Google Drive at /content/drive so that Drive-hosted files can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports and constants

In [2]:
import os
import random
from os import listdir
from os.path import isfile, join

import tensorflow as tf
from keras import Sequential
from keras.layers import (
    GRU,
    BatchNormalization,
    Bidirectional,
    Conv1D,
    Conv2D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    MaxPool1D,
    MaxPool2D,
    RepeatVector,
    TimeDistributed,
)
from keras.losses import MeanSquaredError, SparseCategoricalCrossentropy
from keras.optimizers import SGD, Adam

# Print the installed TensorFlow version so it is recorded in the cell output.
print(tf.__version__)

2.14.0


In [3]:
import gc
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight

In [4]:
# Device used by TensorFlow for this run: "cpu" (case-insensitive) or anything else for GPU.
training_device: str = "cpu"

if training_device.lower() != "cpu":
    # GPU run: require at least one GPU and enable memory growth on the first one.
    physical_devices = tf.config.experimental.list_physical_devices("GPU")
    assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
    config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    # CPU run: make no GPU visible to TensorFlow.
    tf.config.set_visible_devices(devices=[], device_type="GPU")

## Data pre-processing

### Load the Dataset

In [5]:
def load_data_sqlite(
    database: str = "drive/MyDrive/Ellinbank/video_observation/data/ellinbank_cow-observation_labeled-data.db",
    table_name: str = "LebeledDataTable",
) -> pd.DataFrame:
    """Load every row of a labelled-data table from a SQLite database.

    The table is read one ``serial_number`` at a time and the per-serial frames
    are concatenated, so rows that share a serial number end up adjacent in the
    result. Each per-serial frame keeps its own 0-based index, so the returned
    index is not unique when more than one serial number is present.

    Args:
        database: Path of the SQLite database file.
        table_name: Name of the table to read; it must contain a
            ``serial_number`` column.

    Returns:
        A DataFrame holding all rows of the table. When the table has no rows,
        an empty DataFrame with the table's columns is returned.

    Raises:
        sqlite3.Error / pandas.errors.DatabaseError: If the database cannot be
            opened or the table/column does not exist.
    """
    # Table names cannot be bound as SQL parameters, so quote the identifier
    # (doubling any embedded double quote) before formatting it into the query.
    quoted_table: str = '"{}"'.format(table_name.replace('"', '""'))

    connection: sqlite3.Connection = sqlite3.connect(database=database)
    try:
        serial_rows: list = connection.execute(
            "select distinct serial_number from {}".format(quoted_table)
        ).fetchall()
        serial_numbers: list = [row[0] for row in serial_rows]

        if not serial_numbers:
            # Nothing to concatenate: return an empty frame that still carries
            # the table's column names.
            return pd.read_sql_query(
                sql="select * from {}".format(quoted_table),
                con=connection,
            )

        # The serial number is passed as a bound parameter so that values
        # containing quotes cannot break (or alter) the statement.
        per_serial_query: str = "select * from {} where serial_number = ?".format(
            quoted_table
        )
        dataframes: list = [
            pd.read_sql_query(
                sql=per_serial_query,
                con=connection,
                params=(serial_number,),
            )
            for serial_number in serial_numbers
        ]
    finally:
        # Always release the database handle, even when a query fails.
        connection.close()

    return pd.concat(dataframes)

In [6]:
# Run mode switch: "developing" reads a 1000-row sample from a relative path;
# any other value takes the Google Drive branch below.
notebook_mode: str = "deploying"
if notebook_mode == "developing":
    full_data: pd.DataFrame = pd.read_csv(
        filepath_or_buffer="../../../data/MOS2E03230475_30Hz_proc.csv",
        nrows=1000,
    )
else:
    # Copy the project modules (custom_model, inference, utils) from Drive into the
    # working directory; later cells import from modules with these names.
    !cp drive/MyDrive/Ellinbank/video_observation/custom_train_test_loop/data-labelling_conv1d-seq2seq-model/custom_model.py .
    !cp drive/MyDrive/Ellinbank/video_observation/custom_train_test_loop/data-labelling_conv1d-seq2seq-model/inference.py .
    !cp drive/MyDrive/Ellinbank/video_observation/custom_train_test_loop/data-labelling_conv1d-seq2seq-model/utils.py .
    # Alternative data source (currently disabled): full_data = load_data_sqlite()
    full_data: pd.DataFrame = pd.read_csv(
        # Relative-path equivalent used in developing mode: "../../../data/MOS2E03230475_30Hz_proc.csv"
        filepath_or_buffer="/content/drive/MyDrive/Ellinbank/video_observation/data/MOS2E03230475_30Hz_proc.csv",
    )

### Check for duplicated rows and null values

In [7]:
# Flag rows that repeat an earlier row in every column, then keep only the flagged ones.
duplicated_rows: pd.Series = full_data.duplicated()
duplicated_rows = duplicated_rows[duplicated_rows]
# Report the number of duplicated rows (a count, not the Series' shape tuple).
print("{} rows duplicated".format(len(duplicated_rows)))

full_data = full_data.drop_duplicates()

# Missing values per column. `isna` is an alias of `isnull`, so one report is sufficient.
print(full_data.isnull().sum())

(0,) rows duplicated
serial_number               0
nickname               939987
animalID               939987
sample_rate                 0
timestamps                  0
acc_axis1                   0
acc_axis2                   0
acc_axis3                   0
acc_mag                     0
lpf_axis1                   0
lpf_axis2                   0
lpf_axis3                   0
lpf_mag                     0
hpf_axis1                   0
hpf_axis2                   0
hpf_axis3                   0
hpf_mag                     1
spd_axis1                   1
spd_axis2                   1
spd_axis3                   1
spd_mag                     1
spd_axis1_denoised          1
spd_axis2_denoised          1
spd_axis3_denoised          1
spd_mag_denoised            1
dis_axis1                   1
dis_axis2                   1
dis_axis3                   1
dis_mag                     1
dis_axis1_denoised          1
dis_axis2_denoised          1
dis_axis3_denoised          1
dis_mag_denoised   

In [8]:
# Preview the first row to inspect the available columns.
full_data.head(n=1)

Unnamed: 0,serial_number,nickname,animalID,sample_rate,timestamps,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,...,dis_axis1_denoised,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised
0,MOS2E03230475,,,30,2023-04-14 18:01:01.000,0.645,0.309,0.715,1.011302,0.636648,...,1874.156185,1694.906462,192.005856,2534.173547,39.120814,18.021817,44.646077,39.105397,18.027755,44.634478


### Sort data based on date
This ensures that activities are read in consecutive order, and resets the index of the readings (rows).

In [9]:
# Order the readings by the `timestamps` column (ascending) and renumber the rows from 0.
full_data = full_data.sort_values(by=["timestamps"], ascending=True).reset_index(
    drop=True
)

### Convert numerical values (input data) to float type

In [10]:
# Keep a copy of the frame as it is at this point; the predicted labels are attached
# to this copy before it is exported.
full_data_export: pd.DataFrame = full_data.copy()

# Cast every column from position 5 onwards to float. The frame is rebound with
# `astype` instead of assigning through `.iloc[:, 5:]`: on pandas >= 2.0 an `.iloc`
# assignment writes into the existing columns and leaves their dtypes unchanged,
# so the cast would silently have no effect.
numeric_cols: pd.Index = full_data.columns[5:]
full_data = full_data.astype({col: "float" for col in numeric_cols})

gc.collect()

0

### Keep only the timestamp and the data columns.

In [11]:
# Find where the `timestamps` column sits and discard every column to its left.
timestamp_col_index: int = full_data.columns.get_loc("timestamps")
print(timestamp_col_index)
kept_cols: pd.Index = full_data.columns[timestamp_col_index:]
full_data = full_data.loc[:, kept_cols]
full_data.head(n=5)

4


Unnamed: 0,timestamps,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,lpf_axis2,lpf_axis3,lpf_mag,hpf_axis1,...,dis_axis1_denoised,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised
0,2023-04-14 18:01:01.000,0.645,0.309,0.715,1.011302,0.636648,0.312169,0.717877,1.009018,0.009425,...,1874.156185,1694.906462,192.005856,2534.173547,39.120814,18.021817,44.646077,39.105397,18.027755,44.634478
1,2023-04-14 18:01:01.034,0.633,0.309,0.719,1.006544,0.636649,0.312171,0.717878,1.009019,-0.002577,...,1875.197894,1695.848197,192.113417,2535.581948,39.120815,18.021889,44.64612,39.105401,18.027766,44.634488
2,2023-04-14 18:01:01.067,0.629,0.313,0.715,1.002415,0.63665,0.312172,0.717878,1.009021,-0.00658,...,1876.239602,1696.789933,192.220978,2536.990348,39.120819,18.021961,44.646166,39.105405,18.027776,44.634498
3,2023-04-14 18:01:01.100,0.633,0.309,0.715,1.003691,0.636651,0.312174,0.717879,1.009022,-0.002581,...,1877.281309,1697.731668,192.328538,2538.398747,39.12082,18.022032,44.64621,39.105409,18.027787,44.634509
4,2023-04-14 18:01:01.134,0.633,0.309,0.719,1.006544,0.636651,0.312176,0.717879,1.009024,-0.002583,...,1878.323016,1698.673403,192.436099,2539.807146,39.120823,18.022104,44.646254,39.105413,18.027798,44.634519


### Standardize data.

In [12]:
from utils import standardize_dataframe

# Standardise every column except the first one (`timestamps`) with the project helper.
columns_to_standardize: pd.Index = full_data.columns[1:]
full_data = standardize_dataframe(data=full_data, std_cols=columns_to_standardize)

gc.collect()

0

## Train-test setup.

### Hyper-parameters

In [13]:
# Number of rows per input window passed to `get_sequential_input`.
# Per the original note, 300 rows correspond to 10 seconds.
window_size: int = 600  # 300: 10 seconds
# NOTE(review): `window_per_epoch` and `epoch` are not referenced by the cells visible
# in this notebook; presumably left over from the training notebook - confirm before removing.
window_per_epoch: int = 200
epoch: int = 1
# Batch size used when the model input is batched with `tf.data`.
batch_size: int = 64
# Seed for Python's `random` module. Alternative seed tried previously:
# random.seed(715) # 715 looks good.
random.seed(785)  # 785 makes "other" looks bad, otherwise is good.

### Output folder
Output files are stored in a folder named `out` in the current working directory; the folder is created if it does not exist.

In [14]:
import os

# Create the output folder when it is missing. `exist_ok=True` makes re-running this
# cell a no-op and avoids the gap between checking for the folder and creating it.
os.makedirs("./out/", exist_ok=True)

## Train and validate the model.

### Train-test loop with k-fold cross validation.
Each fold will be selected as the validation set and an entire train-test loop will be run using that setup. After each run, metrics are collected and the confusion matrix will be plotted. This will be repeated until all the folds are used.

In [15]:
from tqdm.notebook import tqdm
from custom_model import CowModel
from inference import Inference
from utils import get_frame, get_sequential_frame, get_sequential_input
from utils import plot_confusion_matrix
import logging

# random.seed(715) # 715 looks good.
random.seed(785)  # 785 makes "other" looks bad, otherwise is good.


def _flat_class_ids(class_scores: np.ndarray) -> tf.Tensor:
    """Return the arg-max over axis 2 of `class_scores`, flattened to 1-D (int64)."""
    class_ids: tf.Tensor = tf.math.argmax(
        class_scores,
        axis=2,
        output_type=tf.int64,
    )
    return tf.reshape(tensor=class_ids, shape=[-1])


# Load the saved model from the location that matches the notebook mode.
model_path: str = (
    "./cow_model.keras"
    if notebook_mode == "developing"
    else "drive/MyDrive/Ellinbank/video_observation/custom_train_test_loop/data-labelling_conv1d-seq2seq-model/cow_model.keras"
)
model: tf.keras.Model = tf.keras.models.load_model(filepath=model_path)

# NOTE(review): `engine` is built here but not used by the cells visible in this notebook.
engine: Inference = Inference(
    model=model,
    loss_function=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    loss_metric=tf.keras.metrics.Mean(),
    optimizer=tf.keras.optimizers.Adam(),
    accuracy=tf.keras.metrics.SparseCategoricalAccuracy(),
    batch_size=32,
)

# Cut the unlabelled data into model input windows and batch them.
x_test: np.ndarray = get_sequential_input(
    df=full_data,
    window_size=window_size,
    labelled=False,
)
test_ds = tf.data.Dataset.from_tensor_slices(x_test).batch(batch_size)

# Gather the batches back into a single array and predict on all windows.
data: list = list(test_ds)
predict_data: np.ndarray = np.concatenate(data)
y_pred: np.ndarray = model.predict(predict_data)

# Some final rows are not covered by the windows above; count them.
trimmed_rows: int = len(full_data) - predict_data.shape[0] * predict_data.shape[1]

# Build one extra input from the last `window_size` rows so the uncovered rows get a
# prediction. A padding row is appended because `get_sequential_input` rejects
# dataframes with `window_size` (600) or fewer rows.
missing_data: pd.DataFrame = full_data.iloc[-window_size:]
padding_data: pd.DataFrame = full_data.iloc[1:2]
trimmed_data: pd.DataFrame = pd.concat(
    objs=[missing_data, padding_data],
)
x_test_trimmed: np.ndarray = get_sequential_input(
    df=trimmed_data,
    window_size=window_size,
    labelled=False,
)
trimmed_test_ds = tf.data.Dataset.from_tensor_slices(x_test_trimmed).batch(batch_size)
trimmed_predict_data: np.ndarray = np.concatenate(list(trimmed_test_ds))
trimmed_y_pred: np.ndarray = model.predict(trimmed_predict_data)

# Reduce both prediction arrays to flat vectors of class indices.
y_pred_cm: tf.Tensor = _flat_class_ids(y_pred)
trimmed_y_pred_cm: tf.Tensor = _flat_class_ids(trimmed_y_pred)
gc.collect()



1291

In [16]:
# Class index for every row covered by the regular windows.
predicted_labels: np.ndarray = np.array(
    y_pred_cm,
    dtype=np.int32,
)

# Class indices predicted for the extra window built from the last rows of the data.
# These must come from `trimmed_y_pred_cm`; reading `y_pred_cm` here would reuse the
# labels of the first rows of the recording for its final rows.
tail_window_labels: np.ndarray = np.array(
    trimmed_y_pred_cm,
    dtype=np.int32,
)
# The uncovered rows are the final `trimmed_rows` rows of the data, so take the END of
# the extra window (assumes `get_sequential_input` returns that window's rows in
# order - TODO confirm). Computing the start explicitly yields an empty slice when
# `trimmed_rows` is 0, which a plain `[-trimmed_rows:]` would not.
tail_start: int = max(len(tail_window_labels) - trimmed_rows, 0)
trimmed_predicted_labels: np.ndarray = tail_window_labels[tail_start:]

# One class index per row: regular windows first, then the uncovered final rows.
labelled_classes: np.ndarray = np.concatenate(
    (predicted_labels, trimmed_predicted_labels),
    axis=0,
    dtype=np.int32,
)
labelled_classes[:10]

array([2, 2, 1, 0, 3, 3, 3, 3, 3, 3], dtype=int32)

In [17]:
# Class index -> single-letter behaviour code written to the exported file.
label_map: dict[int, str] = {0: "g", 1: "i", 2: "o", 3: "r"}

# Translate the class indices to letter codes and attach them to the export frame;
# the assignment aligns on the row index.
label_col: pd.Series = pd.Series(labelled_classes).map(label_map)
full_data_export["label"] = label_col

In [18]:
# Preview the first five rows of the standardised frame.
full_data.head(n=5)

Unnamed: 0,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,lpf_axis2,lpf_axis3,lpf_mag,hpf_axis1,hpf_axis2,...,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised,timestamps
0,3.003887,-0.818894,-1.159826,0.919112,3.111862,-0.62371,-2.64556,-2.357273,2.64785,-0.67083,...,-1.853723,-2.461148,-1.862785,2.964733,1.250043,2.823044,2.691335,1.795062,2.645738,2023-04-14 18:01:01.000
1,-0.337319,-0.818894,-0.045672,-0.495178,3.115552,-0.616834,-2.645132,-2.355324,-0.714778,-0.675901,...,-1.853719,-2.461139,-1.862781,2.96476,1.253818,2.823858,2.691418,1.795704,2.645928,2023-04-14 18:01:01.034
2,-1.451054,0.483093,-1.159826,-1.722767,3.119455,-0.60996,-2.644714,-2.35334,-1.836027,0.632206,...,-1.853714,-2.46113,-1.862776,2.964845,1.257564,2.824717,2.691501,1.796346,2.646119,2023-04-14 18:01:01.067
3,-0.337319,-0.818894,-1.159826,-1.343416,3.123185,-0.603105,-2.644274,-2.351372,-0.715895,-0.686272,...,-1.85371,-2.461121,-1.862772,2.964873,1.261308,2.825526,2.691584,1.796988,2.646309,2023-04-14 18:01:01.100
4,-0.337319,-0.818894,-0.045672,-0.495178,3.126938,-0.596216,-2.643855,-2.349418,-0.716378,-0.691556,...,-1.853705,-2.461112,-1.862767,2.964922,1.265089,2.82636,2.691668,1.797631,2.6465,2023-04-14 18:01:01.134


In [19]:
# Write the class indices as integers, one per line.
np.savetxt("./out/predicted_labels.txt", labelled_classes, fmt="%d")

# Write the export frame, including its `label` column, without the row index.
full_data_export.to_csv("./out/MOS2E03230475_30Hz_proc_labelled.csv", index=False)