# Evaluating models on the dataset

## Imports

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from wp8.pre_processing.utils import listdir_nohidden_sorted as lsdir
from tqdm.notebook import tqdm
from wp8.pre_processing.generators import TimeSeriesGenerator as TSG
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import wandb
from wandb.keras import WandbCallback

In [None]:
# SECURITY: never commit a Weights & Biases API key to the notebook. A key was
# previously pasted on this line; treat it as leaked and revoke it.
# Authenticate with `wandb login`, or export WANDB_API_KEY in the shell before
# starting Jupyter. If it must be set from the notebook, use a placeholder:
# %env WANDB_API_KEY=<your-api-key>

## Load dataset and features

In [None]:
# Directories holding the saved feature archives and the annotation CSV files.
features_path = "../outputs/dataset/features/"
dataset_path = "../outputs/dataset/dataset/"

# Load the features. Only the first two entries returned by lsdir are read
# (presumably a sorted listing of the directory -- confirm in wp8 utils).
all_features_paths = lsdir(features_path)[:2]
feature_chunks = []
for archive_path in tqdm(all_features_paths):
    # np.load is used as a context manager so the archive is closed right
    # after its "arr_0" entry has been read.
    with np.load(archive_path) as archive:
        feature_chunks.append(archive["arr_0"])

# Stack the per-file arrays along the first axis into a single array.
all_features = np.concatenate(feature_chunks, axis=0)

In [None]:
# Read the first two CSV files of the dataset directory (same [:2] selection
# as the feature archives), using the first CSV column as the index.
csv_paths = lsdir(dataset_path)[:2]
dfs = [pd.read_csv(csv_path, index_col=0) for csv_path in tqdm(csv_paths)]

# Concatenate the frames row-wise and renumber the rows 0..n-1.
dataset = pd.concat(dfs, ignore_index=True)

In [None]:
# Print the shapes of the annotation table and of the feature array so their
# row counts can be compared by eye.
print(dataset.shape, all_features.shape)

In [None]:
# NOTE(review): with a negative n, head() returns every row except the last 10.
# If a short preview was intended, confirm whether head(10) or tail(10) was meant.
dataset.head(-10)

In [None]:
# Derive a camera id for every row from its frame name.
# assumes the sixth character from the end of frame_name is a single-digit
# camera number -- TODO confirm against the frame naming scheme
names = dataset["frame_name"]
cams = [int(frame_name[-6]) for frame_name in names]

# Stored through a Series, so values are aligned to dataset's index.
dataset["cams"] = pd.Series(cams)

dataset.head()

In [None]:
# Store each row of all_features as a Python list in a new "features" column.
# The Series gets a default 0..n-1 index and is aligned to dataset's index on
# assignment, so a row-count mismatch produces NaN cells (or dropped rows)
# instead of raising an error.
dataset["features"] = pd.Series(all_features.tolist())

In [None]:
# Count missing entries in the "features" column; a non-zero value means some
# rows have no feature vector attached.
print(dataset["features"].isna().sum())

In [None]:
# Count how many rows carry each distinct value of "micro_labels".
dataset["micro_labels"].value_counts()

In [None]:
# Map the "micro_labels" values to integer class ids.
le = preprocessing.LabelEncoder()
encoded_labels = le.fit_transform(dataset["micro_labels"])

# The number of distinct encoded ids is the number of classes the model predicts.
n_labels = np.unique(encoded_labels).size
print("n_labels: ", n_labels)

## Train Test split

In [None]:
# Sequential (unshuffled) train/test split: the first TRAIN_FRACTION of the
# rows is used for training and the remainder for testing.
# The split index is deliberately NOT called `len`: rebinding the builtin would
# break every later `len(...)` call in the notebook (e.g. when the label
# encoding cell is re-run).
TRAIN_FRACTION = 0.7
split_idx = int(dataset.shape[0] * TRAIN_FRACTION)

# Each "features" cell holds one feature vector as a Python list; convert the
# selected rows back into dense NumPy arrays.
X_train = np.array(dataset["features"].iloc[:split_idx].tolist())
X_test = np.array(dataset["features"].iloc[split_idx:].tolist())

# encoded_labels is positionally aligned with the dataset rows.
y_train = encoded_labels[:split_idx]
y_test = encoded_labels[split_idx:]

# Camera ids for the same row ranges (kept as Series; converted with
# .tolist() where they are consumed).
cams_train = dataset["cams"].iloc[:split_idx]
cams_test = dataset["cams"].iloc[split_idx:]

## Train

In [None]:
# Start a Weights & Biases run; every hyper-parameter used below is read back
# from wandb.config so the logged configuration and the code cannot drift apart.
run = wandb.init(
    project="WP8",
    config={
        "epochs": 1,
        "sequence_length": 32,
        "num_features": 2048,
        "batch_size": 32,
        "sliding_window_stride": 20,
        "loss_function": "sparse_categorical_crossentropy",
        "architecture": "LSTM",
        "dataset": "single_file",
    },
)
config = wandb.config

# Sliding-window generators over the train and test rows.
# presumably each batch is (batch_size, seq_len, num_features) -- confirm in
# wp8.pre_processing.generators.TimeSeriesGenerator
train_gen = TSG(
    X=X_train,
    y=y_train,
    num_features=config.num_features,
    cams=cams_train.tolist(),
    batch_size=config.batch_size,
    stride=config.sliding_window_stride,
    seq_len=config.sequence_length,
)

test_gen = TSG(
    X=X_test,
    y=y_test,
    cams=cams_test.tolist(),
    num_features=config.num_features,
    batch_size=config.batch_size,
    stride=config.sliding_window_stride,
    seq_len=config.sequence_length,
)

# Single-layer LSTM classifier with a softmax over the label classes.
# The time dimension of input_shape follows config.sequence_length, the same
# value passed to the generators as seq_len; it was previously hard-coded to 20
# (the stride value), which did not match the configured sequence length of 32.
model = Sequential()
model.add(LSTM(units=128, input_shape=(config.sequence_length, config.num_features)))
model.add(Dense(n_labels, activation="softmax"))
model.compile(optimizer="adam", loss=config.loss_function, metrics=["acc"])
model.summary()

In [None]:
# Train for config.epochs epochs on the training generator, using the test
# generator as validation data; WandbCallback is attached so the run is
# reported to Weights & Biases.
model.fit(train_gen, validation_data=test_gen, epochs=config.epochs, callbacks=[WandbCallback()])

In [None]:
#run.join()