# Model training lab

This is the notebook for loading and training models.
Furthermore it provides simple documentation for different approaches used for training a model.

Run the command below to see command-completion on pressing `TAB`.

## Prerequisits

In [1]:
# Imports
import os
import warnings
import tools
import models as c_models
import pandas as pd
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.layers import Bidirectional

# Ignore future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Root CSV files directory
dirname = "/home/datagroup/Videos/SL/output/csv/"

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Preparation Stage
### Load data

In [23]:
listfile = os.listdir(dirname)
contents = []
for wordname in listfile:
    if wordname == ".DS_Store":
        continue
    for csv in os.listdir(dirname + wordname):
        filepath = os.path.join(dirname, wordname, csv)
        content = pd.read_csv(filepath, sep=';')
        content = content.reindex(list(range(0, 100)))
        for i in range(55, 100):
            content = content.append(pd.Series(), ignore_index=True)
        content.fillna(0.0)
        contents.append((wordname, content))
data = contents
contents[0][1].describe()

Unnamed: 0,face_x,face_y,landmark_x_1,landmark_y_1,landmark_x_2,landmark_y_2,landmark_x_3,landmark_y_3,landmark_x_4,landmark_y_4,...,landmark_x_38,landmark_y_38,landmark_x_39,landmark_y_39,landmark_x_40,landmark_y_40,landmark_x_41,landmark_y_41,landmark_x_42,landmark_y_42
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.423129,0.299152,0.165766,0.308951,0.175622,0.3132,0.183943,0.313322,0.190281,0.314959,...,0.46488,0.734882,0.481533,0.667846,0.480131,0.681474,0.476105,0.71219,0.472616,0.733929
std,0.011926,0.005767,0.195172,0.342159,0.198999,0.345818,0.204568,0.344697,0.209548,0.345273,...,0.105641,0.040387,0.146333,0.046403,0.13606,0.041896,0.130853,0.045907,0.127918,0.049162
min,0.403872,0.286105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.349456,0.69525,0.319757,0.631866,0.330062,0.652155,0.332186,0.670832,0.33181,0.688324
25%,0.41433,0.294871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.352188,0.715104,0.322756,0.634509,0.332311,0.655918,0.334262,0.684355,0.334761,0.708868
50%,0.424998,0.29888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.522606,0.71705,0.584762,0.657085,0.572692,0.657265,0.559502,0.693566,0.553065,0.712975
75%,0.430079,0.302729,0.312071,0.606342,0.349289,0.636691,0.373045,0.654049,0.391925,0.661954,...,0.53046,0.74839,0.589919,0.669903,0.578837,0.691057,0.56664,0.726587,0.555823,0.745653
max,0.441718,0.312287,0.618617,0.886661,0.578861,0.88376,0.56347,0.878666,0.549722,0.858838,...,0.569689,0.798616,0.590473,0.745866,0.586751,0.750975,0.587936,0.785609,0.587621,0.813824


### Normalize
For training it's required to extend/reduce every dataset to n frames.

In [24]:
# Frame count
frames = 100
for i in range(0, 100):
    data.extend()

TypeError: extend() takes exactly one argument (0 given)

### Split data
Split the dataset up into the following segments:
1. Training Data: 66%
2. Test Data: 33%
3. Validation Data: None

In [29]:
features = [n[1] for n in data]
labels = [n[0] for n in data]
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)

In [30]:
# Display train data
print("Total:", len(labels))
print("Training:", len(y_train), len(y_train) / len(labels) * 100)
print("Test:", len(y_test), len(y_test) / len(labels) * 100)
#print("Validation:", len(y_val), len(y_val) / len(labels) * 100)

Total: 347
Training: 232 66.85878962536023
Test: 115 33.14121037463977


### Tokenize (One Hot)

In [31]:
import tools
tokenizer = tools.tokenize(dirname)
print(tokenizer.word_index)
encoded_train=tokenizer.texts_to_sequences([y_train])[0]
encoded_test=tokenizer.texts_to_sequences([y_test])[0]
y_train = to_categorical(encoded_train)
y_test = to_categorical(encoded_test)
print(y_train)

{'computer': 1, 'hallo': 2, 'welt': 3, 'deutschland': 4}
[[0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]]


## Training Stage

In [32]:
model = c_models.build_model();

Instructions for updating:
Colocations handled automatically by placer.


In [33]:
history=model.fit(x_train,y_train,epochs=100,batch_size=32,validation_data=(x_test,y_test))

ValueError: Error when checking input: expected lstm_input to have 3 dimensions, but got array with shape (145, 86)