In [66]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D, Conv2D, ConvLSTM2D, ConvLSTM1D, Input, Flatten, Reshape, TextVectorization
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
#from tensorflow.keras import ops
import keras
import numpy as np

tf.config.run_functions_eagerly(True)


In [67]:
"""Vars"""
sample_headlines = ["Hoo Hoo", "HOO", "WHOSE TOES", "HOO", "Hoo hoo hoo"]
sample_prices = [34.3, 40.4, 90, 30, 0.5]
sample_gains = [0] + [(sample_prices[i+1]-sample_prices[i])/sample_prices[i] for i in range(0, len(sample_prices)-1)]
var_duration = 2
sample_vars = [0 for i in range(var_duration)] + [np.std(sample_prices[i:i+var_duration]) for i in range(0, len(sample_prices)-var_duration)]
print("sample gains: ", sample_gains)
print("smaple vars: ", sample_vars)
#print("zipped: ", np.array(zip(np.array(sample_gains), np.array(sample_vars))))
zipped_labels = [[sample_gains[i], sample_vars[i]] for i in range(len(sample_gains))]
print("zipped: ", zipped_labels)

vocab_size = 50
max_len = 30
embeddings_dim = 1#5


sample gains:  [0, 0.1778425655976677, 1.2277227722772277, -0.6666666666666666, -0.9833333333333333]
smaple vars:  [0, 0, 3.0500000000000007, 24.8, 30.0]
zipped:  [[0, 0], [0.1778425655976677, 0], [1.2277227722772277, 3.0500000000000007], [-0.6666666666666666, 24.8], [-0.9833333333333333, 30.0]]


In [68]:
"""Encoder Methods"""
def get_one_hot_encoded_batch(vocab_size, strings):
    return [one_hot(string, vocab_size) for string in strings]

def pad_input(max_len, one_hot_encoded_strings):
    return pad_sequences(one_hot_encoded_strings, maxlen=max_len)



In [69]:
"""Encoder Architecture"""
input = Input(shape=(max_len))
embeddings_1 = Embedding(input_dim=vocab_size, output_dim=embeddings_dim, input_length=max_len)(input) # look at TextVecotirzation
#flatten_1 = Flatten()(embeddings_1) 
#reshape_1 = Reshape((max_len, -1))(embeddings_1)
lstm_1 = LSTM(units=30, return_sequences=False)(embeddings_1) # Default activation tanh - VERIFY
dense_1 = Dense(units=30)(lstm_1)
dense_2 = Dense(units=1)(dense_1)
output = dense_2 # Perhaps more to come

encoder = keras.Model(inputs = input, outputs = output)
encoder.compile(optimizer="rmsprop", loss="mse")
print(encoder.summary())

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 30)]              0         
                                                                 
 embedding_17 (Embedding)    (None, 30, 1)             50        
                                                                 
 lstm_14 (LSTM)              (None, 30)                3840      
                                                                 
 dense_28 (Dense)            (None, 30)                930       
                                                                 
 dense_29 (Dense)            (None, 1)                 31        
                                                                 
Total params: 4851 (18.95 KB)
Trainable params: 4851 (18.95 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [70]:
"""Encoder Architecture 2
input = Input(shape=(max_len, vocab_size))
embeddings_1 = Embedding(input_dim=vocab_size, output_dim=embeddings_dim, input_length=max_len)(input)
#flatten_1 = Flatten()(embeddings_1)
reshape_1 = Reshape((max_len, vocab_size))(embeddings_1)
lstm_1 = LSTM(units=30, return_sequences=False)(reshape_1) # Default activation tanh - VERIFY
dense_1 = Dense(units=30)(lstm_1)
dense_2 = Dense(units=2)(dense_1)
output = dense_2 # Perhaps more to come

encoder = keras.Model(inputs = input, outputs = output)
encoder.compile(optimizer="rmsprop", loss="mse")
print(encoder.summary())"""

'Encoder Architecture 2\ninput = Input(shape=(max_len, vocab_size))\nembeddings_1 = Embedding(input_dim=vocab_size, output_dim=embeddings_dim, input_length=max_len)(input)\n#flatten_1 = Flatten()(embeddings_1)\nreshape_1 = Reshape((max_len, vocab_size))(embeddings_1)\nlstm_1 = LSTM(units=30, return_sequences=False)(reshape_1) # Default activation tanh - VERIFY\ndense_1 = Dense(units=30)(lstm_1)\ndense_2 = Dense(units=2)(dense_1)\noutput = dense_2 # Perhaps more to come\n\nencoder = keras.Model(inputs = input, outputs = output)\nencoder.compile(optimizer="rmsprop", loss="mse")\nprint(encoder.summary())'

In [71]:
"""Data Processing"""
one_hots = get_one_hot_encoded_batch(vocab_size=vocab_size, strings=sample_headlines)
padded_one_hots = pad_input(max_len=max_len, one_hot_encoded_strings=one_hots)
print(padded_one_hots)
dataset = tf.data.Dataset.from_tensor_slices(sample_headlines)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  3  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0 17 10]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  3  3  3]]


In [72]:
"""Model Training"""
encoder.fit(x=padded_one_hots, y=np.array(zipped_labels), epochs=10)#np.array(zip(sample_gains, sample_vars)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10




Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2966e6f90>

In [75]:
"""Encoder Architecture 3"""
input = Input(shape=(1, ), dtype=tf.string)

tv_layer = TextVectorization(max_tokens=1000, output_mode='int', output_sequence_length=max_len)
tv_layer.adapt(dataset.batch(64))

tv_layer_1 = tv_layer(input)
embeddings_1 = Embedding(input_dim=vocab_size, output_dim=embeddings_dim, input_length=max_len)(tv_layer_1)
#flatten_1 = Flatten()(embeddings_1)
#reshape_1 = Reshape((max_len, vocab_size))(embeddings_1)
lstm_1 = LSTM(units=30, return_sequences=False)(embeddings_1) # Default activation tanh - VERIFY
dense_1 = Dense(units=30)(lstm_1)
dense_2 = Dense(units=2)(dense_1)
output = dense_2 # Perhaps more to come

encoder = keras.Model(inputs = input, outputs = output)
encoder.compile(optimizer="rmsprop", loss="mse")
print(encoder.summary())

Model: "model_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_4 (Text  (None, 30)                0         
 Vectorization)                                                  
                                                                 
 embedding_19 (Embedding)    (None, 30, 1)             50        
                                                                 
 lstm_16 (LSTM)              (None, 30)                3840      
                                                                 
 dense_32 (Dense)            (None, 30)                930       
                                                                 
 dense_33 (Dense)            (None, 2)                 62        
                                                          

In [76]:
"""Model Training"""
encoder.fit(x=np.array(sample_headlines), y=np.array(zipped_labels), epochs=10)#np.array(zip(sample_gains, sample_vars)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2975bead0>

In [77]:
"""Encoder Architecture 4"""
input = Input(shape=(1, ), dtype=tf.string)

tv_layer = TextVectorization(max_tokens=1000, output_mode='int', output_sequence_length=max_len)
tv_layer.adapt(dataset.batch(64))

tv_layer_1 = tv_layer(input)
embeddings_1 = Embedding(input_dim=vocab_size, output_dim=embeddings_dim, input_length=max_len)(tv_layer_1)
#flatten_1 = Flatten()(embeddings_1)
#reshape_1 = Reshape((max_len, vocab_size))(embeddings_1)
lstm_1 = LSTM(units=30, return_sequences=False)(embeddings_1) # Default activation tanh - VERIFY
#dense_1 = Dense(units=30)(lstm_1)
output = lstm_1#dense_1


encoder = keras.Model(inputs = input, outputs = output)
encoder.compile(optimizer="rmsprop", loss="mse")
print(encoder.summary())

Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_22 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_5 (Text  (None, 30)                0         
 Vectorization)                                                  
                                                                 
 embedding_20 (Embedding)    (None, 30, 1)             50        
                                                                 
 lstm_17 (LSTM)              (None, 30)                3840      
                                                                 
Total params: 3890 (15.20 KB)
Trainable params: 3890 (15.20 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
