## Dataset

We want a dataset that contains text together with the prices of the currencies.

I.e. we want to see if the text has any influence on the prices of the currencies.

Open problem: where do we get such a dataset? Do we have to create one ourselves?

In [None]:
# Load a dataset. The one below is just a placeholder (a language dataset for Finnish QA):

import pandas as pd

# Read the translated train / validation splits from disk.
train_df_translated = pd.read_csv("translated_train_data.csv")
validation_df_translated = pd.read_csv("translated_validation_data.csv")

# Finnish: keep only the rows whose "lang" column equals "fi".
# reset_index() renumbers the rows and keeps the previous index as an "index" column.
# The model input is the "question_en" and "context" columns concatenated
# into a single "all_text" column, separated by the " <concat> " marker.
finnish_train_df = (
    train_df_translated
    .loc[train_df_translated["lang"] == "fi"]
    .reset_index()
    .assign(all_text=lambda rows: rows["question_en"] + " <concat> " + rows["context"])
)
finnish_val_df = (
    validation_df_translated
    .loc[validation_df_translated["lang"] == "fi"]
    .reset_index()
    .assign(all_text=lambda rows: rows["question_en"] + " <concat> " + rows["context"])
)

# Preview the first rows of the Finnish training split.
finnish_train_df.head()


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The model cannot read raw text, so we need a text vectorizer: it maps each word to an integer id that the model can consume.
# This is our tokenization step; other schemes (e.g. unigram or bigram tokenization) would presumably also be possible.

# NOTE: The following uses IOB tags - we won't need that in our case, at least I can't see a reason to.

def create_and_adapt_vectorizer(text_data, max_tokens=20000, max_length=200):
    """Create a ``TextVectorization`` layer and adapt it to ``text_data``.

    Args:
        text_data: Object exposing a ``.values`` attribute holding the texts
            (the notebook passes a pandas Series of strings).
        max_tokens: Forwarded to the layer as ``max_tokens``.
        max_length: Forwarded to the layer as ``output_sequence_length``.

    Returns:
        The adapted ``TextVectorization`` layer (integer output mode).
    """
    layer_options = {
        "max_tokens": max_tokens,
        "output_mode": "int",
        "output_sequence_length": max_length,
    }
    text_vectorizer = TextVectorization(**layer_options)

    # Fit the layer on the raw values of the text column.
    raw_texts = text_data.values
    text_vectorizer.adapt(raw_texts)
    return text_vectorizer

# Fit the Finnish vectorizer on the "all_text" column of the training split
finnish_vectorizer = create_and_adapt_vectorizer(
    text_data=finnish_train_df["all_text"],
)

# Open question: should the labels be finnish_train_df["zipped_tokens_labels"], or is it "iob_labels"?

# Sequences = the model input, in this case the "all_text" column.
# Labels = what we are trying to predict, in this case the "iob_labels" column
# (these have not been generated in this example).
def preprocess_data_with_vectorizer(df, vectorizer):
    """Vectorize the text column of ``df`` and extract its label column.

    Args:
        df: DataFrame with an ``'all_text'`` column and an ``'iob_labels'`` column.
        vectorizer: Callable applied to the raw ``'all_text'`` values.

    Returns:
        A ``(sequences, labels)`` tuple: whatever ``vectorizer`` returns for the
        text values, and the ``'iob_labels'`` column as a NumPy array.
    """
    # Tokenize and transform the text data.
    token_ids = vectorizer(df['all_text'].values)
    # Only the IOB labels are needed here; the tokens themselves are not considered.
    tag_column = df['iob_labels']
    return token_ids, np.array(tag_column)

# Vectorize the text and pull out the raw labels for both splits
finnish_train_padded, finnish_train_labels = preprocess_data_with_vectorizer(
    df=finnish_train_df,
    vectorizer=finnish_vectorizer,
)
finnish_val_padded, finnish_val_labels = preprocess_data_with_vectorizer(
    df=finnish_val_df,
    vectorizer=finnish_vectorizer,
)

# Map each IOB tag ('O', 'B', 'I') to an integer class id, since the model
# cannot consume the string tags directly.
iob_tag_to_index = {tag: class_id for class_id, tag in enumerate(('O', 'B', 'I'))}

def encode_iob_labels(iob_labels, max_length=200):
    """Convert sequences of IOB tags to a fixed-width integer array.

    Args:
        iob_labels: Iterable of tag sequences; every tag must be a key of
            ``iob_tag_to_index`` (otherwise a ``KeyError`` is raised).
        max_length: Width of the returned array. This should equal the
            vectorizer's output sequence length so labels line up with tokens.

    Returns:
        NumPy array with one row per input sequence and ``max_length`` columns.
        Short sequences are padded at the end with the index of ``'O'``;
        long sequences are cut at the end.
    """
    # First, convert the IOB tags to their respective indices.
    encoded_labels = [
        [iob_tag_to_index[tag] for tag in label_sequence]
        for label_sequence in iob_labels
    ]

    # Pad/truncate so every row has the same length. pad_sequences truncates
    # from the FRONT by default, whereas the text vectorizer keeps the first
    # `max_length` tokens; truncating='post' keeps the labels aligned with the
    # tokens for sequences longer than `max_length`.
    return pad_sequences(
        encoded_labels,
        maxlen=max_length,
        padding='post',
        truncating='post',
        value=iob_tag_to_index['O'],
    )


# Encode and pad the labels (the default max_length=200 is used for both splits)
finnish_train_encoded_labels = encode_iob_labels(iob_labels=finnish_train_labels)
finnish_val_encoded_labels = encode_iob_labels(iob_labels=finnish_val_labels)


## Model:

In [None]:
from tensorflow.keras import layers, models

#max_length = 200 # same as vectorizer

def build_qa_model(vocab_size, embedding_dim=128, rnn_units=64, output_dim=3, max_length=200):
    """Build and compile a bidirectional-LSTM sequence-labeling model.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the LSTM wrapped by the bidirectional layer.
        output_dim: Number of classes predicted at each position.
        max_length: Length of the input sequences; should match the
            vectorizer's output sequence length.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    # Layers are created in the same order in which they are stacked.
    token_input = layers.Input(shape=(max_length,))
    embedding = layers.Embedding(vocab_size, embedding_dim)
    # return_sequences=True keeps one output per position for sequence labeling.
    encoder = layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=True))
    # One softmax over the tag classes for every position.
    tagger = layers.TimeDistributed(layers.Dense(output_dim, activation='softmax'))

    tagging_model = models.Sequential([token_input, embedding, encoder, tagger])
    tagging_model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return tagging_model

# Create and compile the model
vocab_size = finnish_vectorizer.vocabulary_size()
# Take the sequence length from the vectorized training data instead of
# repeating the literal 200: the second dimension of the vectorizer output is
# the vectorizer's output sequence length, so the model input cannot drift
# out of sync with it.
max_length = int(finnish_train_padded.shape[1])
qa_model = build_qa_model(vocab_size=vocab_size, max_length=max_length)

# Train the model
qa_model.fit(finnish_train_padded, finnish_train_encoded_labels,
             validation_data=(finnish_val_padded, finnish_val_encoded_labels),
             epochs=2, batch_size=32)
