In [2]:
import sys
import os

# Make the parent of the current working directory importable so that the
# project-local `src` package can be resolved from this notebook.
root_path = os.path.abspath("..")
if root_path not in sys.path:
    sys.path.append(root_path)

from src.tokenizer import Tokenizer

# Build the tokenizer and call load() on it.
# NOTE(review): load() presumably restores a previously saved vocabulary — confirm in src/tokenizer.py.
tokenizer = Tokenizer()
tokenizer.load()

In [10]:
import torch
import torch.nn.functional as F
from src.modelTokenized import Model


# Length of the token window fed to the model (generate_text pads/truncates prompts to this size).
SEQUENCE_LENGTH = 30
# Second argument to Model — presumably the hidden-state size; confirm in src/modelTokenized.py.
HSIZE = 128

# tokenizer.N is presumably the vocabulary size — TODO confirm.
# NOTE(review): these values likely have to match the ones used when the saved weights were trained.
model = Model(tokenizer.N, HSIZE)

In [12]:
# Restore the trained parameters into `model`.
# map_location="cpu" lets a checkpoint that was saved from a GPU be read on a
# CPU-only machine; load_state_dict then copies the values into the model's
# existing tensors.
# NOTE(security): torch.load unpickles the file, so only load checkpoints you
# trust (on recent PyTorch versions, consider passing weights_only=True).
with open("../model_weights_tokenized", "rb") as file:
    model.load_state_dict(torch.load(file, map_location="cpu"))

In [13]:
# Inspect the runningMean tensor stored on model.gru.bN_u now that the weights are loaded.
# NOTE(review): presumably batch-norm running statistics — confirm in src/modelTokenized.py.
print(model.gru.bN_u.runningMean)

Parameter containing:
tensor([ -30.4089,  -80.0266,  -26.7948,  -49.0246,  -94.4079,  -53.6148,
         -34.6053,  -44.0101,  -54.3059,  -48.9706,  -25.2879,  -59.6890,
         -47.9147,  -34.7455,  -86.0674,  -48.8964, -127.9646,  -75.2269,
         -13.7266,  -42.8818,  -25.8266,  -51.7554,  -48.5940,  -15.2906,
        -117.0929, -105.1198,  -43.1326,  -64.5696,  -53.4824,  -30.3757,
         -98.9775,  -82.2478, -109.3255,  -51.0680,  -11.0687,  -73.8882,
          -7.4569,  -99.1451, -148.4676,  -45.7623,  -93.8075,  -36.0474,
         -52.2784,  -58.9071,  -11.3402,  -47.0831,  -15.2010,  -50.4270,
         -46.1610,  -42.2677,  -23.9342, -105.1030,  -73.9310, -106.9341,
         -51.1147,    9.2089,  -13.9579,  -74.1559, -101.7168,  -69.1680,
         -46.2908,  -94.2295,  -76.7574,  -47.2658, -146.3714,  -69.7924,
         -88.2510, -108.0595,  -33.9888, -103.3003,  -45.8423,    7.8727,
         -14.8264,  -37.8998,  -37.1361,  -55.3104,  -41.1776,  -13.2928,
         -44.992

In [14]:
# Token id of the space character. The recorded output is 0, the same value
# generate_text uses to left-pad short prompts.
tokenizer.chars_to_tokens[" "]

0

In [15]:
def generate_char(tokens):
    """Sample the next token from the model for a window of context tokens.

    Args:
        tokens: token-id tensor passed straight to ``model.forward``;
            ``generate_text`` supplies it with shape ``(1, SEQUENCE_LENGTH)``.

    Returns:
        tuple: ``(generated, new_char)`` where ``generated`` is the sampled
        token id as a Python int and ``new_char`` is the string that
        ``tokenizer.tokens_to_chars`` maps it to (possibly several characters,
        since the vocabulary contains merged multi-character tokens).
    """
    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        logits = model.forward(tokens, train=False)
        # NOTE(review): dim=1 assumes logits are (batch, vocab) — confirm in Model.
        probs = F.softmax(logits, dim=1)
    # Draw one index per row; .item() requires that there is exactly one row.
    generated = torch.multinomial(probs, 1).item()
    new_char = tokenizer.tokens_to_chars[generated]
    return generated, new_char

In [28]:
def generate_text(text, num):
    """Continue ``text`` by sampling ``num`` tokens from the model and print the result.

    The prompt is encoded and fitted to exactly ``SEQUENCE_LENGTH`` tokens, then
    used as a sliding context window: every sampled token is written into the
    rightmost slot while the oldest token falls off the left.

    Args:
        text: prompt string to continue.
        num: number of tokens to sample (each may decode to several characters).

    Returns:
        None. The prompt followed by the generated continuation is printed.
    """
    tokens = tokenizer.encode(text)
    if len(tokens) > SEQUENCE_LENGTH:
        # Keep the LAST SEQUENCE_LENGTH tokens: the continuation is appended to
        # the end of the prompt, so the model must be conditioned on the most
        # recent context rather than on the beginning of the prompt.
        tokens = tokens[-SEQUENCE_LENGTH:]
        print("WARNING: Trunkating")
    # Left-pad short prompts with token 0 (the space character in this vocabulary)
    # so the real prompt tokens sit at the right edge of the window.
    tokens = [0] * (SEQUENCE_LENGTH - len(tokens)) + tokens
    tokens = torch.tensor(tokens).reshape(1, -1)
    for _ in range(num):
        token, new_char = generate_char(tokens)
        text += new_char
        # Slide the window one step left; torch.roll wraps the oldest token
        # around to the last position, where the new token overwrites it.
        tokens = torch.roll(tokens, -1, 1)
        tokens[0, -1] = token

    print(text)

In [17]:
# Scratch check of the left-padding idiom used in generate_text: prepend a
# filler value until the list reaches the target length (here 5).
a = [1, 2, 3]
a = [5] * (5 - len(a)) + a
a

[5, 5, 1, 2, 3]

In [30]:
# Continue the prompt with 50 sampled tokens. Sampling goes through
# torch.multinomial and no seed is set in the visible cells, so the printed
# text differs from run to run.
generate_text("the stock went down after it", 50)

46
50
53
60
66
70
76
86
114
119
129
140
159
221
232
260
296
422
464
644
875
the stock went down after ity 
 mr. steice said he is n't chance it to the national neti even great and make apard the search face to favor court  the programs which lynon there are so  that does n't get any 


In [14]:
import numpy as np

# Read the array stored at ../data/X_test into `seq`.
# NOTE(review): presumably the test-split input sequences — confirm against the data-prep code.
with open("../data/X_test", "rb") as fh:
    seq = np.load(fh)

In [32]:
# Display the full mapping from strings (single characters and merged
# multi-character pieces) to token ids.
tokenizer.chars_to_tokens

{' ': 0,
 'a': 1,
 'e': 2,
 'r': 3,
 'b': 4,
 'n': 5,
 'k': 6,
 'o': 7,
 't': 8,
 'l': 9,
 'i': 10,
 'z': 11,
 'c': 12,
 'w': 13,
 'y': 14,
 'u': 15,
 's': 16,
 'f': 17,
 'm': 18,
 'g': 19,
 'h': 20,
 'd': 21,
 '-': 22,
 'q': 23,
 'p': 24,
 'x': 25,
 '\n': 26,
 'j': 27,
 'v': 28,
 '.': 29,
 "'": 30,
 '1': 31,
 '9': 32,
 '5': 33,
 '0': 34,
 '&': 35,
 'V': 36,
 '3': 37,
 '2': 38,
 '$': 39,
 '4': 40,
 '8': 41,
 '6': 42,
 '7': 43,
 '#': 44,
 '|': 45,
 'e ': 46,
 's ': 47,
 ' t': 48,
 'in': 49,
 't ': 50,
 ' th': 51,
 'd ': 52,
 'er': 53,
 'an': 54,
 'on': 55,
 ' the ': 56,
 'or': 57,
 'y ': 58,
 'ar': 59,
 'en': 60,
 'al': 61,
 'o ': 62,
 '\n ': 63,
 're': 64,
 'of': 65,
 'th': 66,
 'ti': 67,
 'a ': 68,
 'ing': 69,
 'st': 70,
 'ou': 71,
 'on ': 72,
 'es ': 73,
 'om': 74,
 'ed ': 75,
 'er ': 76,
 'il': 77,
 'at': 78,
 'ing ': 79,
 'of ': 80,
 'ic': 81,
 'and ': 82,
 'in ': 83,
 'li': 84,
 ' to ': 85,
 'it': 86,
 'ro': 87,
 'as ': 88,
 'ch': 89,
 '. ': 90,
 'for': 91,
 'al ': 92,
 'ec': 93,
