In [1]:
%matplotlib widget
import numpy as np
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import random

import matplotlib.pyplot as plt

import re

from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook

from joblib import Parallel, delayed
import multiprocessing
from datetime import datetime

In [2]:
lstm = nn.LSTM(5, 5)

In [3]:
ins = [torch.Tensor([[[1, 0, 0, 0, 0]]]),
       torch.Tensor([[[0, 1, 0, 0, 0]]]),
       torch.Tensor([[[0, 0, 1, 0, 0]]]),
       torch.Tensor([[[0, 0, 1, 0, 0]]]),
       torch.Tensor([[[0, 0, 0, 1, 0]]])]

In [4]:
hiddens = (torch.rand(1, 1,5), torch.rand(1, 1,5))

In [5]:
for i in ins:
    y_, hid_ = lstm(i, hiddens)
    print(y_)

tensor([[[ 0.0786, -0.0476,  0.0968,  0.0770,  0.2617]]],
       grad_fn=<StackBackward>)
tensor([[[ 0.0470, -0.0168,  0.0808,  0.1874,  0.1915]]],
       grad_fn=<StackBackward>)
tensor([[[0.0424, 0.0274, 0.0344, 0.2388, 0.1601]]], grad_fn=<StackBackward>)
tensor([[[0.0424, 0.0274, 0.0344, 0.2388, 0.1601]]], grad_fn=<StackBackward>)
tensor([[[0.0575, 0.0530, 0.1741, 0.2025, 0.2021]]], grad_fn=<StackBackward>)


In [6]:
class StackedLSTM(nn.Module):
    def __init__(self):
        super(StackedLSTM, self).__init__()
        
        self.LSTM11 = nn.LSTM(5, 5)
        self.LSTM12 = nn.LSTM(5, 5)
        self.LSTM13 = nn.LSTM(5, 5)
        self.LSTM14 = nn.LSTM(5, 5)
        self.LSTM15 = nn.LSTM(5, 5)
        
        self.LSTM21 = nn.LSTM(5, 5)
        self.LSTM22 = nn.LSTM(5, 5)
        self.LSTM23 = nn.LSTM(5, 5)
        self.LSTM24 = nn.LSTM(5, 5)
        self.LSTM25 = nn.LSTM(5, 5)
        
        self.lin1 = nn.Linear(5, 5)
        self.lin2 = nn.Linear(5, 5)
        self.lin3 = nn.Linear(5, 5)
        self.lin4 = nn.Linear(5, 5)
        self.lin5 = nn.Linear(5, 5)
        
    def forward(self, x1, x2, x3, x4, x5, hidden):
        # pierwsza warstwa LSTM
        y11_, h11_ = self.LSTM11(x1, hidden)
        y12_, h12_ = self.LSTM12(x2, h11_)
        y13_, h13_ = self.LSTM13(x3, h12_)
        y14_, h14_ = self.LSTM14(x4, h13_)
        y15_, h15_ = self.LSTM15(x5, h14_)
        
        
        # residuals
        y11 = y11_ * x1
        y12 = y12_ * x2
        y13 = y13_ * x3
        y14 = y14_ * x4
        y15 = y15_ * x5
        
        
        # druga warstwa LSTM
        y21_, h21_ = self.LSTM21(y11, h15_)
        y22_, h22_ = self.LSTM22(y12, h21_)
        y23_, h23_ = self.LSTM23(y13, h22_)
        y24_, h24_ = self.LSTM24(y14, h23_)
        y25_, h25_ = self.LSTM25(y15, h24_)
        
        
        # residuals
        y21 = y21_ * y11 * x1
        y22 = y22_ * y12 * x2
        y23 = y23_ * y13 * x3
        y24 = y24_ * y14 * x4
        y25 = y25_ * y15 * x5
        
        
        # linear
        y31 = self.lin1(y21)
        y32 = self.lin2(y22)
        y33 = self.lin3(y23)
        y34 = self.lin4(y24)
        y35 = self.lin5(y25)
        
        
        # zwrot
        return torch.cat((y31, y32, y33, y34, y35)).reshape(5, -1) 

In [7]:
slstm = StackedLSTM()

In [8]:
hidden = (torch.rand((1, 1, 5)), torch.rand((1, 1, 5)))

In [9]:
x1 = torch.Tensor([[[1, 0, 0, 0, 0]]]) #h
x2 = torch.Tensor([[[0, 1, 0, 0, 0]]]) #e
x3 = torch.Tensor([[[0, 0, 1, 0, 0]]]) #l
x4 = torch.Tensor([[[0, 0, 1, 0, 0]]]) #l
x5 = torch.Tensor([[[0, 0, 0, 1, 0]]]) #o

x0 = torch.Tensor([[[0, 0, 0, 0, 0]]])

In [10]:
y = torch.Tensor([1, 2, 0, 0, 0]).long()

In [11]:
out_ = slstm(x1, x2, x3, x4, x5, hidden); out_

tensor([[-0.3294,  0.2955, -0.1982, -0.3178,  0.0461],
        [ 0.2792, -0.4267,  0.3806, -0.4317, -0.2708],
        [ 0.0957,  0.0579,  0.0309,  0.0004,  0.1527],
        [-0.4397,  0.3471,  0.2232, -0.1855, -0.1130],
        [-0.3142, -0.4397, -0.2017,  0.2163,  0.2909]],
       grad_fn=<AsStridedBackward>)

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(slstm.parameters(), lr=1e-3)

In [13]:
criterion(out_, y)

tensor(1.5913, grad_fn=<NllLossBackward>)

In [14]:
for epoch in range(1200):
    slstm.train()
    optimizer.zero_grad()
    
    out_ = slstm(x1, x2, x3, x4, x5, hidden)
    loss = criterion(out_, y)
    
    loss.backward()
    optimizer.step()
    
    if epoch%100==0:
        print(loss)

tensor(1.5913, grad_fn=<NllLossBackward>)
tensor(1.3932, grad_fn=<NllLossBackward>)
tensor(0.8944, grad_fn=<NllLossBackward>)
tensor(0.6207, grad_fn=<NllLossBackward>)
tensor(0.4611, grad_fn=<NllLossBackward>)
tensor(0.3558, grad_fn=<NllLossBackward>)
tensor(0.2821, grad_fn=<NllLossBackward>)
tensor(0.2287, grad_fn=<NllLossBackward>)
tensor(0.1888, grad_fn=<NllLossBackward>)
tensor(0.1583, grad_fn=<NllLossBackward>)
tensor(0.1346, grad_fn=<NllLossBackward>)
tensor(0.1157, grad_fn=<NllLossBackward>)


In [15]:
with torch.no_grad():
    y_ = slstm(x1, x2, x3, x4, x0, hidden)
    
    print(y_)
    
    print(y_.argmax(1))

tensor([[-1.9137,  1.9453, -2.0253, -1.8936, -1.7205],
        [-1.4611, -2.1103,  2.2152, -1.7508, -1.8543],
        [ 1.8120, -1.9558, -1.6842, -1.8843, -1.9513],
        [ 1.4069, -1.7909, -1.8340, -2.0481, -2.2379],
        [ 0.3938, -1.1962, -0.9639, -0.3831, -0.4286]])
tensor([1, 2, 0, 0, 0])


In [16]:
hidden2 = (torch.randn((1, 1, 5)), torch.randn((1, 1, 5)))

with torch.no_grad():
    y_ = slstm(x1, x2, x3, x4, x0, hidden2)
    
    print(y_)
    
    print(y_.argmax(1))

tensor([[-1.0497,  0.9904, -0.8656, -1.0399, -0.6346],
        [-0.7189, -1.4146,  1.4355, -1.3521, -1.2406],
        [ 1.0082, -0.9056, -0.9234, -0.9597, -0.8126],
        [ 1.0191, -1.2957, -1.3737, -1.6707, -1.7484],
        [ 0.3938, -1.1962, -0.9639, -0.3831, -0.4286]])
tensor([1, 2, 0, 0, 0])


## Real life

In [17]:
polskie = open("../100k.txt", encoding='utf8')
slowa = [slowo.replace("\n", "") for slowo in polskie.readlines()]
polskie.close()
print(slowa[:2])
print(len(slowa))
slowa = slowa[:5000]

['abmicro', '527579']
100000


In [18]:
chartoidx = {}


cnt = 0

longestword = 0

for slowo in slowa:
    if len(slowo) > longestword:
        longestword = len(slowo)
    
    
    for litera in slowo:
        if litera not in list(chartoidx.keys()):
            chartoidx[litera] = cnt
            cnt = cnt + 1
            
            
chartoidx["<END>"] = cnt

In [19]:
chartoidx

{'a': 0,
 'b': 1,
 'm': 2,
 'i': 3,
 'c': 4,
 'r': 5,
 'o': 6,
 '5': 7,
 '2': 8,
 '7': 9,
 '9': 10,
 'g': 11,
 '0': 12,
 '1': 13,
 'n': 14,
 'k': 15,
 'h': 16,
 'd': 17,
 'u': 18,
 't': 19,
 '8': 20,
 'p': 21,
 '3': 22,
 'l': 23,
 's': 24,
 'z': 25,
 '4': 26,
 'e': 27,
 'w': 28,
 'x': 29,
 'Z': 30,
 '6': 31,
 'C': 32,
 'y': 33,
 'j': 34,
 'f': 35,
 'v': 36,
 'F': 37,
 'P': 38,
 'R': 39,
 'O': 40,
 'T': 41,
 'A': 42,
 'X': 43,
 'L': 44,
 'E': 45,
 'S': 46,
 'I': 47,
 'K': 48,
 'N': 49,
 'D': 50,
 'M': 51,
 'q': 52,
 'H': 53,
 'U': 54,
 'J': 55,
 'Q': 56,
 'G': 57,
 'W': 58,
 'B': 59,
 'V': 60,
 'Y': 61,
 '@': 62,
 '.': 63,
 '#': 64,
 '<END>': 65}

In [20]:
slowatranslated = np.zeros((len(slowa), longestword, len(list(chartoidx.keys())))); slowatranslated.shape

(5000, 15, 66)

In [21]:
outputs = np.zeros((len(slowa), longestword, 1)); outputs.shape

(5000, 15, 1)

In [22]:
word_dimensions = len(list(chartoidx.keys()))
words_dimension = len(slowa)

print("Ilość komórek w warstwie: ", longestword)

print("word_dimensions", word_dimensions)
print("words_dimension", words_dimension)

Ilość komórek w warstwie:  15
word_dimensions 66
words_dimension 5000


In [23]:
for cnt, slowo in enumerate(slowa):
    for cn2, litera in enumerate(slowo):
        slowatranslated[cnt, cn2, chartoidx[litera]] = 1
#         print("ins", litera)
        if cn2+1 < len(slowo):
            outputs[cnt, cn2] = chartoidx[slowo[cn2+1]]
#             print("outs", slowo[cn2+1])
    outputs[cnt, cn2] = chartoidx["<END>"]
#     print("\n\n")

outputs = outputs.reshape(len(slowa), longestword)

In [24]:
print(slowatranslated[0:2])

[[[1. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [25]:
print(outputs)

[[ 1.  2.  3. ...  0.  0.  0.]
 [ 8.  9.  7. ...  0.  0.  0.]
 [11.  5.  6. ...  0.  0.  0.]
 ...
 [ 0. 35.  3. ...  0.  0.  0.]
 [12. 12. 10. ...  0.  0.  0.]
 [18.  4. 34. ...  0.  0.  0.]]


In [29]:
class CharacterLSTM(nn.Module):
    def __init__(self, word_dimensions, words_dimension):
        super(CharacterLSTM, self).__init__()
        
        self.word_dimensions = word_dimensions
        self.words_dimension = words_dimension
        
        self.LSTM_11 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_12 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_13 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_14 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_15 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_16 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_17 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_18 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_19 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_110 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_111 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_112 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_113 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_114 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_115 = nn.LSTM(word_dimensions, word_dimensions)
        
        
        self.LSTM_21 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_22 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_23 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_24 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_25 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_26 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_27 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_28 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_28 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_210 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_211 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_212 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_213 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_214 = nn.LSTM(word_dimensions, word_dimensions)
        self.LSTM_215 = nn.LSTM(word_dimensions, word_dimensions)
        
        
        self.drop = nn.Dropout(p=0.01)
        self.norm = nn.LayerNorm(word_dimensions)
        
        
        self.lin_1 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_2 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_3 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_4 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_5 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_6 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_7 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_8 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_9 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_10 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_11 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_12 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_13 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_14 = nn.Linear(word_dimensions, word_dimensions)
        self.lin_15 = nn.Linear(word_dimensions, word_dimensions)
        
    def forward(self, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, hidden):
        # pierwsza warstwa LSTM
        y11_, h11_ = self.LSTM_11(x1, hidden)
        y12_, h12_ = self.LSTM_12(x2, h11_)
        y13_, h13_ = self.LSTM_13(x3, h12_)
        y14_, h14_ = self.LSTM_14(x4, h13_)
        y15_, h15_ = self.LSTM_15(x5, h14_)
        y16_, h16_ = self.LSTM_16(x6, h15_)
        y17_, h17_ = self.LSTM_17(x7, h16_)
        y18_, h18_ = self.LSTM_18(x8, h17_)
        y19_, h19_ = self.LSTM_19(x9, h18_)
        y110_, h110_ = self.LSTM_110(x10, h19_)
        y111_, h111_ = self.LSTM_111(x11, h110_)
        y112_, h112_ = self.LSTM_112(x12, h111_)
        y113_, h113_ = self.LSTM_113(x13, h112_)
        y114_, h114_ = self.LSTM_114(x14, h113_)
        y115_, h115_ = self.LSTM_115(x15, h114_)
        
        
        # residuals
        y11 = y11_ * x1
        y12 = y12_ * x2
        y13 = y13_ * x3
        y14 = y14_ * x4
        y15 = y15_ * x5
        y16 = y16_ * x6
        y17 = y17_ * x7
        y18 = y18_ * x8
        y19 = y19_ * x9
        y110 = y110_ * x10
        y111 = y111_ * x11
        y112 = y112_ * x12
        y113 = y113_ * x13
        y114 = y114_ * x14
        y115 = y115_ * x15
        
        
        # druga warstwa LSTM
        y21_, h21_ = self.LSTM_21(y11, h115_)
        y22_, h22_ = self.LSTM_22(y12, h21_)
        y23_, h23_ = self.LSTM_23(y13, h22_)
        y24_, h24_ = self.LSTM_24(y14, h23_)
        y25_, h25_ = self.LSTM_25(y15, h24_)
        y26_, h26_ = self.LSTM_26(y16, h25_)
        y27_, h27_ = self.LSTM_27(y17, h26_)
        y28_, h28_ = self.LSTM_28(y18, h27_)
        y29_, h29_ = self.LSTM_28(y19, h28_)
        y210_, h210_ = self.LSTM_210(y110, h28_)
        y211_, h211_ = self.LSTM_211(y111, h210_)
        y212_, h212_ = self.LSTM_212(y112, h211_)
        y213_, h213_ = self.LSTM_213(y113, h212_)
        y214_, h214_ = self.LSTM_214(y114, h213_)
        y215_, h215_ = self.LSTM_215(y115, h214_)
        
        
        # residuals
        y21 = self.norm(F.relu(self.drop(y21_ * y11 * x1)))
        y22 = self.norm(F.relu(self.drop(y22_ * y12 * x2)))
        y23 = self.norm(F.relu(self.drop(y23_ * y13 * x3)))
        y24 = self.norm(F.relu(self.drop(y24_ * y14 * x4)))
        y25 = self.norm(F.relu(self.drop(y25_ * y15 * x5)))
        y26 = self.norm(F.relu(self.drop(y26_ * y16 * x6)))
        y27 = self.norm(F.relu(self.drop(y27_ * y17 * x7)))
        y28 = self.norm(F.relu(self.drop(y28_ * y18 * x8)))
        y29 = self.norm(F.relu(self.drop(y29_ * y19 * x9)))
        y210 = self.norm(F.relu(self.drop(y210_ * y110 * x10)))
        y211 = self.norm(F.relu(self.drop(y211_ * y111 * x11)))
        y212 = self.norm(F.relu(self.drop(y212_ * y112 * x12)))
        y213 = self.norm(F.relu(self.drop(y213_ * y113 * x13)))
        y214 = self.norm(F.relu(self.drop(y214_ * y114 * x14)))
        y215 = self.norm(F.relu(self.drop(y215_ * y115 * x15)))
        
        
        
        # linear
        y31 = F.relu(self.lin_1(y21))
        y32 = F.relu(self.lin_2(y22))
        y33 = F.relu(self.lin_3(y23))
        y34 = F.relu(self.lin_4(y24))
        y35 = F.relu(self.lin_5(y25))
        y36 = F.relu(self.lin_6(y26))
        y37 = F.relu(self.lin_7(y27))
        y38 = F.relu(self.lin_8(y28))
        y39 = F.relu(self.lin_9(y29))
        y310 = F.relu(self.lin_10(y210))
        y311 = F.relu(self.lin_11(y211))
        y312 = F.relu(self.lin_11(y212))
        y313 = F.relu(self.lin_11(y213))
        y314 = F.relu(self.lin_11(y214))
        y315 = F.relu(self.lin_11(y215))
                
        
        # zwrot
        return torch.cat((y31, y32, y33, y34, y35, y36, y37, y38, y39, y310, y311, y312, y313, y314, y315), 1).reshape(self.words_dimension, self.word_dimensions, -1)

In [30]:
x1 = slowatranslated[:][:][0][:]; x1.shape

(15, 66)

In [31]:
slowatranslated.shape

(5000, 15, 66)

In [32]:
x1 = []
x2 = []
x3 = []
x4 = []
x5 = []
x6 = []
x7 = []
x8 = []
x9 = []
x10 = []
x11 = []
x12 = []
x13 = []
x14 = []
x15 = []

for slowo in slowatranslated:
    x1.append(slowo[0])
    x2.append(slowo[1])
    x3.append(slowo[2])
    x4.append(slowo[3])
    x5.append(slowo[4])
    x6.append(slowo[5])
    x7.append(slowo[6])
    x8.append(slowo[7])
    x9.append(slowo[8])
    x10.append(slowo[9])
    x11.append(slowo[10])
    x12.append(slowo[11])
    x13.append(slowo[12])
    x14.append(slowo[13])
    x15.append(slowo[14])

In [33]:
x1 = torch.Tensor(np.array(x1)).reshape(1, words_dimension , word_dimensions).cuda()
x2 = torch.Tensor(np.array(x2)).reshape(1, words_dimension , word_dimensions).cuda()
x3 = torch.Tensor(np.array(x3)).reshape(1, words_dimension , word_dimensions).cuda()
x4 = torch.Tensor(np.array(x4)).reshape(1, words_dimension , word_dimensions).cuda()
x5 = torch.Tensor(np.array(x5)).reshape(1, words_dimension , word_dimensions).cuda()
x6 = torch.Tensor(np.array(x6)).reshape(1, words_dimension , word_dimensions).cuda()
x7 = torch.Tensor(np.array(x7)).reshape(1, words_dimension , word_dimensions).cuda()
x8 = torch.Tensor(np.array(x8)).reshape(1, words_dimension , word_dimensions).cuda()
x9 = torch.Tensor(np.array(x9)).reshape(1, words_dimension , word_dimensions).cuda()
x10 = torch.Tensor(np.array(x10)).reshape(1, words_dimension , word_dimensions).cuda()
x11 = torch.Tensor(np.array(x11)).reshape(1, words_dimension , word_dimensions).cuda()
x12 = torch.Tensor(np.array(x12)).reshape(1, words_dimension , word_dimensions).cuda()
x13 = torch.Tensor(np.array(x13)).reshape(1, words_dimension , word_dimensions).cuda()
x14 = torch.Tensor(np.array(x14)).reshape(1, words_dimension , word_dimensions).cuda()
x15 = torch.Tensor(np.array(x15)).reshape(1, words_dimension , word_dimensions).cuda()

In [34]:
outputs

array([[ 1.,  2.,  3., ...,  0.,  0.,  0.],
       [ 8.,  9.,  7., ...,  0.,  0.,  0.],
       [11.,  5.,  6., ...,  0.,  0.,  0.],
       ...,
       [ 0., 35.,  3., ...,  0.,  0.,  0.],
       [12., 12., 10., ...,  0.,  0.,  0.],
       [18.,  4., 34., ...,  0.,  0.,  0.]])

In [35]:
Toutputs = torch.Tensor(outputs).long().cuda()

In [36]:
Toutputs.shape

torch.Size([5000, 15])

In [37]:
x1.shape

torch.Size([1, 5000, 66])

In [38]:
x1

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')

In [39]:
ch_lstm = CharacterLSTM(word_dimensions, words_dimension).cuda()

In [40]:
hiddens = (torch.rand((1,words_dimension, word_dimensions)).cuda(), torch.rand((1,words_dimension, word_dimensions)).cuda())

In [41]:
y_= ch_lstm(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, hiddens).cuda(); y_.shape

torch.Size([5000, 66, 15])

In [42]:
optimizer = optim.Adam(ch_lstm.parameters(), lr=3e-3)

In [43]:
criterion = nn.CrossEntropyLoss()

In [44]:
loss = criterion(y_, Toutputs)

In [52]:
for epoch in range(20000):
    ch_lstm.train()
    optimizer.zero_grad()
    
    y_= ch_lstm(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, hiddens).cuda()
    loss = criterion(y_, Toutputs)
    
    loss.backward()
    optimizer.step()
    
    if epoch%200==0:
        print(loss.item())

12.583521842956543
3.378568410873413
3.3206777572631836
3.306133508682251
3.295006275177002
3.292492151260376
3.2873306274414062
3.2825260162353516
3.2824032306671143
3.277313709259033
3.2766642570495605
3.2745065689086914
3.2725167274475098
3.269181966781616
3.26542067527771
3.265711545944214
3.261781692504883
3.2585501670837402
3.253711700439453
3.253967761993408
3.248784303665161
3.242337942123413
3.2358949184417725
3.233449697494507
3.2313876152038574
3.230433702468872
3.23068904876709
3.2308075428009033
3.2313365936279297
3.229114294052124
3.23813533782959
3.22804594039917
3.2294790744781494
3.2265853881835938
3.22883677482605
3.2254722118377686
3.225127935409546
3.227903127670288
3.225874900817871
3.225904703140259
3.2266130447387695
3.2236311435699463
3.2217202186584473
3.218883991241455
3.219916343688965


KeyboardInterrupt: 

In [None]:
with torch.no_grad():
    hiddens = (torch.randn((1,words_dimension, word_dimensions)).cuda(), torch.randn((1,words_dimension, word_dimensions)).cuda())
    y_= ch_lstm(x1, x1, x1, x1, x1, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, hiddens).cuda()
    
    
    for row in y_.argmax(1).detach().cpu().numpy():
        slowo = ""
        for item in row[:100]:
            slowo = slowo + list(chartoidx.keys())[item]
        print(slowo)

# WNIOSKI

Pomysł jest prawie dobry, ale występuje w nim jeden błąd. Raczej nie powinno się robić osobnych sieci liniowych na każdy ze znaków, a bardziej zrobić jedną sieć obejmującą wszystkie wyjścia.