<a href="https://colab.research.google.com/github/BDH-teacher/Deep_Learning_Audit_code/blob/main/Weight_Initialization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
np.random.seed(0)

# Weight Initialization
class SimpleNN:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights and biases
        self.W1 = np.random.rand(input_size, hidden_size) * 0.01  # Input to hidden weights
        self.b1 = np.zeros((1, hidden_size))  # Hidden layer biases
        self.W2 = np.random.rand(hidden_size, output_size) * 0.01  # Hidden to output weights
        self.b2 = np.zeros((1, output_size))  # Output layer biases

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, z):
        return z * (1 - z)


In [2]:
# Up until now...
def preprocess_text(text: str):
    # 아주 간단한 전처리: 소문자 + 알파벳/공백만 남기기
    text = text.lower()
    text = ''.join(ch if (ch.isalnum() or ch.isspace()) else ' ' for ch in text)
    tokens = [t for t in text.split() if t]
    return tokens

def create_vocabulary(tokenized_sentences):
    # <pad>를 0으로 두면, 뒤에서 padding 처리/마스킹이 편함
    vocab = ["<pad>", "<unk>"]
    words = sorted({w for sent in tokenized_sentences for w in sent})
    vocab += words
    word_to_index = {w: i for i, w in enumerate(vocab)}
    return vocab, word_to_index

def encode_text(tokenized_sentences, word_to_index):
    max_len = max(len(s) for s in tokenized_sentences)
    X = []
    for sent in tokenized_sentences:
        ids = [word_to_index.get(w, word_to_index["<unk>"]) for w in sent]
        # pad
        ids = ids + [word_to_index["<pad>"]] * (max_len - len(ids))
        X.append(np.array(ids, dtype=np.int64))
    return X

# Sample dataset
sentences = [
    "I love programming",
    "Python is a great language",
    "I enjoy learning new technologies",
    "I hate bugs in code",
    "Debugging is fun",
    "I dislike syntax errors",
]

# Corresponding labels (1 for positive sentiment, 0 for negative)
labels = np.array([[1], [1], [1], [0], [1], [0]])

# Preprocess the text
sentences = [preprocess_text(sentence) for sentence in sentences]

# Create vocabulary and mappings
vocabulary, word_to_index = create_vocabulary(sentences)

# Encode sentences into input IDs and convert into array of input_ids
X = encode_text(sentences, word_to_index)
X = np.array([np.array(xi) for xi in X])

print("X.shape:", X.shape, "| labels.shape:", labels.shape)
print("vocab_size:", len(vocabulary), "| example vocab:", vocabulary[:10])

X.shape: (6, 5) | labels.shape: (6, 1)
vocab_size: 23 | example vocab: ['<pad>', '<unk>', 'a', 'bugs', 'code', 'debugging', 'dislike', 'enjoy', 'errors', 'fun']


In [3]:
# Initialize Embedding Weight / from scratch

class SimpleNN:
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        # Initialize weights and biases
        self.embedding_weights = np.random.rand(vocab_size, embedding_dim) * 0.01  # Embedding weights
        self.W1 = np.random.rand(embedding_dim, hidden_size) * 0.01  # Hidden layer weights
        self.b1 = np.zeros((1, hidden_size))  # Hidden layer biases
        self.W2 = np.random.rand(hidden_size, output_size) * 0.01  # Output layer weights
        self.b2 = np.zeros((1, output_size))  # Output layer biases

        # 패딩 id는 <pad>=0으로 가정 (위 vocab 생성이 그렇게 만들어둠)
        self.pad_id = 0

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, z):
        return z * (1 - z)

    def forward(self, X):
        # Forward pass
        # ...
        # return prediction

        # X: (batch, seq_len) token ids
        emb = self.embedding_weights[X]  # (batch, seq_len, embedding_dim)

        mask = (X != self.pad_id).astype(np.float32)  # (batch, seq_len)
        counts = mask.sum(axis=1, keepdims=True)      # (batch, 1)
        counts[counts == 0] = 1.0

        # masked mean pooling
        x_emb = (emb * mask[..., None]).sum(axis=1) / counts  # (batch, embedding_dim)

        z1 = x_emb.dot(self.W1) + self.b1
        h1 = self.sigmoid(z1)

        z2 = h1.dot(self.W2) + self.b2
        prediction = self.sigmoid(z2)  # (batch, 1) 확률

        # backward에서 쓰려고 캐시
        self._cache = {
            "X": X,
            "mask": mask,
            "counts": counts,
            "x_emb": x_emb,
            "h1": h1,
            "prediction": prediction,
        }
        return prediction

    def compute_loss(self, y, output):
        # Compute binary cross-entropy loss
        m = y.shape[0]  # Number of samples
        return -np.sum(y * np.log(output + 1e-15) + (1 - y) * np.log(1 - output + 1e-15)) / m

    def backward(self, X, y, output, learning_rate=0.01):
        #Back Propagation (1. Calculate gradient 2. Update weights and bias using gradient descent)
        # ...

        m = y.shape[0]
        cache = self._cache
        x_emb = cache["x_emb"]      # (m, embedding_dim)
        h1 = cache["h1"]            # (m, hidden)
        mask = cache["mask"]        # (m, seq_len)
        counts = cache["counts"]    # (m, 1)

        # BCE + sigmoid output이면 dL/dz2 = (pred - y)/m
        dz2 = (output - y) / m                 # (m, 1)
        dW2 = h1.T.dot(dz2)                    # (hidden, 1)
        db2 = dz2.sum(axis=0, keepdims=True)   # (1, 1)

        dh1 = dz2.dot(self.W2.T)               # (m, hidden)
        dz1 = dh1 * self.sigmoid_derivative(h1)  # (m, hidden)

        dW1 = x_emb.T.dot(dz1)                 # (emb_dim, hidden)
        db1 = dz1.sum(axis=0, keepdims=True)   # (1, hidden)

        dx_emb = dz1.dot(self.W1.T)            # (m, emb_dim)

        # embedding gradient: mean pooling을 각 토큰에 분배
        grad_embedding = np.zeros_like(self.embedding_weights)
        seq_len = X.shape[1]
        for i in range(m):
            denom = counts[i, 0]
            for j in range(seq_len):
                tid = X[i, j]
                if tid == self.pad_id:
                    continue
                grad_embedding[tid] += dx_emb[i] / denom

        # gradient descent update
        self.embedding_weights -= learning_rate * grad_embedding
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

In [4]:
# Up until now... (SimpleNN 인스턴스 생성)

# Initialize the neural network
vocab_size = len(vocabulary)  # Number of unique words
embedding_dim = 5  # Embedding dimension
hidden_size = 5  # Number of neurons in the hidden layer
output_size = 1  # One output (binary classification)

nn = SimpleNN(vocab_size, embedding_dim, hidden_size, output_size)


# What's next? Write Train & Test APIs!
def train(X, y, epochs=1000, learning_rate=0.01):
    for epoch in range(epochs):
        output = nn.forward(X)  # Forward pass
        loss = nn.compute_loss(y, output)  # Compute loss
        nn.backward(X, y, output, learning_rate)  # Backward pass

        if epoch % 100 == 0:  # Print loss every 100 epochs
            print(f'Epoch {epoch}, Loss: {loss}')

def predict(X):
    output = nn.forward(X)
    return (output > 0.5).astype(int)  # Binary classification

train(X,labels)
predictions = predict(X)
print(predictions)
print(labels)

Epoch 0, Loss: 0.6903513296925704
Epoch 100, Loss: 0.6545826684507242
Epoch 200, Loss: 0.642744689530779
Epoch 300, Loss: 0.6387065442840889
Epoch 400, Loss: 0.637295732687321
Epoch 500, Loss: 0.6367948569386922
Epoch 600, Loss: 0.6366151983723816
Epoch 700, Loss: 0.63655033053068
Epoch 800, Loss: 0.6365267987304426
Epoch 900, Loss: 0.636518221875147
[[1]
 [1]
 [1]
 [1]
 [1]
 [1]]
[[1]
 [1]
 [1]
 [0]
 [1]
 [0]]


In [5]:
# PyTorch RNN + DataLoader + Train/Test API

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

torch.manual_seed(0)
vocab_to_int = word_to_index


# Code Implementation: Define a RNN using Pytorch
# Define RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNNModel, self).__init__()
        # Define Embedding
        self.embedding = nn.Embedding(len(vocab_to_int), input_size, padding_idx=vocab_to_int['<pad>'])

        # Define the RNN layer
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

        # Define the fully connected layer to produce outputs
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedding = self.embedding(x)

        # Forward propagate the RNN
        out, hidden = self.rnn(embedding)

        # Take the output from the last time step
        out = self.fc(out[:, -1, :])
        return out

# Dataset / split
class TextDataset(Dataset):
    def __init__(self, texts_tensor, labels_tensor):
        self.texts = texts_tensor
        self.labels = labels_tensor

    def __len__(self):
        return self.texts.size(0)

    def __getitem__(self, idx):
        input_ids = self.texts[idx]
        attention_masks = (input_ids != vocab_to_int["<pad>"]).long()
        segment_ids = torch.zeros_like(input_ids)
        labels = self.labels[idx]
        return input_ids, attention_masks, segment_ids, labels


# numpy X/labels -> torch
X_torch = torch.tensor(X, dtype=torch.long)
y_torch = torch.tensor(labels.squeeze(1), dtype=torch.long)  # (N,) with 0/1

# train/dev/test
train_texts, train_labels = X_torch[:4], y_torch[:4]
dev_texts, dev_labels     = X_torch[4:5], y_torch[4:5]
test_texts, test_labels   = X_torch[5:],  y_torch[5:]

In [6]:
# Data Preparation & Create a Model Instance using Pytorch

train_dataset = TextDataset(train_texts, train_labels)
dev_dataset = TextDataset(dev_texts, dev_labels)
test_dataset = TextDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, 8, shuffle=True)
dev_loader = DataLoader(dev_dataset, 8, shuffle=True)
test_loader = DataLoader(test_dataset, 8, shuffle=True)

# Define a loss
criterion = nn.CrossEntropyLoss()

# Example usage
input_size = 10  # Number of input features per time step
hidden_size = 20  # Number of features in the hidden state
num_layers = 2  # Number of RNN layers (stacked)
output_size = 2  # Number of output classes

# Create RNN model
model = RNNModel(input_size, hidden_size, num_layers, output_size)

In [7]:
# Train & Test API using Pytorch (Example)
max_grad_norm = 1.0

# Train API
def train(loader):
    model.train()
    optimizer = SGD(model.parameters(), lr=0.1)

    train_loss = 0.0
    for data in loader:
        input_ids, attention_masks, segment_ids, labels = data

        optimizer.zero_grad()  # (실행 위해 최소 보완: gradient 누적 방지)

        out = model(input_ids)
        loss = criterion(out, labels)
        loss.backward()
        train_loss += loss.item()
        if max_grad_norm > 0.:
            nn.utils.clip_grad_norm_(model.parameters(),max_grad_norm)

        optimizer.step()
    return train_loss/len(train_loader)

# Test API
def evaluate(loader):
    model.eval()
    eval_loss = 0.
    eval_acc = 0.
    y_true, y_pred = [], []
    loader = tqdm(loader, leave=False)

    for i, data in enumerate(loader):
        with torch.no_grad():
            input_ids, attention_masks, segment_ids, labels = data
            output = model(input_ids)
            for j in range(output.size(0)):
                y_pred.append(output[j].argmax().item())
                y_true.append(labels[j].item())

            loss = criterion(output,labels)
            eval_loss += loss.item()
            loader.set_description(f'eval loss = {(eval_loss / (i+1)):.6f}')
    eval_acc = accuracy_score(y_true, y_pred) * 100.
    return eval_loss / len(loader), eval_acc

for epoch in range(100):
    tr_loss = train(train_loader)
    te_loss, te_acc = evaluate(test_loader)
    print(f"[PyTorch RNN] epoch={epoch:02d} train_loss={tr_loss:.4f} | test_loss={te_loss:.4f} test_acc={te_acc:.1f}%")

  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=00 train_loss=0.5987 | test_loss=0.8547 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=01 train_loss=0.5570 | test_loss=0.8862 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=02 train_loss=0.5198 | test_loss=0.9149 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=03 train_loss=0.4850 | test_loss=0.9414 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=04 train_loss=0.4515 | test_loss=0.9659 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=05 train_loss=0.4186 | test_loss=0.9885 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=06 train_loss=0.3862 | test_loss=1.0090 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=07 train_loss=0.3543 | test_loss=1.0275 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=08 train_loss=0.3232 | test_loss=1.0439 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=09 train_loss=0.2931 | test_loss=1.0583 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=10 train_loss=0.2645 | test_loss=1.0707 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=11 train_loss=0.2377 | test_loss=1.0815 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=12 train_loss=0.2130 | test_loss=1.0907 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=13 train_loss=0.1905 | test_loss=1.0986 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=14 train_loss=0.1703 | test_loss=1.1055 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=15 train_loss=0.1524 | test_loss=1.1113 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=16 train_loss=0.1366 | test_loss=1.1164 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=17 train_loss=0.1228 | test_loss=1.1208 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=18 train_loss=0.1107 | test_loss=1.1245 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=19 train_loss=0.1002 | test_loss=1.1276 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=20 train_loss=0.0910 | test_loss=1.1302 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=21 train_loss=0.0829 | test_loss=1.1324 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=22 train_loss=0.0759 | test_loss=1.1341 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=23 train_loss=0.0697 | test_loss=1.1355 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=24 train_loss=0.0643 | test_loss=1.1365 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=25 train_loss=0.0594 | test_loss=1.1372 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=26 train_loss=0.0552 | test_loss=1.1376 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=27 train_loss=0.0514 | test_loss=1.1377 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=28 train_loss=0.0479 | test_loss=1.1377 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=29 train_loss=0.0449 | test_loss=1.1375 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=30 train_loss=0.0421 | test_loss=1.1370 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=31 train_loss=0.0397 | test_loss=1.1365 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=32 train_loss=0.0374 | test_loss=1.1358 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=33 train_loss=0.0354 | test_loss=1.1349 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=34 train_loss=0.0335 | test_loss=1.1340 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=35 train_loss=0.0318 | test_loss=1.1330 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=36 train_loss=0.0302 | test_loss=1.1319 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=37 train_loss=0.0288 | test_loss=1.1307 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=38 train_loss=0.0275 | test_loss=1.1295 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=39 train_loss=0.0263 | test_loss=1.1282 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=40 train_loss=0.0251 | test_loss=1.1268 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=41 train_loss=0.0241 | test_loss=1.1254 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=42 train_loss=0.0231 | test_loss=1.1240 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=43 train_loss=0.0222 | test_loss=1.1226 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=44 train_loss=0.0213 | test_loss=1.1211 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=45 train_loss=0.0205 | test_loss=1.1196 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=46 train_loss=0.0198 | test_loss=1.1180 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=47 train_loss=0.0191 | test_loss=1.1165 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=48 train_loss=0.0184 | test_loss=1.1149 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=49 train_loss=0.0178 | test_loss=1.1134 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=50 train_loss=0.0172 | test_loss=1.1118 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=51 train_loss=0.0167 | test_loss=1.1102 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=52 train_loss=0.0162 | test_loss=1.1086 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=53 train_loss=0.0157 | test_loss=1.1070 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=54 train_loss=0.0152 | test_loss=1.1054 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=55 train_loss=0.0148 | test_loss=1.1038 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=56 train_loss=0.0144 | test_loss=1.1022 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=57 train_loss=0.0140 | test_loss=1.1006 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=58 train_loss=0.0136 | test_loss=1.0990 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=59 train_loss=0.0132 | test_loss=1.0974 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=60 train_loss=0.0129 | test_loss=1.0959 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=61 train_loss=0.0126 | test_loss=1.0943 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=62 train_loss=0.0122 | test_loss=1.0927 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=63 train_loss=0.0119 | test_loss=1.0911 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=64 train_loss=0.0117 | test_loss=1.0896 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=65 train_loss=0.0114 | test_loss=1.0880 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=66 train_loss=0.0111 | test_loss=1.0865 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=67 train_loss=0.0109 | test_loss=1.0849 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=68 train_loss=0.0106 | test_loss=1.0834 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=69 train_loss=0.0104 | test_loss=1.0819 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=70 train_loss=0.0102 | test_loss=1.0803 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=71 train_loss=0.0099 | test_loss=1.0788 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=72 train_loss=0.0097 | test_loss=1.0773 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=73 train_loss=0.0095 | test_loss=1.0758 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=74 train_loss=0.0094 | test_loss=1.0744 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=75 train_loss=0.0092 | test_loss=1.0729 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=76 train_loss=0.0090 | test_loss=1.0714 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=77 train_loss=0.0088 | test_loss=1.0700 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=78 train_loss=0.0086 | test_loss=1.0685 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=79 train_loss=0.0085 | test_loss=1.0671 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=80 train_loss=0.0083 | test_loss=1.0656 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=81 train_loss=0.0082 | test_loss=1.0642 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=82 train_loss=0.0080 | test_loss=1.0628 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=83 train_loss=0.0079 | test_loss=1.0614 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=84 train_loss=0.0078 | test_loss=1.0600 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=85 train_loss=0.0076 | test_loss=1.0586 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=86 train_loss=0.0075 | test_loss=1.0573 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=87 train_loss=0.0074 | test_loss=1.0559 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=88 train_loss=0.0073 | test_loss=1.0545 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=89 train_loss=0.0071 | test_loss=1.0532 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=90 train_loss=0.0070 | test_loss=1.0519 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=91 train_loss=0.0069 | test_loss=1.0505 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=92 train_loss=0.0068 | test_loss=1.0492 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=93 train_loss=0.0067 | test_loss=1.0479 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=94 train_loss=0.0066 | test_loss=1.0466 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=95 train_loss=0.0065 | test_loss=1.0453 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=96 train_loss=0.0064 | test_loss=1.0440 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=97 train_loss=0.0063 | test_loss=1.0427 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=98 train_loss=0.0062 | test_loss=1.0415 test_acc=0.0%


  0%|          | 0/1 [00:00<?, ?it/s]

[PyTorch RNN] epoch=99 train_loss=0.0061 | test_loss=1.0402 test_acc=0.0%
