# RNN

In [None]:
import re
from collections import Counter
from pprint import pprint

import numpy as np
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

In [None]:
# Download the plain-text file at this Project Gutenberg URL.
url = "https://www.gutenberg.org/files/100/100-0.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as the corpus.
response.raise_for_status()
shakespeare_text = response.text

# Collapse every run of whitespace (newlines included) into a single space.
shakespeare_text = re.sub(r'\s+', ' ', shakespeare_text)

In [None]:
# Number of characters in the whitespace-normalized text.
len(shakespeare_text)

In [None]:
# Preview the first 1500 characters of the text.
pprint(shakespeare_text[:1500])

In [None]:
# Print every occurrence of the Hamlet title; each match object shows its
# character span within the text.
pattern = re.compile("THE TRAGEDY OF HAMLET, PRINCE OF DENMARK")
for match in pattern.finditer(shakespeare_text):
    print(match)

In [None]:
# Print every occurrence of the Henry IV (Part 1) title; each match object
# shows its character span within the text.
pattern = re.compile("THE FIRST PART OF KING HENRY THE FOURTH")
for match in pattern.finditer(shakespeare_text):
    print(match)

In [None]:
# Cut a fixed character range out of the full text.
# NOTE(review): 927066 and 1105544 look like match positions reported by the
# two title searches in the preceding cells — confirm against their output.
# The offsets are only meaningful for this exact download after the
# whitespace collapsing applied earlier.
hamlet_text = shakespeare_text[927066:1105544]

In [None]:
# Display the extracted text for a visual check.
pprint(hamlet_text)

In [None]:
# Word-level tokenization

# Lower-case the text and take every run of word characters as one token.
words = re.findall(r'\b\w+\b', hamlet_text.lower())
# Word-frequency table; keys are kept in order of first appearance.
vocab = Counter(words)
# Vocabulary lookups in both directions: index -> word and word -> index.
ix_to_word = dict(enumerate(vocab))
word_to_ix = {word: i for i, word in ix_to_word.items()}

# encoded_text: the whole token stream as a list of vocabulary indices.
encoded_text = list(map(word_to_ix.__getitem__, words))

class TextDataset(Dataset):
    """Sliding-window dataset for next-word prediction.

    Item ``idx`` pairs the window ``data[idx : idx + seq_length]`` with the
    same window shifted one position to the right, so the target for every
    input token is the token that follows it.
    """

    def __init__(self, data, seq_length):
        """Store the encoded corpus and the window length.

        Args:
            data: Sequence of integer token indices.
            seq_length: Number of tokens in each input/target window.
        """
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        """Return how many full (input, target) window pairs exist.

        The target window ends one token after the input window, so the last
        usable start index is ``len(data) - seq_length - 1``. The result is
        clamped at 0 because ``__len__`` must not return a negative number.
        """
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair of ``torch.long`` tensors at ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Without this check an out-of-range index would silently yield
                truncated windows, and plain iteration would never terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError("TextDataset index out of range")

        start, stop = idx, idx + self.seq_length
        # Input window: seq_length tokens starting at idx.
        input_seq = self.data[start:stop]
        # Target window: the same span shifted right by one token.
        target_seq = self.data[start + 1:stop + 1]

        return (
            torch.tensor(input_seq, dtype=torch.long),
            torch.tensor(target_seq, dtype=torch.long),
        )

## RNN

In [None]:
class RNNCell(nn.Module):
    """Single-step recurrent cell.

    Computes ``h_t = tanh(W [x_t ; h_{t-1}] + b)`` and an output
    ``o_t = W_o h_t + b_o`` with the same width as the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        """Create the cell's layers.

        Args:
            input_size: Feature width of each input step.
            hidden_size: Width of the hidden state (and of the cell output).
        """
        super(RNNCell, self).__init__()
        self.hidden_size = hidden_size
        # One linear layer over the concatenated [input, hidden] vector is
        # equivalent to separate W_xh and W_hh matrices sharing one bias.
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """Advance the cell by one time step.

        Args:
            input: Tensor of shape (batch_size, 1, input_size).
            hidden: Previous hidden state, (batch_size, 1, hidden_size).

        Returns:
            ``(output, hidden)``, both of shape (batch_size, 1, hidden_size).
        """
        # Concatenate along the feature axis:
        # (batch_size, 1, input_size + hidden_size)
        combined = torch.cat((input, hidden), dim=2)
        # New hidden state: tanh(W_hh * hidden + W_xh * input)
        hidden = self.tanh(self.i2h(combined))
        # Per-step output read out from the new hidden state.
        output = self.h2o(hidden)
        return output, hidden


class RNN(nn.Module):
    """Word-level language model: embedding -> RNNCell unrolled over time -> logits."""

    def __init__(self, input_size, hidden_size, embedding_dim):
        """Create the model.

        Args:
            input_size: Vocabulary size (number of rows in the embedding table
                and number of output logits per step).
            hidden_size: Width of the recurrent hidden state.
            embedding_dim: Width of each word embedding.
        """
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn_cell = RNNCell(embedding_dim, hidden_size)
        # Project the hidden-width cell output to one logit per vocabulary
        # entry. The training loss and the sampling code both treat the last
        # axis of the model output as scores over word indices, so it has to
        # be vocabulary-sized rather than hidden_size-sized.
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, input, hidden):
        """Run the model over a batch of index sequences.

        Args:
            input: Long tensor of word indices, (batch_size, seq_length).
            hidden: Initial hidden state, (batch_size, 1, hidden_size).

        Returns:
            ``(logits, hidden)`` where ``logits`` is
            (batch_size, seq_length, input_size) and ``hidden`` is the state
            after the last step, (batch_size, 1, hidden_size).
        """
        embedded = self.embedding(input)  # (batch_size, seq_length, embedding_dim)
        outputs = []
        for step in range(embedded.size(1)):  # one iteration per time step
            # Slicing with step:step + 1 keeps the length-1 time axis.
            output, hidden = self.rnn_cell(embedded[:, step:step + 1], hidden)
            outputs.append(output)  # (batch_size, 1, hidden_size)
        stacked = torch.cat(outputs, dim=1)  # (batch_size, seq_length, hidden_size)
        return self.fc(stacked), hidden

    def initHidden(self, batch_size, device):
        """Return an all-zero initial hidden state, (batch_size, 1, hidden_size)."""
        return torch.zeros(batch_size, 1, self.hidden_size, device=device)

## Training

In [None]:
# Hyperparameters
seq_length = 5 # input sequence length (number of words)
batch_size = 1024 # batch size
embedding_dim = 128 # word-embedding width
hidden_size = 256 # hidden-state width
learning_rate = 0.001
epochs = 50

# Build the model, loss and optimizer.
# RNN's first argument sizes its embedding table, so it is the vocabulary size.
model = RNN(len(word_to_ix), hidden_size, embedding_dim)
# Targets are word indices, so this is a multi-class classification loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Build the sliding-window dataset over the encoded text.
dataset = TextDataset(encoded_text, seq_length)

# 80/20 random split into training and test subsets.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Build the loaders; only the training data is reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def train_and_test(model, train_loader, test_loader, criterion, optimizer, epochs, device):
    """Train ``model`` and evaluate it on the test set after every epoch.

    Args:
        model: Module whose call returns ``(outputs, hidden)`` with ``outputs``
            of shape (batch, seq, num_classes), and which provides
            ``initHidden(batch_size, device)``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` evaluation batches.
        criterion: Loss taking (N, num_classes) scores and (N,) targets.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        device: Device the model and every batch are moved to.

    Prints the average train and test loss once per epoch; returns nothing.
    """
    # Move the model to the target device.
    model.to(device)

    # Repeat for the requested number of epochs.
    for epoch in range(epochs):
        # Training
        # Put the model in training mode.
        model.train()
        # Reset the running loss.
        train_loss = 0.0
        # One optimization step per training batch.
        for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]", leave=False):
            # Move the batch to the target device.
            inputs, targets = inputs.to(device), targets.to(device)

            # Fresh hidden state for the batch; clear old gradients.
            hidden = model.initHidden(inputs.size(0), device)
            model.zero_grad()

            # Forward pass.
            outputs, hidden = model(inputs, hidden)

            # Flatten (batch, seq) into one axis for the loss, then backpropagate.
            loss = criterion(outputs.view(-1, outputs.size(2)), targets.view(-1))
            loss.backward()
            optimizer.step()

            # Accumulate as a Python float so no autograd graph is kept alive.
            train_loss += loss.item()

        # Average train loss per batch; the max() guards against an empty loader.
        avg_train_loss = train_loss / max(len(train_loader), 1)

        # Testing
        # Put the model in evaluation mode.
        model.eval()
        # Reset the running loss.
        test_loss = 0.0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Evaluate on every test batch.
            for inputs, targets in tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} [Testing]", leave=False):
                # Move the batch to the target device.
                inputs, targets = inputs.to(device), targets.to(device)

                # Fresh hidden state for the batch.
                hidden = model.initHidden(inputs.size(0), device)

                # Forward pass.
                outputs, hidden = model(inputs, hidden)

                # Compute and accumulate the loss.
                loss = criterion(outputs.view(-1, outputs.size(2)), targets.view(-1))
                test_loss += loss.item()

        # Average test loss per batch; the max() guards against an empty loader.
        avg_test_loss = test_loss / max(len(test_loader), 1)

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

# Train and evaluate the model for `epochs` epochs on the selected device.
train_and_test(model, train_loader, test_loader, criterion, optimizer, epochs, device)

## Prediction

In [None]:
def predict(model, input_text, word_to_ix, ix_to_word, device, predict_len=100):
    """Extend ``input_text`` by sampling words from the model one at a time.

    Args:
        model: Module whose call returns ``(outputs, hidden)`` with ``outputs``
            of shape (batch, seq, vocab_size), and which provides
            ``initHidden(batch_size, device)``.
        input_text: Prompt text. Words missing from ``word_to_ix`` are skipped.
        word_to_ix: Mapping from word to vocabulary index.
        ix_to_word: Mapping from vocabulary index to word.
        device: Device on which the input tensors are created.
        predict_len: Number of words to generate.

    Returns:
        ``input_text`` followed by the generated words, separated by spaces.

    Raises:
        ValueError: If no word of ``input_text`` is in the vocabulary, or if
            the model's output width does not match ``len(word_to_ix)``.
    """
    model.eval()
    if predict_len <= 0:
        return input_text

    # Tokenize the prompt like the training text (lower-cased runs of word
    # characters), so capitalised or punctuated words are not dropped.
    prompt_ids = [
        word_to_ix[word]
        for word in re.findall(r'\b\w+\b', input_text.lower())
        if word in word_to_ix
    ]
    if not prompt_ids:
        raise ValueError("input_text contains no words from the vocabulary")

    step_input = torch.tensor([prompt_ids], dtype=torch.long, device=device)
    hidden = model.initHidden(1, device)
    generated = []

    with torch.no_grad():
        for _ in range(predict_len):
            output, hidden = model(step_input, hidden)
            logits = output[:, -1, :].reshape(-1)
            # Softmax in float64 on the CPU, then renormalise: float32
            # rounding can make the probabilities miss numpy's sum-to-one
            # check, which used to trigger a silent uniform-random fallback.
            probs = torch.softmax(logits.cpu().double(), dim=0).numpy()
            probs /= probs.sum()
            # A width mismatch between model and vocabulary raises here
            # instead of being hidden.
            next_ix = int(np.random.choice(len(word_to_ix), p=probs))
            generated.append(ix_to_word[next_ix])
            # `hidden` already summarises everything seen so far, so only the
            # new word is fed next; re-feeding the whole sequence would
            # process the context again on top of the carried state.
            step_input = torch.tensor([[next_ix]], dtype=torch.long, device=device)

    return ' '.join([input_text, *generated])

# Generate a continuation of the prompt (predict's default length applies)
# and print the resulting text.
input_text = "to be or not to be that is the question"
predicted_output = predict(model, input_text, word_to_ix, ix_to_word, device)
print(predicted_output)