In [57]:
import re
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader 
import torch.nn as nn
from torch.optim import Adam
import numpy as np
import json
import matplotlib.pyplot as plt

In [58]:
def chunk_sentence(sentence, max_words=15):
    words = sentence.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = words[i:i+max_words]
        chunks.append(' '.join(chunk))
    return chunks

In [59]:
bk1 = open('book1.txt', 'r', encoding='utf-8').read().replace('\n', ' ').split('.')
bk1[1:3]

['  Everything was in confusion in the Oblonskys’ house',
 ' The wife had discovered that the husband was carrying on an intrigue with a French girl, who had been a governess in their family, and she had announced to her husband that she could not go on living in the same house with him']

In [60]:
bk2 = open('book2.txt', 'r', encoding='utf-8').read().replace('\n', ' ').split('.')
bk2[1:3]

[' It stands in the centre of a small bay, and upon a gentle acclivity, which, on one side, slopes towards the sea, and on the other rises into an eminence crowned by dark woods',
 ' The situation is admirably beautiful and picturesque, and the ruins have an air of ancient grandeur, which, contrasted with the present solitude of the scene, impresses the traveller with awe and curiosity']

In [61]:
raw_sentences = bk1 + bk2
sentences = []
max_words = 20
for sentence in raw_sentences:
    chunks = chunk_sentence(sentence, max_words=max_words)
    for c in chunks:
        if len(c) > 2:
            sentences.append(c)

In [62]:
sentences[:2]

['PART ONE Chapter 1 Happy families are all alike; every unhappy family is unhappy in its own way',
 'Everything was in confusion in the Oblonskys’ house']

In [63]:
len(sentences)

13364

In [64]:
def clean_text(text):
    text = text.lower()
    # Add spaces around punctuation
    text = re.sub(r'([^\w\s])', r' \1 ', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [65]:
sentences = [clean_text(txt) for txt in sentences]
len(sentences)

13364

In [66]:
sentences[1:3]

['everything was in confusion in the oblonskys house',
 'the wife had discovered that the husband was carrying on an intrigue with a french girl who had been a']

In [67]:
all_word_list = [txt for txt in ("".join(sentences)).split(' ')]
all_word_list[:3]

['part', 'one', 'chapter']

In [68]:
w_to_i = {word:i+2 for i, word in enumerate(set(all_word_list))}
w_to_i['<UNK>'] = 1
i_to_w = {v:k for k, v in w_to_i.items()}
with open('vocab.json', 'w') as fp:
    json.dump(w_to_i, fp)

In [69]:
vocab_size = len(set(w_to_i.values())) +1 #+1 for padding "0"
vocab_size

20668

In [70]:
def sentence_to_int(sentence):
    sentence_to_int = []
    for word in sentence.split(" "):
        int_word = w_to_i.get(word, 0)
        sentence_to_int.append(int_word)
    return sentence_to_int

In [15]:
sentence_to_int('everything was in confusion in the oblonskys house')

[6969, 15543, 21121, 5419, 21121, 483, 15751, 14443]

In [16]:
int_sentences = [sentence_to_int(sentence) for sentence in sentences]
int_sentences[1]

[6969, 15543, 21121, 5419, 21121, 483, 15751, 14443]

In [17]:
len(int_sentences)

13364

In [18]:
max(len(x) for x in int_sentences)

20

In [19]:
input_sq = []
output_sq = []
for sentence in int_sentences:
    for i in range(1, len(sentence)):
        temp = sentence[:i]
        pad = (0, max_words - len(temp))
        if temp != None:
            try:
                padded = F.pad(torch.tensor(temp), pad, value= 0)
            except Exception as e:
                print(e)
                print(sentence)
                print(temp)
                break
        
            input_sq.append(padded)
            output_sq.append(sentence[i])

In [27]:
len(input_sq)

169326

In [28]:
len(output_sq)

169326

In [6]:
class mydata(Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y 
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [None]:
torch.manual_seed(10)
data = mydata(input_sq, torch.tensor(output_sq))
batched_data= DataLoader(data, 128, shuffle=True)

In [None]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim=100, padding_idx = 0)
        self.do = nn.Dropout(.3)
        self.lstm = nn.LSTM(input_size = 100, hidden_size=150, num_layers=2)
        self.layers = nn.Sequential(
            nn.Linear(in_features=150, out_features=200),
            nn.Dropout(.2),
            nn.ReLU(),
            nn.Linear(in_features=200, out_features=vocab_size)
        )
    def forward(self, x):
        x = self.emb(x)
        y = self.do(x)
        h, c = self.lstm(x)
        y = h[:, -1, :]
        y = self.layers(y)

        return y

In [None]:
def train(model, epochs, optim, loss_fn):
    model.train()
    best_loss = float('inf')
    epochs_loss = []
    for epoch in range(epochs):
        loss_ = []
        for batch in batched_data:
            x, y = batch
            x, y  = (x).to(device), (y).to(device) 
            y_pred = model(x)

            loss = loss_fn(y_pred, y)
            loss.backward()
            optim.step()
            optim.zero_grad()

            loss_.append(loss.item())
            
        epoch_loss = np.mean(loss_)
        epochs_loss.append(epoch_loss)
        print(f"Epochs: {epoch} | Loss: {epoch_loss:.4f}")

        if best_loss > epoch_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), "best_model.pt")
            print(f"model saved with loss: {best_loss:.4f}")
            
    return epochs_loss

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = (Model()).to(device)
optim = Adam(model.parameters(), lr = 0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=0) #ignoring padding
loss = train(model, 50, optim, loss_fn) 

In [None]:
plt.plot(loss, [i for i in range(len(loss))])
plt.title("loss Vs epoch")
plt.xlabel("Loss")
plt.ylabel("Epoch")
plt.show()