In [1]:
import numpy as np
import torch

In [2]:
import os

In [3]:
import zipfile

zip_path = "forms.zip"
corpus = []

print("Loading poems in corpus...\n")

with zipfile.ZipFile(zip_path, "r") as z:
    for filename in z.namelist():
        if filename.endswith(".txt"):  # Ensure it's a text file
            print(f"Processing: {filename}")  # Print filename before reading
            with z.open(filename) as file:
                corpus.extend(file.read().decode("utf-8").split("\n"))


Loading poems in corpus...

Processing: abc/AbcPoems2AbcHkAndChinaV2Cauchy3Poembycheungshunsang.txt
Processing: abc/AbcPoems887LiveWithLoveAnAbcPoemPoembyMelvinaGermain.txt
Processing: abc/AbcPoemsAAbcAnglesOnAngelsPoemByCauchy3Poembycheungshunsang.txt
Processing: abc/AbcPoemsAAbcBrazilDancePoemByCauchy3Poembycheungshunsang.txt
Processing: abc/AbcPoemsAbc123PoembyGabriellaFranco.txt
Processing: abc/AbcPoemsAbc2C81107PoembyCoreyThreet.txt
Processing: abc/AbcPoemsAbc3SonnetsPoembyPeterSQuinn.txt
Processing: abc/AbcPoemsAbcBogenPoembyHansChristianAndersen.txt
Processing: abc/AbcPoemsAbcC51507PoembyCoreyThreet.txt
Processing: abc/AbcPoemsAbcJustForFunPoembyDawnSlanker.txt
Processing: abc/AbcPoemsAbcNameSongPoembyjayleebranch.txt
Processing: abc/AbcPoemsAbcOfAliensAndAllPoembyRameshTA.txt
Processing: abc/AbcPoemsAbcOfFaithPoembyChampsUlyssesCabinatan.txt
Processing: abc/AbcPoemsAbcOfMyChildhoodPoembyMehreenTahir.txt
Processing: abc/AbcPoemsAbcOfPoliticalGamePoembyRameshTA.txt
Processing: ab

In [4]:
len(corpus)

169386

In [5]:
with open("Poems.txt", "w") as file:
    for line in corpus:
        file.write(line + "\n")

In [6]:
corpus[:20]

['2 ABC of H.k. and China revised vision.\r',
 'Barrels tears are wines and salts.\r',
 'With a whisk on goody tails!\r',
 'Wiggle maces to fix the heads.\r',
 'Heads in jack on boxes are ceased.\r',
 'Cry to paranoid truly bosses.\r',
 'Bosses are jokers take your boys.\r',
 'Studs are bogs with fire apples.\r',
 'True predicates worth cases.’\r',
 'Descents wash in badly bands.\r',
 'Wholly sales are smart with cats.\r',
 'Who got tenth honors in China?\r',
 'Homage grand to play and plays!\r',
 'Trim the times of hearts then cry.\r',
 'Tanks in steels but voice wail.\r',
 'Bossy dragged by tails that whisked.\r',
 'Go very timid and love the wise.\r',
 'Hands are lent but laws are ends.\r',
 'Cases on courts are borrowed lands.\r',
 'Length long with treads to retch!\r']

In [7]:
import string

def remove_punc(s):
    return s.translate(str.maketrans('', '', string.punctuation))

corpus = [ remove_punc(s.lower().strip()) for s in corpus ]


In [8]:
corpus[:20]

['2 abc of hk and china revised vision',
 'barrels tears are wines and salts',
 'with a whisk on goody tails',
 'wiggle maces to fix the heads',
 'heads in jack on boxes are ceased',
 'cry to paranoid truly bosses',
 'bosses are jokers take your boys',
 'studs are bogs with fire apples',
 'true predicates worth cases’',
 'descents wash in badly bands',
 'wholly sales are smart with cats',
 'who got tenth honors in china',
 'homage grand to play and plays',
 'trim the times of hearts then cry',
 'tanks in steels but voice wail',
 'bossy dragged by tails that whisked',
 'go very timid and love the wise',
 'hands are lent but laws are ends',
 'cases on courts are borrowed lands',
 'length long with treads to retch']

In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [11]:
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 77610


In [12]:
n_grams = []
max_sequence_len = 0

for sentence in corpus:
    # convert sentence to tokens
    tokens = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(2, len(tokens)+1):
        # extract n-gram
        n_gram = tokens[:i]
        # save n-gram
        n_grams.append(n_gram)
        # calculate maximum sequence length
        if len(n_gram) > max_sequence_len:
            max_sequence_len = len(n_gram)
        
print(f"Number of n-grams: {len(n_grams)}")
print(f"Maximum n-gram length: {max_sequence_len}")

Number of n-grams: 1028642
Maximum n-gram length: 10418


In [13]:
for n_gram in n_grams[:20]:
    print(n_gram)

[682, 3138]
[682, 3138, 3]
[682, 3138, 3, 35513]
[682, 3138, 3, 35513, 2]
[682, 3138, 3, 35513, 2, 2637]
[682, 3138, 3, 35513, 2, 2637, 1449]
[682, 3138, 3, 35513, 2, 2637, 1449, 1279]
[11264, 223]
[11264, 223, 24]
[11264, 223, 24, 11265]
[11264, 223, 24, 11265, 2]
[11264, 223, 24, 11265, 2, 19540]
[8, 5]
[8, 5, 35514]
[8, 5, 35514, 16]
[8, 5, 35514, 16, 16388]
[8, 5, 35514, 16, 16388, 5767]
[24599, 24600]
[24599, 24600, 4]
[24599, 24600, 4, 2525]


In [14]:
padded_n_grams = np.array(pad_sequences(n_grams, maxlen=200, padding="post", truncating="pre"))

padded_n_grams.shape


(1028642, 200)

In [15]:
for seq in padded_n_grams[:3]:
    print(seq)

[ 682 3138    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [16]:
X = padded_n_grams[:, :-1]
y = padded_n_grams[:, -1]

print(f"X: {X.shape}")
print(f"y: {y.shape}")

X: (1028642, 199)
y: (1028642,)


In [17]:
Y_tensor = torch.tensor(y)

In [18]:
from torch.utils.data import TensorDataset, DataLoader
def create_lm_sequences(tokenized_padded_text, seq_length):
    # Convert to numpy array if not already
    tokens = np.array(tokenized_padded_text)
    
    # Create X by taking all tokens except last
    x = tokens[:-1]
    
    # Create Y by taking all tokens except first
    y = tokens[1:]
    
    return x, y


x1, y1 = create_lm_sequences(padded_n_grams, seq_length=6)

In [19]:
X1=torch.tensor(x1)
Y1=torch.tensor(y1)
X1=X1.long()
Y1=Y1.long()

In [20]:
dataset = TensorDataset(X1, Y1)

In [21]:
batch_size = 32  # Adjust based on GPU memory and model size
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)