In [1]:
import os, sys, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torchtext
import torch.nn as nn
import torchtext.data as ttd
from torchtext.vocab import GloVe

from collections import OrderedDict

from datetime import datetime

In [117]:
# Fetch the NLTK resources used by this notebook.
# The previous `nltk.load('english', format='text')` call discarded its return
# value and does not install the stop-word corpus; `stopwords.words("english")`
# needs the 'stopwords' package to be downloaded instead.
nltk.download('stopwords');
# tokenizer models required by `word_tokenize`
nltk.download('punkt');

[nltk_data] Downloading package punkt to /Users/baraa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# ---- notebook-wide settings ----

# markers placed at the start / end of every sequence
SOS = "<sos>"
EOS = "<eos>"

# vocabulary and representation sizes
MAX_VOCAB_SIZE = 3000       # upper bound on the number of embedding rows
EMBEDDING_DIM = 50          # also selects which GloVe file is read
MAX_SEQUENCE_LENGTH = 100
LATENT_DIM = 25

# training settings (presumably consumed by later cells - not used in the cells shown here)
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 2000

In [3]:
# data path: location of the poem text file, relative to the directory the
# notebook is run from; it is opened line by line when the corpus is loaded
robert_frost = '../../Lazyprogrammer/seq2seq/robert_frost.txt'

In [4]:
# load in the data: keep every non-empty line, lower-cased and with trailing
# whitespace removed
lines = []
# the context manager guarantees the file handle is closed once reading is done
with open(robert_frost) as poem_file:
    for line in poem_file:
        line = line.lower().rstrip()
        if not line:
            # skip blank lines (stanza separators etc.)
            continue
        lines.append(line)

print("number of lines :", len(lines))

number of lines : 1436


In [5]:
# peek at one randomly chosen line of the corpus
idx = np.random.randint(0, len(lines))
print(lines[idx])

and set off briskly for so slow a thing,


In [6]:
# build vocab: unique tokens in order of first appearance, followed by the
# two special markers
l = []  # NOTE(review): not used in the cells shown here - kept in case a later cell reads it
#stop_words = stopwords.words("english")

with open(robert_frost) as poem_file:  # closes the handle when done
    # raw string: "\W" in a normal string literal is an invalid escape sequence
    token_stream = (
        w
        for raw_line in poem_file
        for w in word_tokenize(re.sub(r"\W+", " ", raw_line.lower()))
    )
    # dict keys keep insertion order, so this de-duplicates while preserving
    # first-seen order without the quadratic `w not in voc` list scan
    voc = list(dict.fromkeys(token_stream))

voc = voc + [EOS, SOS]

In [7]:
# map every vocabulary entry to a positive integer id (key=word, value=index);
# ids are handed out over the reversed vocabulary, starting at 1, so id 0 is
# never assigned to a word
word2idx = {
    token: position
    for position, token in enumerate(reversed(voc), start=1)
}

word2idx_keys = word2idx.keys()
print("Length of vocab : {0:d} tokens".format(len(word2idx)))

Length of vocab : 2120 tokens


In [8]:
# tokenizing inputs and targets
# For each line the input is <sos> + word ids and the target is word ids + <eos>,
# i.e. the target is the input shifted one step to the left.
input_sequences = []
target_sequences = []
for line in lines:
    # raw string: "\W" in a normal string literal is an invalid escape sequence
    tokens = word_tokenize(re.sub(r"\W+", " ", line.lower()))
    # look the ids up once and reuse them for both sequences
    token_ids = [word2idx[w] for w in tokens]
    input_sequence = [word2idx[SOS]] + token_ids
    target_sequence = token_ids + [word2idx[EOS]]
    input_sequences.append(input_sequence)
    target_sequences.append(target_sequence)
    


In [9]:
# length of the longest input sequence; used as the common padded length
max_sequence_length = max(map(len, input_sequences))
print("max_sequence_length:", max_sequence_length)

max_sequence_length: 15


In [10]:
# post padding: append zeros to the end of every short sequence so that all
# inputs and targets end up with max_sequence_length entries
k = 0
for padded_input, padded_target in zip(input_sequences, target_sequences):
    if len(padded_input) >= max_sequence_length:
        # already at full length, nothing to append
        continue
    # extend in place so the lists stored in the outer containers are updated
    padded_input.extend([0] * (max_sequence_length - len(padded_input)))
    padded_target.extend([0] * (max_sequence_length - len(padded_target)))

In [12]:
# shape of data: (number of sequences, length of the first sequence)
(len(input_sequences), len(input_sequences[0]))

(1436, 15)

In [13]:
# load in pre-trained word vectors from a local GloVe text file
# (torchtext alternative: word2vec = torchtext.vocab.GloVe(name="6B", dim=EMBEDDING_DIM))
print("loading word vectors ...")
word2vec_path = '../../Lazyprogrammer/large_files/glove.6B/glove.6B.%sd.txt'
word2vec = {}
# The GloVe text files are UTF-8 and contain non-ASCII tokens, so the encoding
# is given explicitly instead of relying on the platform default.
with open(word2vec_path % EMBEDDING_DIM, encoding="utf-8") as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2]
    for line in f:
        values = line.split()
        word = values[0]
        # remaining fields are the vector components, stored as float32
        vec = np.array(values[1:], dtype="float32")
        word2vec[word] = vec
print("Found %s word vectors." % len(word2vec))

loading word vectors ...
Found 400000 word vectors.


In [14]:
# prepare embedding matrix: row i holds the pre-trained vector of the word
# whose id is i; rows without a pre-trained vector stay all zeros
print("Filling pre-trained embeddings...")
num_words = min(len(word2idx) + 1, MAX_VOCAB_SIZE)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i >= num_words:
        # id falls outside the matrix, skip it
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        # word not present in the pre-trained vectors: leave the zero row
        continue
    embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [15]:
# one-hot the targets (can't use sparse categorical cross entropy)
# resulting array is indexed as [sequence, time step, word id]
one_hot_targets = np.zeros((len(input_sequences), max_sequence_length, num_words))
for i, target_sequence in enumerate(target_sequences):
    for t, word in enumerate(target_sequence):
        if word <= 0:
            # id 0 is padding: leave that time step all zeros
            continue
        one_hot_targets[i, t, word] = 1

In [27]:
# load pre-trained word embeddings into an embedding layer and freeze it
# `from_pretrained` takes the (num_words, EMBEDDING_DIM) shape from the matrix;
# freeze=True sets requires_grad=False on the weight itself. The previous
# `embedding_layer.requires_grad = False` only created an attribute on the
# module and left the weight trainable.
embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(embedding_matrix).float(),
    freeze=True,
)
