In [4]:
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Data taken from https://www.kaggle.com/datasets/notlucasp/financial-news-headlines?select=guardian_headlines.csv

In [6]:
import csv

sentences = []
with open('./guardian_headlines.csv', 'r') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
      words = row[1].split()
      sentence = [word.lower() for word in words]
      sentences.append(sentence)

sentences.pop(0) # removing heading of list

['headlines']

In [7]:
print("Number of sentences:", len(sentences))
vocab = set([word for sentence in sentences for word in sentence])
print("Size of vocabulary:", len(vocab))

Number of sentences: 17800
Size of vocabulary: 23491


In [8]:
sentence_lengths = [len(sentence) for sentence in sentences]
print("Min length", np.min(sentence_lengths))
print("Average length", np.mean(sentence_lengths))
print("Max length", np.max(sentence_lengths))

Min length 3
Average length 11.021348314606742
Max length 30


In [9]:
from random import shuffle

shuffle(sentences)

train_data = sentences[:int(0.8 * len(sentences))]
val_data = sentences[int(0.8 * len(sentences)):int(0.9 * len(sentences))]
test_data = sentences[int(0.9 * len(sentences)):]

print(len(train_data), len(val_data), len(test_data))

14240 1780 1780


In [10]:
from collections import Counter

word_count = Counter([w for sentence in sentences for w in sentence])
print(word_count.most_common(1))

[('to', 6220)]


In [11]:
# create vocab dictionary
vocab = set([w for sentence in sentences for w in sentence])
word_index = {w:i for i, w in enumerate(vocab)}

In [14]:
# convert words to indices

def words_to_indices(sentences):

  return [[word_index[word] for word in sentence] for sentence in sentences]

def generate_4grams(indices):

  four_grams = []
  for sentence in indices:
    for i in range(len(sentence) - 4):
      four_grams.append(sentence[i:i+4])

  return four_grams

def compile_data(sentences):
  indices = words_to_indices(sentences)
  four_grams = generate_4grams(indices)
  return np.array(four_grams)

grams = compile_data(train_data)
print(grams.shape)

(99950, 4)


In [15]:
# one hot encoding the input

def onehot_encoder(four_grams, vocab_size):
  I = np.eye(vocab_size)
  return I[four_grams]

print(onehot_encoder(grams, len(vocab)).shape)

(99950, 4, 23491)


In [24]:
def get_batch(data, range_min, range_max, onehot=True):

  X = data[range_min:range_max, :3]
  t = data[range_min:range_max, 3]

  X = onehot_encoder(X, len(vocab))

  X = X.reshape(-1, 3 * len(vocab))

  if onehot:
    t = onehot_encoder(t, len(vocab))
    t = t.reshape(-1, len(vocab))

  return X, t

# test the data out

grams = compile_data(train_data)
X, t = get_batch(grams, 0, 10, onehot=False)
print(X.shape, t.shape)

# check if indices are at the right position
for i in range(3):
  index_pos = grams[0][i]
  print(X[0][index_pos + ((len(vocab) * i) - 1)]) # should be 0
  print(X[0][index_pos + (len(vocab) * i)]) # should 1
  print(X[0][index_pos + ((len(vocab) * i) + 1)]) # should be 0

(10, 70473) (10,)
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
