In [None]:
# Google Colab line magic that requests the TensorFlow 2.x runtime for this session.
%tensorflow_version 2.x

In [None]:
import tarfile, os, string, re, nltk, joblib, shutil

import pandas as pd
import numpy as np
import tensorflow as tf

from google.colab import drive
from nltk.tokenize import sent_tokenize, TweetTokenizer
from collections import Counter

from tensorflow.keras.utils import get_file

In [None]:
def extract_gzip(file_path, output_path):
  """Unpack the gzip-compressed tar archive `file_path` into the directory `output_path`.

  NOTE(review): `extractall` writes whatever member paths the archive declares,
  so only archives from a trusted source should be passed in.
  """
  archive = tarfile.open(file_path, "r:gz")

  with archive:
    archive.extractall(output_path)

def replace_whitespace_with_space(text):
  """Return `text` with every whitespace character replaced by a single space.

  Each whitespace character (tab, newline, ...) is replaced one-for-one, so
  runs of whitespace keep their length; use `collapse_continuous_space` to
  squeeze them afterwards.
  """
  # Raw string: '\s' in a normal string literal is an invalid escape sequence.
  return re.sub(r'\s', ' ', text)

def remove_non_printable(text):
  """Return `text` without any character that is missing from `string.printable`."""
  # Negated character class built from the (escaped) printable ASCII characters.
  not_printable_pattern = '[^%s]' % re.escape(string.printable)

  return re.sub(not_printable_pattern, '', text)

def collapse_continuous_space(text):
  """Return `text` with every run of whitespace characters collapsed into one space."""
  # Raw string: '\s+' in a normal string literal is an invalid escape sequence.
  return re.sub(r'\s+', ' ', text)

def preprocess_text(text):
  """Normalise `text` and return its tokens joined by single spaces.

  Steps, in order: whitespace characters become spaces, characters outside
  `string.printable` are dropped, the text is stripped and lower-cased, runs
  of spaces are collapsed, and the result is split into sentences
  (`sent_tokenize`) whose words come from a `TweetTokenizer`.
  """
  normalized = remove_non_printable(replace_whitespace_with_space(text))
  normalized = collapse_continuous_space(normalized.strip().lower())

  tweet_tokenizer = TweetTokenizer()

  words = []
  for sentence in sent_tokenize(normalized):
    words.extend(tweet_tokenizer.tokenize(sentence))

  return collapse_continuous_space(' '.join(words))

def replace_tokens_from_text(text, tokens_to_replace, informative_tokens):
  """Replace each token of `tokens_to_replace` in `text` by `informative_tokens[token]`.

  Longer tokens are substituted first, so a token that is a substring of
  another one cannot break the longer match. Raises KeyError when a token has
  no entry in `informative_tokens`.
  """
  longest_first = sorted(tokens_to_replace, key=len, reverse=True)

  result = text
  for original in longest_first:
    result = result.replace(original, informative_tokens[original])

  return result

def remove_tokens_from_text(text, tokens_to_remove):
  """Delete every occurrence of each token of `tokens_to_remove` from `text`.

  Longer tokens are removed first, so a token that is a substring of another
  one cannot leave a partial remainder behind.
  """
  longest_first = sorted(tokens_to_remove, key=len, reverse=True)

  result = text
  for unwanted in longest_first:
    result = result.replace(unwanted, '')

  return result

def extract_unique_emails(text):
  """Return the set of distinct e-mail addresses found in `text`.

  Matching is case-insensitive; the matched substrings are returned exactly
  as they appear in `text`.
  """
  email_pattern = re.compile(
    r"(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",
    flags=re.IGNORECASE
  )

  return set(email_pattern.findall(text))

def extract_unique_urls(text):
  """Return the set of distinct URLs found in `text`.

  Matching is case-insensitive. The pattern has a single capturing group that
  wraps the whole URL, so `findall` yields the complete matched strings.
  """
  url_pattern = re.compile(
    r"\b((?:[a-z][\w-]+://|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|uk)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|uk)\b/?(?!@)))",
    flags=re.IGNORECASE
  )

  return set(url_pattern.findall(text))

def get_dictionary_of_extracted_informative_tokens(informative_tokens_structure):
  """Map placeholder keys such as `<TOKEN_URL_0>` to the extracted token values.

  `informative_tokens_structure` maps a category name (e.g. 'URL') to a
  collection of string tokens. Every token receives the key
  `<TOKEN_{category}_{i}>`, where `i` is its rank in the sorted token list of
  that category.

  Returns a dict ordered by decreasing token length (ties keep the order in
  which the placeholders were created).
  """
  result = {}
  for key in informative_tokens_structure:
    # Sets iterate in hash order, which differs between interpreter sessions;
    # sorting makes the placeholder indices reproducible.
    ordered_tokens = sorted(informative_tokens_structure[key])
    result.update({ f'<TOKEN_{key}_{i}>' : token for i, token in enumerate(ordered_tokens) })
  return dict(sorted(result.items(), key = lambda item: len(item[1]), reverse=True))

def update_data_structures_from_files (files_path, word_counter, informative_tokens_structure, encoding):
  """Scan every file in `files_path` and update the two accumulators in place.

  For each file: URLs are collected into `informative_tokens_structure['URL']`
  and removed from the text, e-mail addresses found in the remainder are
  collected into `informative_tokens_structure['EMAIL']` and removed as well,
  and the tokens of the preprocessed remainder are counted in `word_counter`.
  Returns None.
  """
  for entry_name in os.listdir(files_path):
    raw_text = load_text_file(os.path.join(files_path, entry_name), encoding)

    # URLs are handled before e-mails, so e-mail extraction never sees them.
    found_urls = extract_unique_urls(raw_text)
    informative_tokens_structure['URL'].update(found_urls)
    text_without_urls = remove_tokens_from_text(raw_text, found_urls)

    found_emails = extract_unique_emails(text_without_urls)
    informative_tokens_structure['EMAIL'].update(found_emails)
    text_without_tokens = remove_tokens_from_text(text_without_urls, found_emails)

    word_counter.update(preprocess_text(text_without_tokens).split())

def build_word_vocabulary (word_counter):
  """Build a word -> index dictionary from an iterable of `(word, count)` pairs.

  Each word is assigned the size of the dictionary at the moment it is
  inserted, so distinct words get consecutive indices starting at 0 in
  iteration order.
  """
  vocabulary = {}

  for pair in word_counter:
    word, _count = pair
    vocabulary.update({ word : len(vocabulary) })

  return vocabulary

def build_reversed_dictionary (word2index):
  """Return a new dictionary mapping each value of `word2index` back to its key.

  When several keys share one value, the key that comes last in iteration
  order wins.
  """
  return { value : key for key, value in word2index.items() }

def add_special_tokens_to_word_vocabulary (word_vocabulary, special_tokens):
  """Append every token of `special_tokens` to `word_vocabulary` in place.

  Each token receives the current size of the vocabulary as its index.
  Returns None.
  """
  for special_token in special_tokens:
    next_index = len(word_vocabulary)
    word_vocabulary[special_token] = next_index

def encode_files (files, encoding, word_vocabulary, informative_tokens, special_tokens, maximum_length, padding_type, truncating_type, add_eot, add_sot):
  """Encode every file of `files` with `encode_file` and return the results as a list.

  All remaining arguments are forwarded unchanged to `encode_file`; the output
  keeps the order of `files`.
  """
  encoded = []

  for file_path in files:
    encoded.append(encode_file(file_path, encoding, word_vocabulary, informative_tokens, special_tokens, maximum_length, padding_type, truncating_type, add_eot, add_sot))

  return encoded

def encode_file (f, encoding, word_vocabulary, informative_tokens, special_tokens, maximum_length, padding_type, truncating_type, add_eot, add_sot):
  """Read the file `f` and return it as one padded sequence of vocabulary indices.

  URLs, and then e-mail addresses, are replaced by their placeholder keys
  (looked up in the reversed `informative_tokens`), the text is preprocessed,
  mapped to indices (unknown tokens use `special_tokens['oov']`) and finally
  truncated/padded by `pad_sequence`.
  """
  value_to_placeholder = build_reversed_dictionary(informative_tokens)

  text = load_text_file(f, encoding)

  # URLs are substituted first; e-mails are then extracted from the already-substituted text.
  for extract_tokens in (extract_unique_urls, extract_unique_emails):
    text = replace_tokens_from_text(text, extract_tokens(text), value_to_placeholder)

  token_ids = text_to_sequence(preprocess_text(text), word_vocabulary, special_tokens['oov'])

  return pad_sequence(token_ids, word_vocabulary, special_tokens, maximum_length, padding_type, truncating_type, add_eot, add_sot)

def get_file_path_pairs_of_documents_and_summaries (documents_path, summaries_path):
  """Pair every document file with the summary files that belong to it.

  A summary belongs to a document when its file name starts with the
  document's file name (extension removed) followed by '_'. A document with
  several summaries appears once per summary.

  Returns two equally long tuples `(documents_files, summaries_files)` of full
  paths, aligned index by index; both are empty when nothing matches.
  """
  # os.listdir returns entries in arbitrary order; sorting keeps the pairing
  # order reproducible between runs and machines.
  documents_files = sorted(os.listdir(documents_path))
  summaries_files = sorted(os.listdir(summaries_path))

  examples_files = []

  for document_file_name in documents_files:
    file_name_prefix = os.path.splitext(document_file_name)[0] + '_'
    for summary_file_name in summaries_files:
      if summary_file_name.startswith(file_name_prefix):
        examples_files.append((os.path.join(documents_path, document_file_name), os.path.join(summaries_path, summary_file_name)))

  # zip(*[]) produces nothing, so unpacking it into two names would raise ValueError.
  if not examples_files:
    return (), ()

  documents_files, summaries_files = zip(*examples_files)

  return documents_files, summaries_files

def get_file_paths (files_path):
  """Return the path of every entry in `files_path`, each joined with `files_path`.

  The entries come back in the order reported by `os.listdir`.
  """
  return [ os.path.join(files_path, entry_name) for entry_name in os.listdir(files_path) ]

def texts_to_sequences (texts, word_vocabulary, oov_token_key):
  """Convert every text of `texts` with `text_to_sequence` and return the list of index sequences."""
  sequences = []

  for text in texts:
    sequences.append(text_to_sequence(text, word_vocabulary, oov_token_key))

  return sequences

def text_to_sequence(text, word_vocabulary, oov_token_key):
  """Map the whitespace-separated tokens of `text` to their vocabulary indices.

  A token missing from `word_vocabulary` is encoded with the index stored
  under `oov_token_key`; that key is only looked up when such a token occurs.
  """
  return [
    word_vocabulary[token if token in word_vocabulary else oov_token_key]
    for token in text.split()
  ]

def load_text_file(file_path, encoding):
  """Return the whole content of the text file `file_path`, decoded with `encoding`."""
  with open(file_path, 'r', encoding=encoding) as text_file:
    contents = text_file.read()

  return contents

def pad_sequences (sequences, word_vocabulary, special_tokens, maximum_sequence_length, padding_type, truncating_type, add_eot=False, add_sot=False):
  """Apply `pad_sequence` to every sequence of `sequences` and return the results as a list.

  All remaining arguments are forwarded unchanged to `pad_sequence`.
  """
  padded = []

  for sequence in sequences:
    padded.append(pad_sequence(sequence, word_vocabulary, special_tokens, maximum_sequence_length, padding_type, truncating_type, add_eot, add_sot))

  return padded

def pad_sequence (sequence, word_vocabulary, special_tokens, maximum_sequence_length, padding_type, truncating_type, add_eot=False, add_sot=False):
  """Truncate and pad one index sequence to exactly `maximum_sequence_length` items.

  Parameters:
    sequence: sliceable sequence of vocabulary indices (list or tuple).
    word_vocabulary: token -> index mapping that contains the special tokens.
    special_tokens: mapping with the keys 'pad', 'sot' and 'eot' giving the
      special token strings.
    maximum_sequence_length: length of the returned list, special tokens included.
    padding_type: 'post' appends the padding, 'pre' prepends it.
    truncating_type: 'post' keeps the beginning of an over-long sequence,
      'pre' keeps its end.
    add_eot / add_sot: append / prepend the end-of-text / start-of-text index.

  Returns a new list; `sequence` itself is not modified.

  Raises ValueError for an unknown padding or truncating type, or when
  `maximum_sequence_length` cannot even hold the requested special tokens.
  """
  special_tokens_count = 0

  if add_eot: special_tokens_count += 1
  if add_sot: special_tokens_count += 1

  # Number of positions left for real tokens once SOT/EOT are accounted for.
  rest_tokens_count = maximum_sequence_length - special_tokens_count

  if rest_tokens_count < 0:
    raise ValueError(f'Maximum sequence length "{maximum_sequence_length}" cannot hold {special_tokens_count} special token(s)!')

  if truncating_type == 'pre':
    # An explicit start index is required: `sequence[-0:]` is the WHOLE
    # sequence, which would overflow the maximum length when no room is left.
    new_sequence = list(sequence[max(len(sequence) - rest_tokens_count, 0):])
  elif truncating_type == 'post':
    new_sequence = list(sequence[:rest_tokens_count])
  else:
    raise ValueError(f'Truncating type "{truncating_type}" not understood!')

  truncated_sequence_length = len(new_sequence)

  if add_sot: new_sequence = [word_vocabulary[special_tokens['sot']]] + new_sequence

  if add_eot: new_sequence = new_sequence + [word_vocabulary[special_tokens['eot']]]

  pad_tokens_count = rest_tokens_count - truncated_sequence_length

  # Padding goes outside the SOT/EOT markers: after EOT for 'post', before SOT for 'pre'.
  if padding_type == 'post':
    new_sequence = new_sequence + pad_tokens_count * [word_vocabulary[special_tokens['pad']]]
  elif padding_type == 'pre':
    new_sequence = pad_tokens_count * [word_vocabulary[special_tokens['pad']]] + new_sequence
  else:
    raise ValueError(f'Padding type "{padding_type}" not understood!')

  return new_sequence

def load_pretrained_embeddings(file_path, encoding):
    """Load word vectors from a text file whose first line is `<count> <dimension>`.

    Every following line holds a token and `dimension` numbers separated by
    whitespace. The vector is taken from the LAST `dimension` fields, so a
    token that itself contains spaces is kept whole (joined by single spaces).
    Blank lines and lines with too few fields to hold a token plus a full
    vector are skipped.

    Returns `(embeddings, n, d)`: a dict token -> float32 numpy array, and the
    count and dimension announced by the header line.
    """
    embeddings = {}

    with open(file_path, 'r', encoding=encoding) as f:
        n, d = map(int, f.readline().strip().split())

        for line in f:
            values = line.split()

            # Index where the vector starts; everything before it is the token.
            vector_start = len(values) - d

            # Blank or truncated line: there is no token and/or no complete vector.
            if vector_start < 1:
                continue

            word = ' '.join(values[:vector_start])
            embeddings[word] = np.asarray(values[vector_start:], dtype=np.float32)

    return embeddings, n, d

def create_embedding_matrix(embeddings, word_vocabulary, embedding_size):
    """Build the `(len(word_vocabulary), embedding_size)` float32 embedding matrix.

    Every row starts as a uniform random vector in [-1, 1) drawn from numpy's
    global random state; rows whose word has an entry in `embeddings` are then
    overwritten with that vector.

    Returns `(n_loaded, matrix)`, where `n_loaded` counts the overwritten rows.
    """
    matrix_shape = (len(word_vocabulary), embedding_size)

    embedding_matrix = np.random.uniform(-1, 1, size=matrix_shape)

    n_loaded = 0

    for word, row in word_vocabulary.items():
        vector = embeddings.get(word)
        if vector is None:
            continue

        embedding_matrix[row] = vector
        n_loaded += 1

    return n_loaded, embedding_matrix.astype(np.float32)

In [None]:
# SETTINGS
# Environment-level configuration: filesystem locations, text encoding, seeding and special tokens.

# Directory at which Google Drive gets mounted (relative to the current working directory).
MOUNT_DIRECTORY_PATH = os.path.join(os.getcwd(), 'drive')

# Drive folder used for every input (archives) and every output (pickles, CSV files) of the notebook.
IO_DIRECTORY_PATH = os.path.join(MOUNT_DIRECTORY_PATH, 'My Drive', 'FNS-2020')

# Gzip-compressed tar archive that holds the dataset.
DATASET_FILE_PATH = os.path.join(IO_DIRECTORY_PATH, 'datasets', 'large-dataset.tar.gz')

# Gzip-compressed tar archive that holds the pretrained word embeddings.
EMBEDDINGS_FILE_PATH = os.path.join(IO_DIRECTORY_PATH, 'embeddings', 'glove-6B-100d.tar.gz')

# Encoding used whenever a text file is read.
TEXT_ENCODING = 'UTF-8'

# Buffer size handed to `tf.data.Dataset.shuffle`.
SHUFFLE_BUFFER_SIZE = 10000

# Seed applied to both NumPy and TensorFlow.
RANDOM_SEED = 42

# Role -> token string; these strings are appended to the vocabulary after the regular words.
SPECIAL_TOKENS = {
  'oov' : '<TOKEN_OUT_OF_VOCABULARY>',
  'pad' : '<TOKEN_PAD>',
  'sot' : '<TOKEN_START_OF_TEXT>',
  'eot' : '<TOKEN_STOP_OF_TEXT>'
}

In [None]:
# HYPERPARAMETERS
# Number of most frequent words kept in the vocabulary (the special tokens are added on top).
VOCAB_NUMBER_OF_WORDS = 10000

# Number of examples per batch of the tf.data pipelines.
BATCH_SIZE = 64

# Fixed lengths (special tokens included) of the encoded documents and summaries.
DOCUMENTS_MAXIMUM_SEQUENCE_LENGTH = 3000
SUMMARIES_MAXIMUM_SEQUENCE_LENGTH = 1000

# 'post' pads / truncates at the end of a sequence, 'pre' at its beginning.
PADDING_TYPE = 'post'
TRUNCATING_TYPE = 'post'

In [None]:
# Seed NumPy's global random state (used for the random rows of the embedding matrix).
np.random.seed(RANDOM_SEED)

# Seed TensorFlow's global random generator.
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Download the NLTK 'punkt' package (presumably the model behind `sent_tokenize` — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at MOUNT_DIRECTORY_PATH; `force_remount=True` presumably remounts an already mounted drive — confirm.
drive.mount(MOUNT_DIRECTORY_PATH, force_remount=True)

Mounted at /content/drive


In [None]:
# Directory that receives the unpacked dataset archive.
extracted_dataset_directory_path = os.path.join(IO_DIRECTORY_PATH, 'dataset')

# Delete any existing extraction before unpacking the archive again.
if os.path.exists(extracted_dataset_directory_path): shutil.rmtree(extracted_dataset_directory_path)

extract_gzip(DATASET_FILE_PATH, extracted_dataset_directory_path)

In [None]:
# Directories of each dataset split inside the extracted archive.
# Only documents are referenced for the testing split (no summaries path is defined for it).
training_documents_path = os.path.join(extracted_dataset_directory_path, 'training', 'documents')
training_summaries_path = os.path.join(extracted_dataset_directory_path, 'training', 'summaries')
validation_documents_path = os.path.join(extracted_dataset_directory_path, 'validation', 'documents')
validation_summaries_path = os.path.join(extracted_dataset_directory_path, 'validation', 'summaries')
testing_documents_path = os.path.join(extracted_dataset_directory_path, 'testing', 'documents')

In [None]:
# Accumulators that are filled in place while every split of the corpus is scanned.
informative_tokens_structure = { 'URL' : set(), 'EMAIL' : set() }
word_counter = Counter()

# NOTE(review): word counts (and therefore the vocabulary) also include the
# validation and testing texts — confirm that this is intended.
for corpus_directory_path in (
  training_documents_path,
  training_summaries_path,
  validation_documents_path,
  validation_summaries_path,
  testing_documents_path,
):
  update_data_structures_from_files(corpus_directory_path, word_counter, informative_tokens_structure, TEXT_ENCODING)

# Persist both accumulators so later cells can reload them without rescanning the corpus.
joblib.dump(word_counter, os.path.join(IO_DIRECTORY_PATH, 'word_counter.pkl'))
joblib.dump(informative_tokens_structure, os.path.join(IO_DIRECTORY_PATH, 'informative_tokens_structure.pkl'))

['/content/drive/My Drive/FNS-2020/informative_tokens_structure.pkl']

In [None]:
def _save_csv (rows, columns, file_name):
  """Write `rows` as a CSV file (without index column) named `file_name` inside IO_DIRECTORY_PATH."""
  pd.DataFrame(rows, columns=columns).to_csv(os.path.join(IO_DIRECTORY_PATH, file_name), index=False)

word_counter = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'word_counter.pkl'))

# Every counted word, most frequent first.
sorted_word_counter = word_counter.most_common()
_save_csv(sorted_word_counter, ['word', 'count'], 'words_counted.csv')

# Vocabulary over all words (exported for inspection).
full_word_vocabulary = build_word_vocabulary (sorted_word_counter)
_save_csv(full_word_vocabulary.items(), ['word', 'index'], 'full_word_vocabulary.csv')

# Working vocabulary: the VOCAB_NUMBER_OF_WORDS most frequent words followed by the special tokens.
word2index = build_word_vocabulary (word_counter.most_common(VOCAB_NUMBER_OF_WORDS))
add_special_tokens_to_word_vocabulary (word2index, SPECIAL_TOKENS.values())
_save_csv(word2index.items(), ['word', 'index'], 'subset_word_vocabulary.csv')

index2word = build_reversed_dictionary (word2index)

# Placeholder keys for the URLs and e-mail addresses collected from the corpus.
informative_tokens_structure = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'informative_tokens_structure.pkl'))
informative_tokens_extracted = get_dictionary_of_extracted_informative_tokens(informative_tokens_structure)
_save_csv(informative_tokens_extracted.items(), ['token_key', 'token_value'], 'informative_tokens_extracted.csv')

In [None]:
def _encode_split (files, maximum_length):
  """Encode `files` with the notebook-wide vocabulary and settings, adding start- and end-of-text tokens."""
  return encode_files(files, TEXT_ENCODING, word2index, informative_tokens_extracted, SPECIAL_TOKENS, maximum_length, PADDING_TYPE, TRUNCATING_TYPE, add_eot=True, add_sot=True)

# Resolve the files of every split; documents and summaries are aligned index by index.
training_documents_files, training_summaries_files = get_file_path_pairs_of_documents_and_summaries (training_documents_path, training_summaries_path)
validation_documents_files, validation_summaries_files = get_file_path_pairs_of_documents_and_summaries (validation_documents_path, validation_summaries_path)
testing_documents_files = get_file_paths (testing_documents_path)

# Encode every split to fixed-length index sequences.
training_documents_sequences = _encode_split(training_documents_files, DOCUMENTS_MAXIMUM_SEQUENCE_LENGTH)
training_summaries_sequences = _encode_split(training_summaries_files, SUMMARIES_MAXIMUM_SEQUENCE_LENGTH)
validation_documents_sequences = _encode_split(validation_documents_files, DOCUMENTS_MAXIMUM_SEQUENCE_LENGTH)
validation_summaries_sequences = _encode_split(validation_summaries_files, SUMMARIES_MAXIMUM_SEQUENCE_LENGTH)
testing_documents_sequences = _encode_split(testing_documents_files, DOCUMENTS_MAXIMUM_SEQUENCE_LENGTH)

# Persist the encoded splits so that later cells can reload them without re-encoding.
joblib.dump(training_documents_sequences, os.path.join(IO_DIRECTORY_PATH, 'training_documents_encoded.pkl'))
joblib.dump(training_summaries_sequences, os.path.join(IO_DIRECTORY_PATH, 'training_summaries_encoded.pkl'))
joblib.dump(validation_documents_sequences, os.path.join(IO_DIRECTORY_PATH, 'validation_documents_encoded.pkl'))
joblib.dump(validation_summaries_sequences, os.path.join(IO_DIRECTORY_PATH, 'validation_summaries_encoded.pkl'))
joblib.dump(testing_documents_sequences, os.path.join(IO_DIRECTORY_PATH, 'testing_documents_encoded.pkl'))

['/content/drive/My Drive/FNS-2020/testing_documents_encoded.pkl']

In [None]:
# Reload the encoded splits written by the encoding cell.
training_documents_sequences = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'training_documents_encoded.pkl'))
training_summaries_sequences = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'training_summaries_encoded.pkl'))
validation_documents_sequences = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'validation_documents_encoded.pkl'))
validation_summaries_sequences = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'validation_summaries_encoded.pkl'))
testing_documents_sequences = joblib.load(os.path.join(IO_DIRECTORY_PATH, 'testing_documents_encoded.pkl'))

print(f'Number of training documents: {len(training_documents_sequences)}')
print(f'Number of training summaries: {len(training_summaries_sequences)}')
print(f'Number of validation documents: {len(validation_documents_sequences)}')
print(f'Number of validation summaries: {len(validation_summaries_sequences)}')
print(f'Number of testing documents: {len(testing_documents_sequences)}')

# Only the first five rows are displayed, so slice before building the frame
# instead of materialising a DataFrame of the whole split.
print('Examples of encoded training documents')
print(pd.DataFrame(training_documents_sequences[:5]).head())
print('Examples of encoded training summaries')
print(pd.DataFrame(training_summaries_sequences[:5]).head())

# Printing the complete dictionaries floods the cell output with thousands of
# entries; show a short preview (the full vocabulary is exported to CSV).
VOCABULARY_PREVIEW_SIZE = 10

print(f'Vocabulary (word2index), first {VOCABULARY_PREVIEW_SIZE} entries: {list(word2index.items())[:VOCABULARY_PREVIEW_SIZE]}')
print(f'Vocabulary (index2word), first {VOCABULARY_PREVIEW_SIZE} entries: {list(index2word.items())[:VOCABULARY_PREVIEW_SIZE]}')
print(f'Vocabulary size: {len(word2index)}')

Number of training documents: 9873
Number of training summaries: 9873
Number of validation documents: 1250
Number of validation summaries: 1250
Number of testing documents: 500
Examples of encoded training documents
    0      1     2     3     4     5     ...  2994  2995  2996  2997  2998   2999
0  10002    833   528   139    64    59  ...  1921  4650   508   901   421  10003
1  10002    833   528   139    64    59  ...  1921  4650   508   901   421  10003
2  10002    833   528   139    64    59  ...  1921  4650   508   901   421  10003
3  10002    833   528   139    64    59  ...  1921  4650   508   901   421  10003
4  10002  10000   574   119   508    52  ...     2    14    15    47    72  10003

[5 rows x 3000 columns]
Examples of encoded training summaries
     0      1      2    3    4    5    ...    994    995    996    997    998    999
0  10002   1229  10000  574  119  508  ...   1010      2      6      0    399  10003
1  10002   1597  10000  574  119  508  ...  10001  10001  

In [None]:
# Training and validation examples are (document, summary) pairs, so shuffling keeps each pair together.
training_dataset = tf.data.Dataset.from_tensor_slices((training_documents_sequences, training_summaries_sequences)).shuffle(SHUFFLE_BUFFER_SIZE)
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_documents_sequences, validation_summaries_sequences)).shuffle(SHUFFLE_BUFFER_SIZE)

# The testing split is NOT shuffled: its elements must stay in the order of
# `testing_documents_files`, otherwise a prediction cannot be mapped back to
# the document it was generated from.
testing_dataset = tf.data.Dataset.from_tensor_slices((testing_documents_sequences))

training_dataset = training_dataset.batch(BATCH_SIZE)
validation_dataset = validation_dataset.batch(BATCH_SIZE)
testing_dataset = testing_dataset.batch(BATCH_SIZE)

In [None]:
# Path at which the unpacked embeddings file is expected.
# NOTE(review): assumes the archive contains a top-level member named 'embeddings.vec' — confirm.
extracted_embeddings_file_path = os.path.join(IO_DIRECTORY_PATH, 'embeddings.vec')

# Delete a previously extracted copy before unpacking the archive again.
if os.path.exists(extracted_embeddings_file_path): os.remove(extracted_embeddings_file_path)

extract_gzip(EMBEDDINGS_FILE_PATH, IO_DIRECTORY_PATH)

# token -> vector dictionary plus the count and dimension announced by the file header.
word_embeddings, total_embeddings, embeddings_dims = load_pretrained_embeddings(extracted_embeddings_file_path, TEXT_ENCODING)

# Rows of words without a pretrained vector keep their random initialisation.
n_embeddings_loaded, embedding_matrix = create_embedding_matrix(word_embeddings, word2index, embeddings_dims)

# NOTE(review): the denominator is VOCAB_NUMBER_OF_WORDS, while `word2index` also contains the special tokens.
print(f'Successfully loaded pretrained embeddings for {n_embeddings_loaded} / {VOCAB_NUMBER_OF_WORDS} words.')

Successfully loaded pretrained embeddings for 9636 / 10000 words.


In [None]:
# Reset the global state kept by Keras (presumably so that models built afterwards start from a clean slate — confirm).
tf.keras.backend.clear_session()