## Imports

In [None]:
import os
import struct
import collections
import csv
import numpy as np
import pandas as pd
import nltk
from tensorflow.core.example import example_pb2

In [None]:
# word_tokenize relies on the Punkt sentence models. Recent NLTK releases load
# them from the 'punkt_tab' package while older ones use 'punkt', so request
# both; a package unknown to the installed version is expected to only report
# an error and return False (NOTE(review): confirm on the target NLTK version).
for punkt_resource in ('punkt', 'punkt_tab'):
  nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Root of the project. Defaults to the Colab Drive mount; set the
# SUVIDHA_PROJECT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
project_dir = os.environ.get(
    "SUVIDHA_PROJECT_DIR",
    "/content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project",
)
data_dir = os.path.join(project_dir, "data") # This contains raw, interim, preprocessed and external folders
raw_dir = os.path.join(data_dir, 'raw') # Raw data directory containing the train.csv, validation.csv, test.csv
interim_dir = os.path.join(data_dir, "interim") # This contains cnn/stories and dailymail/stories, and will contain the tokenized stories
preprocessed_dir = os.path.join(data_dir, "preprocessed") # It will contain finished_files folder
external_dir = os.path.join(data_dir, "external") # This contains stanford corenlp module and url_lists directory

## Define paths and other variables needed

In [None]:
# Unicode right single / right double quotation marks
dm_single_close_quote = '\u2019'
dm_double_close_quote = '\u201d'

# Tokens treated as sentence terminators
END_TOKENS = [
    '.', '!', '?', '...', "'", "`", '"',
    dm_single_close_quote, dm_double_close_quote,
]

# Raw input CSVs, one per split
train_file, val_file, test_file = (
    os.path.join(raw_dir, csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

# Tokenized CSVs, written to the interim directory
tokenized_train_path, tokenized_val_path, tokenized_test_path = (
    os.path.join(interim_dir, csv_name)
    for csv_name in ('tokenized_train.csv', 'tokenized_val.csv', 'tokenized_test.csv')
)

# Sentence boundary markers; the writer functions leave them out of the vocabulary count
SENTENCE_START, SENTENCE_END = '[START]', '[END]'

# Number of rows each raw CSV is expected to contain
expected_train_rows, expected_val_rows, expected_test_rows = 287113, 13368, 11490

# Output locations for the finished files and their chunked copies
finished_dir = os.path.join(preprocessed_dir, "finished_files")
chunks_dir = os.path.join(finished_dir, "chunks")

# Maximum number of vocabulary entries written to the vocab file
VOCAB_SIZE = 200_000

# Number of examples per chunk file
CHUNK_SIZE = 1_000

In [None]:
# Load the raw training split to explore its structure
df = pd.read_csv(train_file)

In [None]:
# (rows, columns) of the raw training frame
df.shape

(287113, 3)

In [None]:
# Keep the first rows returned by head() as a small sample for experimenting
small_csv = df.head()

In [None]:
# Display the sample: id, article and highlights columns
small_csv

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# Drop the first column (id) so only the two text columns remain
small_csv.iloc[:, 1:]

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# With axis=1 the function receives one row at a time; each row holds two values
small_csv.iloc[:, 1:].apply(lambda x: x.shape, axis=1)

0    (2,)
1    (2,)
2    (2,)
3    (2,)
4    (2,)
dtype: object

In [None]:
# Tokenize both text columns of every row. The rows are labelled by column
# name, so positional access goes through .iloc (plain x[0] on such a row is a
# deprecated positional fallback in pandas 2.x).
small_csv.iloc[:, 1:].apply(
    lambda x: (nltk.tokenize.word_tokenize(x.iloc[0]), nltk.tokenize.word_tokenize(x.iloc[1])),
    axis=1,
)

0    ([By, ., Associated, Press, ., PUBLISHED, :, ....
1    ([(, CNN, ), --, Ralph, Mata, was, an, interna...
2    ([A, drunk, driver, who, killed, a, young, wom...
3    ([(, CNN, ), --, With, a, breezy, sweep, of, h...
4    ([Fleetwood, are, the, only, team, still, to, ...
dtype: object

In [None]:
def tokenize_row(x):
  """Tokenize the article and highlight held in one row.

  Args:
    x: A row with the article first and the highlight second; either a pandas
      Series (as passed by `DataFrame.apply(axis=1)`) or a plain sequence.

  Returns:
    Tuple `(article, highlight)` in which each text was split with NLTK's
    `word_tokenize` and re-joined with single spaces.
  """
  word_tokenizer = nltk.tokenize.word_tokenize

  # Rows produced by DataFrame.apply are labelled by column name, so integer
  # keys must go through .iloc; x[0] on such a Series is a deprecated
  # positional fallback. Plain sequences keep using ordinary indexing.
  if hasattr(x, 'iloc'):
    article, highlight = x.iloc[0], x.iloc[1]
  else:
    article, highlight = x[0], x[1]

  return ' '.join(word_tokenizer(article)), ' '.join(word_tokenizer(highlight))

# result_type='broadcast' writes the returned pair back into the original two
# columns, so the result keeps the input's index and column names
small_csv.iloc[:, 1:].apply(tokenize_row, axis=1, result_type='broadcast')

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED : . 14:11 ES...,"Bishop John Folda , of North Dakota , is takin..."
1,( CNN ) -- Ralph Mata was an internal affairs ...,Criminal complaint : Cop used his role to help...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd , 27 , had drunk at least..."
3,( CNN ) -- With a breezy sweep of his pen Pres...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


## Function to check that the train, validation and test CSVs contain the expected number of stories

In [None]:
def check_num_stories(csv_path, expected_rows):
  """Verify that a CSV file holds the expected number of stories (data rows).

  Args:
    csv_path: Path of the CSV file to inspect.
    expected_rows: Number of data rows the file must contain.

  Raises:
    Exception: If the number of rows differs from `expected_rows`.
  """
  # Only the row count matters, so keep a single column instead of loading the
  # full article text into memory.
  num_stories = len(pd.read_csv(csv_path, usecols=[0]))
  if num_stories != expected_rows:
    raise Exception("The csv file %s contains %i rows/stories but should contain %i" % (csv_path, num_stories, expected_rows))

## Function to tokenize the stories

In [None]:
def tokenize_stories(csv_file_path, tokenized_csv_file_path):
  """Tokenize every story of a raw CSV and save the result as a new CSV.

  The first column of the input is dropped; the two remaining text columns are
  split with NLTK's `word_tokenize` and re-joined with single spaces.

  Args:
    csv_file_path: Path of the raw CSV to read.
    tokenized_csv_file_path: Path the tokenized CSV is written to.

  Raises:
    Exception: If the written CSV does not contain as many rows as the input.
  """
  # Drop the first column and keep the text columns only
  stories = pd.read_csv(csv_file_path).iloc[:, 1:]
  # Remember the input size now instead of parsing the source file a second time
  num_orig = len(stories)

  word_tokenizer = nltk.tokenize.word_tokenize

  def tokenize_row(x):
    # Rows are labelled by column name, so positional access must use .iloc
    # (x[0] on such a Series is a deprecated positional fallback).
    tok_article = ' '.join(word_tokenizer(x.iloc[0]))
    tok_highlight = ' '.join(word_tokenizer(x.iloc[1]))
    return tok_article, tok_highlight

  tokenized_stories = stories.apply(tokenize_row, axis=1, result_type='broadcast')
  tokenized_stories.to_csv(tokenized_csv_file_path)

  # Re-read what was written to confirm that no story got lost; one column is
  # enough to count the rows
  num_tokenized = len(pd.read_csv(tokenized_csv_file_path, usecols=[0]))

  if num_orig != num_tokenized:
    # Adjacent literals avoid the stray indentation a backslash continuation
    # would embed in the message
    raise Exception(
        "The tokenized stories csv %s contains %i rows, but it should contain "
        "the same number as %s (which has %i rows). Was there an error during "
        "tokenization?" % (tokenized_csv_file_path, num_tokenized, csv_file_path, num_orig))

  # If no problem occurs
  print("Successfully finished tokenizing %s to %s.\n" % (csv_file_path, tokenized_csv_file_path))

## Function to process the articles and highlights properly for the model

In [None]:
# Show the raw sample again as the starting point for the processing steps below
small_csv

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# Tokenize the text columns of the sample with the module-level tokenize_row
tok_small_csv = small_csv.iloc[:, 1:].apply(tokenize_row, axis=1, result_type='broadcast')
tok_small_csv

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED : . 14:11 ES...,"Bishop John Folda , of North Dakota , is takin..."
1,( CNN ) -- Ralph Mata was an internal affairs ...,Criminal complaint : Cop used his role to help...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd , 27 , had drunk at least..."
3,( CNN ) -- With a breezy sweep of his pen Pres...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# Convert to a NumPy array: one row per story, one column per text field
stories = tok_small_csv.to_numpy()
stories.shape

(5, 2)

In [None]:
# Column 0 holds the articles, column 1 the highlights (used as abstracts)
articles, abstracts = stories[:, 0], stories[:, 1]
articles, abstracts

In [None]:
# Lower-case every article; note this rebinds `articles` from an array to a list
articles = [article.lower() for article in articles]
articles

In [None]:
# Inspect the first tokenized highlight (not lower-cased at this point)
abstracts[0]

'Bishop John Folda , of North Dakota , is taking time off after being diagnosed . He contracted the infection through contaminated food in Italy . Church members in Fargo , Grand Forks and Jamestown could have been exposed .'

In [None]:
def to_article_abstract(tokenized_story):
  """Lower-case a tokenized story.

  Args:
    tokenized_story: Indexable whose item 0 is the tokenized article and whose
      item 1 is the tokenized highlight.

  Returns:
    Tuple `(article, abstract)` with both strings lower-cased.
  """
  # Only the first two positions are read; anything after them is ignored
  return tuple(tokenized_story[position].lower() for position in (0, 1))

## Function to write the article and abstract of each story in binary form and save them

In [None]:
# Iterating over .index yields the row labels of the sample
for i in small_csv.index:
  print(i)

0
1
2
3
4


In [None]:
def write_to_bin(tokenized_csv_file_path, out_file, makevocab=False):
  """Serialize tokenized stories into a length-prefixed binary file.

  Every story is lower-cased with `to_article_abstract`, stored in an
  `example_pb2.Example` under the `article` and `abstract` byte features and
  appended to `out_file` as a `struct` 'q' length followed by the serialized
  example.

  Args:
    tokenized_csv_file_path: CSV with `article` and `highlights` columns.
    out_file: Path of the binary file to write.
    makevocab: When True, also count the tokens of all stories and write the
      `VOCAB_SIZE` most common ones to `<finished_dir>/vocab`, one
      `word count` pair per line.
  """
  print("Making bin file for stories listed in %s..." % tokenized_csv_file_path)

  stories = pd.read_csv(tokenized_csv_file_path)
  num_stories = len(stories)

  vocab_counter = collections.Counter() if makevocab else None

  with open(out_file, 'wb') as writer:
    # read_csv assigns a 0..n-1 index, so the enumerate position equals the row
    # label used for the progress report
    for idx, story in enumerate(zip(stories['article'], stories['highlights'])):
      # Report progress every 1000 stories
      if idx % 1000 == 0:
        print("Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)))

      article, abstract = to_article_abstract(story)

      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([article.encode('utf-8')])
      tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode('utf-8')])

      tf_example_str = tf_example.SerializeToString()

      # Record layout: length prefix, then the serialized example itself
      writer.write(struct.pack('q', len(tf_example_str)))
      writer.write(tf_example_str)

      if makevocab:
        art_tokens = article.split(' ')
        abs_tokens = [t for t in abstract.split(' ') if t not in (SENTENCE_START, SENTENCE_END)]

        # Strip surrounding whitespace and drop tokens that end up empty
        stripped = (t.strip() for t in art_tokens + abs_tokens)
        vocab_counter.update(t for t in stripped if t != "")

  print("Finished writing file %s\n" % out_file)

  if makevocab:
    # Tokens can contain non-ASCII characters (e.g. typographic quotes); an
    # explicit encoding keeps the vocab file independent of the platform default
    with open(os.path.join(finished_dir, "vocab"), 'w', encoding='utf-8') as writer:
      print("Writing vocab file...")

      for word, count in vocab_counter.most_common(VOCAB_SIZE):
        writer.write(word + ' ' + str(count) + '\n')

    print("Finished writing vocab file")

## Function to break the .bin files into chunks

In [None]:
def chunk_file(set_name):
  """Split `<finished_dir>/<set_name>.bin` into files of at most CHUNK_SIZE examples.

  The input is read as a sequence of records, each an 8-byte length (struct
  format 'q') followed by that many bytes. Records are copied unchanged into
  `<chunks_dir>/<set_name>-NNN.bin`, numbered from 000.

  Args:
    set_name: Name of the data split, e.g. "train".

  Raises:
    struct.error: If the input ends in the middle of a record.
  """
  in_file = os.path.join(finished_dir, set_name+".bin")
  chunk = 0

  # `with` closes the input even when a malformed record raises
  with open(in_file, 'rb') as reader:
    len_bytes = reader.read(8)

    # A chunk file is opened only once a record is known to exist, so an input
    # whose record count is a multiple of CHUNK_SIZE (or an empty input) leaves
    # no empty trailing chunk file behind
    while len_bytes:
      chunk_path = os.path.join(chunks_dir, "%s-%03d.bin" % (set_name, chunk))

      with open(chunk_path, 'wb') as writer:
        for _ in range(CHUNK_SIZE):
          str_len = struct.unpack('q', len_bytes)[0]
          # Unpacking with an 's' format checks that exactly str_len bytes were read
          example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]

          writer.write(struct.pack('q', str_len))
          writer.write(struct.pack('%ds' % str_len, example_str))

          # Look ahead so the loops stop as soon as the input is exhausted
          len_bytes = reader.read(8)
          if not len_bytes:
            break

      chunk += 1


def chunk_all():
  """Split the train, val and test .bin files into chunk files under `chunks_dir`."""
  # Create exactly the directory chunk_file() writes into, rather than
  # rebuilding the path by hand; exist_ok makes repeated runs a no-op
  os.makedirs(chunks_dir, exist_ok=True)

  for set_name in ("train", "val", "test"):
    print("Splitting %s data into chunks..." % set_name)
    chunk_file(set_name)

  print("Saved chunked data in %s..." % chunks_dir)

## Write to a csv file for train, val and test

In [None]:
def write_to_csv(tokenized_csv_file_path, out_csv, makevocab=False):
  """Write the lower-cased stories of a tokenized CSV to a two-column CSV.

  Args:
    tokenized_csv_file_path: CSV with `article` and `highlights` columns.
    out_csv: Path of the CSV to write; it gets the header `article,highlights`.
    makevocab: When True, also count the tokens of all stories and write the
      `VOCAB_SIZE` most common ones to `<finished_dir>/vocab`, one
      `word count` pair per line.
  """
  print("Making csv file for stories listed in %s..." % tokenized_csv_file_path)

  stories = pd.read_csv(tokenized_csv_file_path)
  num_stories = len(stories)

  vocab_counter = collections.Counter() if makevocab else None

  # The csv module expects files opened with newline='' (otherwise some
  # platforms emit an extra '\r' per row); the explicit encoding keeps
  # non-ASCII tokens independent of the platform default
  with open(out_csv, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write the headers
    csvwriter.writerow(['article', 'highlights'])

    # read_csv assigns a 0..n-1 index, so the enumerate position equals the row
    # label used for the progress report
    for idx, story in enumerate(zip(stories['article'], stories['highlights'])):
      # Report progress every 1000 stories
      if idx % 1000 == 0:
        print("Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)))

      article, abstract = to_article_abstract(story)

      csvwriter.writerow([article, abstract])

      if makevocab:
        art_tokens = article.split(' ')
        abs_tokens = [t for t in abstract.split(' ') if t not in (SENTENCE_START, SENTENCE_END)]

        # Strip surrounding whitespace and drop tokens that end up empty
        stripped = (t.strip() for t in art_tokens + abs_tokens)
        vocab_counter.update(t for t in stripped if t != "")

  print("Finished writing file %s\n" % out_csv)

  if makevocab:
    with open(os.path.join(finished_dir, "vocab"), 'w', encoding='utf-8') as writer:
      print("Writing vocab file...")

      for word, count in vocab_counter.most_common(VOCAB_SIZE):
        writer.write(word + ' ' + str(count) + '\n')

    print("Finished writing vocab file")

## Tokenize the stories, convert them to binary, save them, and make chunks

In [None]:
# Check that each raw CSV contains the expected number of stories (rows)
check_num_stories(train_file, expected_train_rows)
check_num_stories(val_file, expected_val_rows)
check_num_stories(test_file, expected_test_rows)

# Create the output directory if it does not exist yet
if not os.path.exists(finished_dir): os.makedirs(finished_dir)

# Tokenize each raw CSV with NLTK, writing the tokenized CSVs to the interim directory
tokenize_stories(train_file, tokenized_train_path)
tokenize_stories(val_file, tokenized_val_path)
tokenize_stories(test_file, tokenized_test_path)

# Read the tokenized stories, lower-case them, then write to bin files
# (the vocab file is built from the training split only)
write_to_bin(tokenized_train_path, os.path.join(finished_dir, "train.bin"), makevocab=True)
write_to_bin(tokenized_val_path, os.path.join(finished_dir, "val.bin"))
write_to_bin(tokenized_test_path, os.path.join(finished_dir, "test.bin"))

# Split every .bin file into chunk files of CHUNK_SIZE examples
chunk_all()

Successfully finished tokenizing /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/train.csv to /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_train.csv.

Successfully finished tokenizing /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/validation.csv to /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_val.csv.

Successfully finished tokenizing /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/test.csv to /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_test.csv.

Making bin file for stories listed in /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/train.csv...
Writing story 0 of 287113; 0.00 percent done
Writing story 1000 of 287113; 0.35 percent done
Writing story 2000 of 287113; 0.70 percent done
Writing story 3000 of 287113; 1.04 per

In [None]:
# Check that each raw CSV contains the expected number of stories (rows)
# NOTE(review): this cell repeats the row check and the tokenization of the
# previous cell; only the final write step differs
check_num_stories(train_file, expected_train_rows)
check_num_stories(val_file, expected_val_rows)
check_num_stories(test_file, expected_test_rows)

# Create the output directory if it does not exist yet
if not os.path.exists(finished_dir): os.makedirs(finished_dir)

# Tokenize each raw CSV with NLTK, writing the tokenized CSVs to the interim directory
tokenize_stories(train_file, tokenized_train_path)
tokenize_stories(val_file, tokenized_val_path)
tokenize_stories(test_file, tokenized_test_path)

# Read the tokenized stories, lower-case them, then write to csv files
# (the vocab file is built from the training split only)
write_to_csv(tokenized_train_path, os.path.join(finished_dir, "final_train.csv"), makevocab=True)
write_to_csv(tokenized_val_path, os.path.join(finished_dir, "final_val.csv"))
write_to_csv(tokenized_test_path, os.path.join(finished_dir, "final_test.csv"))

Successfully finished tokenizing /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/train.csv to /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_train.csv.

Successfully finished tokenizing /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/validation.csv to /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_val.csv.

Successfully finished tokenizing /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/raw/test.csv to /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_test.csv.

Making csv file for stories listed in /content/drive/MyDrive/Projects/Suvidha-Foundation-Internship-Project/data/interim/tokenized_train.csv...
Writing story 0 of 287113; 0.00 percent done
Writing story 1000 of 287113; 0.35 percent done
Writing story 2000 of 287113; 0.70 percent done
Writing story 3000 of 28