# Word2Box: Exploratory Data Analysis and Preprocessing

Author: Bigarella Chiara

# 1 - Setup


In [None]:
# Replace whatever torchtext is preinstalled with the pinned 0.6.0 release.
# NOTE(review): presumably pinned because the helpers in this notebook use
# `torchtext.data.Field` / `LanguageModelingDataset.splits` — confirm.
! pip uninstall torchtext -y

! pip install torchtext==0.6.0

Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchtext==0.6.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchtext==0.6.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchtext==0.6.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->torchtext==0.6.0)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12=

In [None]:
import math
import re

from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torchtext

%matplotlib inline

# Global seaborn theme applied to every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# Custom colour cycle; the set_palette call below is issued after sns.set,
# so these colours are the ones in effect for subsequent plots.
COLORS_PALETTE = ["#d952a1", "#3C9BFA",  "#04C896",  "#FCB405",  "#7C5CFF", "#FC8204"]
sns.set_palette(sns.color_palette(COLORS_PALETTE))

In [None]:
from google.colab import drive

# Mount Google Drive so that the /content/drive/... paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that the dataset files are read from.
DRIVE_DATA_FOLDER ='/content/drive/MyDrive/Uni/word2box/data/'

# Destination (under /content, outside Drive) of the preprocessed word stream.
PREPROCESSED_TEXT_PATH = '/content/enwik8-preprocessed.txt'

## Helper functions

In [None]:
# Exploratory Data Analysis

def get_text_stats(data_folder, filename):
  '''Build a torchtext vocabulary from a single text file.

  Despite the name, no statistics are computed here: the file is loaded as a
  language-modeling dataset (train split only) and the field that was used
  to load it is returned after its vocabulary has been built.

  Args:
    data_folder: directory that contains the file.
    filename: name of the file inside `data_folder`.

  Returns:
    The `torchtext.data.Field` on which `build_vocab` was called.
  '''
  field = torchtext.data.Field()

  # Only a train split is requested; validation and test are disabled.
  splits = torchtext.datasets.LanguageModelingDataset.splits(
      text_field=field,
      path=data_folder,
      train=filename,
      validation=None,
      test=None,
  )
  train_split = splits[0]

  field.build_vocab(train_split)
  return field

def average_word_length(df, count):
  '''Return the unweighted mean length of the entries in df["word_type"].

  Every word type contributes once, regardless of its frequency. The total
  number of characters is divided by `count`, which the caller supplies.
  '''
  total_chars = 0
  for word_type in df["word_type"]:
    total_chars += len(word_type)
  return total_chars / count

def weighted_average_word_length(df):
  '''Return the frequency-weighted mean word length.

  Each entry of df['word_type'] contributes its length multiplied by the
  matching df['freq'] value; the total is divided by the sum of df['freq'].

  Words and frequencies are paired row by row (positionally), so the result
  does not depend on the index labels of `df` and stays correct when labels
  repeat (e.g. after concatenating frames).

  Raises:
    ZeroDivisionError: if the frequencies sum to zero (including an empty df).
  '''
  weighted_chars = sum(
      len(word) * freq for word, freq in zip(df['word_type'], df['freq'])
  )
  return weighted_chars / sum(df['freq'])

In [None]:
# Preprocessing

def preprocess(text, rare_threshold=5):
  '''Turn raw text into a filtered list of word and punctuation-tag tokens.

  Steps, in order:
    1. punctuation marks and newlines are replaced by space-padded tags
       such as <PERIOD> or <NEW_LINE>, so they become tokens of their own;
    2. every run of digits is replaced by the <NUM> tag;
    3. the text is split on whitespace;
    4. tokens occurring `rare_threshold` times or fewer are dropped;
    5. stop words are dropped.

  The text is not lowercased here, and the stop words are all lowercase.

  Args:
    text: the string to process.
    rare_threshold: a token is kept only if it occurs more than this many
      times in `text` (counted before stop-word removal). Defaults to 5.

  Returns:
    The surviving tokens as a list, in their original order.
  '''
  # Order matters: multi-character marks must be tagged before the single
  # characters they are made of (quote runs before the apostrophe, the
  # ellipsis before the period).
  replacements = (
    ("'''", ' <QUOTATION_MARK> '),
    ("''", ' <QUOTATION_MARK> '),
    ('...', ' <ELLIPSIS> '),
    ("'", ' <APOSTROPH> '),
    ('.', ' <PERIOD> '),
    (',', ' <COMMA> '),
    ('"', ' <QUOTATION_MARK> '),
    (';', ' <SEMICOLON> '),
    (':', ' <COLON> '),
    ('!', ' <EXCLAMATION_MARK> '),
    ('?', ' <QUESTION_MARK> '),
    ('(', ' <LEFT_PAREN> '),
    (')', ' <RIGHT_PAREN> '),
    ('--', ' <HYPHENS> '),
    ('\n', ' <NEW_LINE> '),
  )
  for mark, tag in replacements:
    text = text.replace(mark, tag)

  # Any run of digits collapses into a single number tag.
  text = re.sub(r"[0-9]+", ' <NUM> ', text)

  # Split the text into individual tokens (words and tags).
  words = text.split()

  # Drop rare tokens to shrink the vocabulary: with the default threshold a
  # token must occur at least six times to be kept.
  word_counts = Counter(words)
  trimmed_words = [word for word in words if word_counts[word] > rare_threshold]

  # Common words that carry little meaning. Stored as a frozenset so the
  # membership test below is a hash lookup instead of a scan per token.
  # Entries containing an apostrophe can never match, because apostrophes
  # were already turned into <APOSTROPH> tags above; they are kept so the
  # list stays identical to the original one.
  stop = frozenset([
  "a", "about", "above", "after", "again", "against", "all", "also", "altough", "am", "an", "and", "any", "are", "aren't", "as", "at",
  "b", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
  "c", "can", "can't", "cannot", "could", "couldn't",
  "d", "de", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
  "e", "each", "either", "even",
  "f", "few", "for", "from", "further",
  "g",
  "h", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's",
  "hers", "herself", "him", "himself", "his", "how", "how's", "however",
  "i", "i'd", "i'll", "i'm", "i've", "if", "ii", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
  "j", "just",
  "k",
  "l", "like",
  "m", "many", "may", "me", "more", "most", "much", "must", "my", "myself",
  "n", "nd", "neither", "no", "nor", "not", "now",
  "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own",
  "p",
  "q",
  "r", "rd",
  "s", "same", "shall", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
  "t", "th", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
  "they'd", "they'll", "they're", "they've", "this", "those", "though", "through", "to", "too",
  "u", "under", "until", "up", "us",
  "v", "very",
  "w", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where",
  "where's", "which", "while", "who", "who's", "whom", "why", "why's", "will", "with", "won't", "would", "wouldn't",
  "x",
  "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
  "z",
  "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve"
  ])

  return [w for w in trimmed_words if w not in stop]


def save_words(words, path, mode="w"):
  '''Write `words` to `path` as a single space-separated stream.

  Every word is followed by one space (so the file ends with a trailing
  space); no newline is written.

  Args:
    words: the strings to write, in order.
    path: destination file.
    mode: file mode handed to `open`. Defaults to "w" so that re-running a
      cell overwrites the file instead of appending a second copy of the
      data to it; pass "a" to append.
  '''
  with open(path, mode) as f:
    f.writelines(word + ' ' for word in words)

def save_lines(lines, path, mode="w"):
  '''Write `lines` to `path`, one entry per line.

  A newline is appended to every entry, including the last one.

  Args:
    lines: the strings to write, in order.
    path: destination file.
    mode: file mode handed to `open`. Defaults to "w" so that re-running a
      cell overwrites the file instead of appending a second copy of the
      data to it; pass "a" to append.
  '''
  with open(path, mode) as f:
    f.writelines(line + '\n' for line in lines)

# 2 - Datasets

* **Enwik8** (100 MB): https://mattmahoney.net/dc/textdata.html
  - text extracted from English Wikipedia
* **Text8** (100 MB): https://mattmahoney.net/dc/textdata.html
  - preprocessing steps have been applied to Enwik8, such as:
    - tags, links, images, icons, tables, markup removal
    - lowercase conversion
    - punctuation removal
    - spaces removal
    - digits have been transformed into text
* **Penn Treebank** (ptb): https://paperswithcode.com/dataset/penn-treebank and https://huggingface.co/datasets/ptb_text_only


Since *Text8* doesn't have any punctuation, I decided to start from *Enwik8* and apply some of the preprocessing steps from the Perl script at the bottom of https://mattmahoney.net/dc/textdata.html. I then applied further preprocessing steps; the code is below.

## Text8

In [None]:
text8_path = DRIVE_DATA_FOLDER + 'text8.txt'

# Load the whole corpus; "words" are its whitespace-separated tokens.
with open(text8_path) as f:
  text8 = f.read()
text8_words = text8.split()

# Preview: ten consecutive 115-character chunks from the start of the file.
print('******************')
print('* Dataset sample *')
print('******************\n')
with open(text8_path) as f:
  for i in range(10):
    print(f'{i}: {f.read(115)}')

# Corpus size: running tokens versus distinct word types.
print('\n*****************')
print('* Dataset stats *')
print('*****************\n')
print(f'Total words in text: {len(text8_words)}')
print(f'Unique words in text: {len(set(text8_words))}')

******************
* Dataset sample *
******************

0:  anarchism originated as a term of abuse first used against early working class radicals including the diggers of t
1: he english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative 
2: way to describe any act that used violent means to destroy the organization of society it has also been taken up as
3:  a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chi
4: ef king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished altho
5: ugh there are differing interpretations of what this means anarchism also refers to related social movements that a
6: dvocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists us
7: e it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place o

## Enwik8

In [None]:
enwik8_path = DRIVE_DATA_FOLDER + 'enwik8'

# Load the whole raw dump; "words" are its whitespace-separated tokens, so
# whatever markup the file contains is counted as well.
with open(enwik8_path) as f:
  enwik8 = f.read()
enwik8_words = enwik8.split()

# Preview: ten consecutive 115-character chunks from the start of the file.
print('******************')
print('* Dataset sample *')
print('******************\n')
with open(enwik8_path) as f:
  for i in range(10):
    print(f'{i}: {f.read(115)}')

# Corpus size: running tokens versus distinct tokens.
print('\n*****************')
print('* Dataset stats *')
print('*****************\n')
print(f'Total words in text: {len(enwik8_words)}')
print(f'Unique words in text: {len(set(enwik8_words))}')

******************
* Dataset sample *
******************

0: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" x
1: si:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0
2: .3" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <base>http://en.wikipedia.org/wiki/Main_Page
3: </base>
    <generator>MediaWiki 1.6alpha</generator>
    <case>first-letter</case>
      <namespaces>
      <names
4: pace key="-2">Media</namespace>
      <namespace key="-1">Special</namespace>
      <namespace key="0" />
      <na
5: mespace key="1">Talk</namespace>
      <namespace key="2">User</namespace>
      <namespace key="3">User talk</name
6: space>
      <namespace key="4">Wikipedia</namespace>
      <namespace key="5">Wikipedia talk</namespace>
      <na
7: mespace key="6">Image</namespace>
      <namespace key="7">Image talk</namespace>
      <namespace key="8"

## Enwik8 after first part of the preprocessing

The **first part** of the preprocessing consists of:
* tags, links, images, icons, tables, markup removal
* lowercase conversion
* spaces removal


In [None]:
text_path = DRIVE_DATA_FOLDER + 'text_ok.txt'

# Load the output of the first preprocessing stage; "words" are its
# whitespace-separated tokens.
with open(text_path) as f:
  text_ok = f.read()
text_words = text_ok.split()

# Preview: ten consecutive 115-character chunks from the start of the file.
print('******************')
print('* Dataset sample *')
print('******************\n')
with open(text_path) as f:
  for i in range(10):
    print(f'{i}: {f.read(115)}')

# Corpus size: running tokens versus distinct tokens.
print('\n*****************')
print('* Dataset stats *')
print('*****************\n')
print(f'Total words in text: {len(text_words)}')
print(f'Unique words in text: {len(set(text_words))}')

******************
* Dataset sample *
******************

0:  '''anarchism''' originated as a term of abuse first used against early working class radicals including the digger
1: s of the english revolution and the ''sans culottes'' of the french revolution. whilst the term is still used in a 
2: pejorative way to describe '' any act that used violent means to destroy the organization of society '', it has als
3: o been taken up as a positive label by self defined anarchists. the word '''anarchism''' is derived from the greek 
4: '' '' ( without archons (ruler, chief, king) ). anarchism as a political philosophy, is the belief that ''rulers'' 
5: are unnecessary and should be abolished, although there are differing interpretations of what this means. anarchism
6:  also refers to related social movements) that advocate the elimination of authoritarian institutions, particularly
7:  the state. the word anarchy, as most anarchists use it, does not imply chaos, nihilism, or anomie, but ra

# 3 - Preprocessing

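The **second part** of the preprocessing consists of:

* numbers are replaced by a number tag (`' <NUM> '` in the `preprocess` code above; the saved outputs further down show an `N` token instead);
* punctuation marks are replaced by the corresponding tag;
* stop words are removed;
* words with five or fewer occurrences are removed.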

In [None]:
# Run the second preprocessing stage on the cleaned text and store the
# resulting token stream on disk.
preprocessed_text_words = preprocess(text_ok)
save_words(preprocessed_text_words, PREPROCESSED_TEXT_PATH)

# Preview what is on disk: ten consecutive 115-character chunks of the file.
print('******************')
print('* Dataset sample *')
print('******************\n')
with open(PREPROCESSED_TEXT_PATH) as f:
  for i in range(10):
    print(f'{i}: {f.read(115)}')

# Size after filtering: running tokens versus distinct tokens.
print('\n*****************')
print('* Dataset stats *')
print('*****************\n')
print(f'Total words in text: {len(preprocessed_text_words)}')
print(f'Unique words in text: {len(set(preprocessed_text_words))}')

******************
* Dataset sample *
******************

0: <QUOTATION_MARK> anarchism <QUOTATION_MARK> originated term abuse first used early working class radicals including
1:  diggers english revolution <QUOTATION_MARK> sans <QUOTATION_MARK> french revolution <PERIOD> whilst term still use
2: d pejorative way describe <QUOTATION_MARK> act used violent means destroy organization society <QUOTATION_MARK> <CO
3: MMA> taken positive label self defined anarchists <PERIOD> word <QUOTATION_MARK> anarchism <QUOTATION_MARK> derived
4:  greek <QUOTATION_MARK> <QUOTATION_MARK> <LEFT_PAREN> without archons <LEFT_PAREN> ruler <COMMA> chief <COMMA> king
5:  <RIGHT_PAREN> <RIGHT_PAREN> <PERIOD> anarchism political philosophy <COMMA> belief <QUOTATION_MARK> rulers <QUOTAT
6: ION_MARK> unnecessary abolished <COMMA> although differing interpretations means <PERIOD> anarchism refers related 
7: social movements <RIGHT_PAREN> advocate elimination authoritarian institutions <COMMA> particularly state 

## Split into training and validation sets

In [None]:
# Read the dataset

# NOTE(review): this loads the copy of 'enwik8-preprocessed.txt' stored in
# DRIVE_DATA_FOLDER, whereas the preprocessing cell writes its result to
# PREPROCESSED_TEXT_PATH; no cell visible in this notebook copies one to the
# other — confirm the Drive copy is the intended, up-to-date input.
text_path = DRIVE_DATA_FOLDER + 'enwik8-preprocessed.txt'

with open(text_path) as f:
  enwik8_preprocessed = f.read()

In [None]:
# Split the dataset into sentences

# Start a new line after every '<PERIOD> ' (tag followed by a space), so each
# line ends with the <PERIOD> tag of its sentence.
preprocessed = enwik8_preprocessed.replace('<PERIOD> ', '<PERIOD>\n')
preprocessed_lines = preprocessed.splitlines()
# Relative path: the file is created in the current working directory.
save_lines(preprocessed_lines, 'enwik8-preprocessed_with_lines.txt') # save file

print(f"The dataset has {len(preprocessed_lines)} lines.\n")

# Show the first ten sentences together with their line number.
for i in range(0, 10):
  print(i, preprocessed_lines[i])

The dataset has 569331 lines.

0 <QUOTATION_MARK> anarchism <QUOTATION_MARK> originated term abuse first used early working class radicals including diggers english revolution <QUOTATION_MARK> sans <QUOTATION_MARK> french revolution <PERIOD>
1 whilst term still used pejorative way describe <QUOTATION_MARK> act used violent means destroy organization society <QUOTATION_MARK> <COMMA> taken positive label self defined anarchists <PERIOD>
2 word <QUOTATION_MARK> anarchism <QUOTATION_MARK> derived greek <QUOTATION_MARK> <QUOTATION_MARK> <LEFT_PAREN> without archons <LEFT_PAREN> ruler <COMMA> chief <COMMA> king <RIGHT_PAREN> <RIGHT_PAREN> <PERIOD>
3 anarchism political philosophy <COMMA> belief <QUOTATION_MARK> rulers <QUOTATION_MARK> unnecessary abolished <COMMA> although differing interpretations means <PERIOD>
4 anarchism refers related social movements <RIGHT_PAREN> advocate elimination authoritarian institutions <COMMA> particularly state <PERIOD>
5 word anarchy <COMMA> anarchists use <

In [None]:
# Divide the dataset into training and validation sets (80% and 20%)

# 80% of the lines, rounded up, go to training ...
train_num_lines = math.ceil(len(preprocessed_lines)*80/100)
# ... and 20%, rounded down (integer division), to validation.
val_num_lines = len(preprocessed_lines)*20//100


In [None]:
!split -l 455465 /content/enwik8-preprocessed_with_lines.txt

! cp /content/enwik8-preprocessed_with_lines.txt /content/drive/MyDrive/Uni/word2box/data/enwik8-preprocessed_with_lines.txt

! cp /content/xaa /content/drive/MyDrive/Uni/word2box/data/train.txt
! cp /content/xab /content/drive/MyDrive/Uni/word2box/data/val.txt

# 4 - Exploratory Data Analysis

In [None]:
# `text` is the torchtext Field returned by get_text_stats; its `vocab`
# attribute is what the cells below read.
text = get_text_stats(DRIVE_DATA_FOLDER, 'enwik8-preprocessed.txt')

## Word type frequency

In [None]:
# One row per entry of the vocabulary's `freqs` mapping: the token
# ('word_type') and its count ('freq').
freq_df = pd.DataFrame(text.vocab.freqs.items(), columns=['word_type','freq'])
freq_df

Unnamed: 0,word_type,freq
0,<QUOTATION_MARK>,333538
1,anarchism,242
2,originated,424
3,term,5554
4,abuse,467
...,...,...
53689,judogi,6
53690,octopussy,8
53691,mcclory,10
53692,glidrose,11


In [None]:
print(f'Maximum frequency: {freq_df["freq"].max()}')
print(f'Minimum frequency: {freq_df["freq"].min()}')
print(f'Average frequency: {freq_df["freq"].mean()}')
print()
# Divide by the number of rows in freq_df, i.e. the number of word types that
# were actually summed. len(text.vocab) would also count the special tokens
# torchtext adds to the vocabulary, which have no row in freq_df.
print(f'Average Word Length: {average_word_length(freq_df, len(freq_df))}')
print(f'Weighted Average Word Length: {weighted_average_word_length(freq_df)}')

Maximun frequency: 718889
Minimum frequency: 1
Average frequency: 164.97372145863596

Average Word Length: 7.418262067938022
Weighted Average Word Length: 7.1231570114535865


In [None]:
# Frequent words
# nlargest(...).index holds index *labels*, so the rows are selected with the
# label-based .loc rather than the positional .iloc.
frequent_words_idxs = freq_df['freq'].nlargest(20).index
freq_df.loc[frequent_words_idxs]

Unnamed: 0,word_type,freq
29,<COMMA>,718889
17,<PERIOD>,569330
158,N,418395
0,<QUOTATION_MARK>,333538
45,<RIGHT_PAREN>,169757
39,<LEFT_PAREN>,169649
199,<APOSTROPH>,93119
301,<COLON>,84702
103,<SEMICOLON>,31773
5,first,20970


In [None]:
# Rare words
# nsmallest(...).index holds index *labels*, so the rows are selected with the
# label-based .loc rather than the positional .iloc.
rare_words_idxs = freq_df['freq'].nsmallest(20).index
freq_df.loc[rare_words_idxs]

Unnamed: 0,word_type,freq
53693,<eos>,1
147,citium,6
152,regimentation,6
233,girondins,6
355,communitarian,6
489,climaxed,6
808,realign,6
1157,luddites,6
1215,leftism,6
1362,agitators,6


In [None]:
# Rarest tokens among those occurring more than once. The index comes from a
# filtered frame, so it holds labels of freq_df, not positions: select the
# rows with the label-based .loc.
rare_words_idxs = freq_df[freq_df['freq']>1]['freq'].nsmallest(100).index
freq_df.loc[rare_words_idxs]

Unnamed: 0,word_type,freq
147,citium,6
152,regimentation,6
233,girondins,6
355,communitarian,6
489,climaxed,6
...,...,...
8737,archaeologically,6
8760,willey,6
8761,viru,6
8830,topsoil,6


In [None]:
# Rarest tokens among those occurring more than six times. The index comes
# from a filtered frame, so it holds labels of freq_df, not positions: select
# the rows with the label-based .loc.
rare_words_idxs = freq_df[freq_df['freq']>6]['freq'].nsmallest(100).index
freq_df.loc[rare_words_idxs]

Unnamed: 0,word_type,freq
41,archons,7
185,communistic,7
383,expound,7
650,cgt,7
894,collectivized,7
...,...,...
8532,fleshed,7
8562,reductive,7
8563,randal,7
8638,sociolinguistics,7
