In [1]:
import os
import pandas as pd
import pdpipe as pdp
import spacy
import sys

from IPython.display import display
from os.path import join
from pathlib import Path

# The notebook lives one level below the repository root; put the root on
# sys.path so the project-local `notebooks` package can be imported.
project_root = Path('..')
repo_root_abs = os.path.abspath(project_root)
sys.path.append(repo_root_abs)
from notebooks.utils import init_data_dir  # noqa

from notebooks import pipes  # noqa

# NOTE(review): presumably prepares the data directory under the project
# root -- confirm against notebooks/utils.
init_data_dir(project_root)

# Folder holding the preprocessed corpus and the file with the parsed frame.
preprocess_path = join(project_root, Path('data/preprocess'))
corpus_file = join(preprocess_path, Path('bawe_df.hdf5'))

# spaCy English model, handed to the text-splitting pipeline stage later on.
nlp = spacy.load('en_core_web_sm')

# Parsed corpus frame (columns shown in the next cell: author, genre, text).
df = pd.read_hdf(corpus_file)

# Information for the British Academic Written English Corpus

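This notebook gathers information about the British Academic Written English (BAWE) dataset, splits it into training and validation sets, and stores the results under `data/preprocess`. The dataset must already be parsed and stored in `data/preprocess/bawe_df.hdf5`.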

In [2]:
# Display the parsed corpus frame loaded above (pandas truncates the
# rendered rows, so only the head and tail are shown).
df

Unnamed: 0,author,genre,text
0,1,a,Racism is still a problem within our society t...
1,1,b,Official statistics are those produced by eith...
2,1,c,Since the fourteenth century the practice of m...
3,1,d,Much more reproductive choice is now available...
4,2,a,Victorian notions of women's madness were larg...
...,...,...,...
2572,6998,a,E. Warwick Slinn describes dramatic monologue ...
2573,6998,b,Hugh Blair voices an attack on the practices o...
2574,6998,c,"'The first thing to remember about Donne,' wri..."
2575,6998,d,Susan Wiseman calculated that the latest possi...


In [3]:
# Build (or reload) a 50/50 train/validation split of the corpus frame.
#
# resample_splits: when True the split is regenerated and the cached files
#                  are overwritten even if they already exist.
# split_seed:      seed for the random draw, so that regenerating the split
#                  yields the same rows on every run.
resample_splits = True
split_seed = 42

train_df_path = join(preprocess_path, 'bawe_train_df.hdf5')
valid_df_path = join(preprocess_path, 'bawe_valid_df.hdf5')

train_df_exists = os.path.exists(train_df_path)
valid_df_exists = os.path.exists(valid_df_path)

# Resample when either cached file is missing, or when explicitly requested.
if not (train_df_exists and valid_df_exists) or resample_splits:
    print('Resampling...')

    # Draw half of the rows at random (seeded for reproducibility) and order
    # them by author and genre; every row not drawn goes to validation.
    train_df = (df.sample(frac=0.5, random_state=split_seed)
                .sort_values(by=['author', 'genre']))
    valid_df = df.drop(train_df.index)

    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # The two parts must partition the corpus; a mismatch would mean the
    # index-based drop above removed more rows than were sampled.
    assert len(train_df) + len(valid_df) == len(df), \
        'train/validation split does not cover the corpus exactly'

    # mode='w' replaces the file. The default append mode would rewrite the
    # key inside the existing file, and HDF5 does not reclaim the old space,
    # so the file would grow on every resample.
    train_df.to_hdf(train_df_path, key='bawe_train_df', mode='w')
    valid_df.to_hdf(valid_df_path, key='bawe_valid_df', mode='w')
else:
    train_df = pd.read_hdf(train_df_path)
    valid_df = pd.read_hdf(valid_df_path)

print('Train Set:')
display(train_df)
print('Validation Set:')
display(valid_df)

Resampling...
Train Set:


Unnamed: 0,author,genre,text
0,1,a,Racism is still a problem within our society t...
1,1,c,Since the fourteenth century the practice of m...
2,2,a,Victorian notions of women's madness were larg...
3,2,b,The development of feminist thought and action...
4,3,f,While it is certain that the reformation which...
...,...,...,...
1283,6214,b,<abstract><heading>Abstract</heading>Aquaponic...
1284,6215,c,<heading>INTRODUCTION</heading>Seed quality in...
1285,6215,d,<heading>Question 1 Forest Growth Rates</headi...
1286,6215,f,<list></list><abstract><heading>Abstract</head...


Validation Set:


Unnamed: 0,author,genre,text
0,1,b,Official statistics are those produced by eith...
1,1,d,Much more reproductive choice is now available...
2,3,a,"The publication of The Bell Curve in 1994, the..."
3,3,b,"On Friday afternoon, the 26 th of November, I ..."
4,3,c,"The nineteenth century was, until recently, pr..."
...,...,...,...
1284,6215,e,<heading>INTRODUCTION</heading>Both Macrochloa...
1285,6998,a,E. Warwick Slinn describes dramatic monologue ...
1286,6998,b,Hugh Blair voices an attack on the practices o...
1287,6998,d,Susan Wiseman calculated that the latest possi...


In [None]:
# Two project-defined stages chained into one pdpipe pipeline.
# NOTE(review): IDText / SplitText live in notebooks.pipes; judging by the
# file names used when saving, SplitText presumably breaks each text into
# sentences with the spaCy model -- confirm against that module.
stages = [
    pipes.IDText(),
    pipes.SplitText(nlp, show_loading=True),
]
pipeline = pdp.PdPipeline(stages)

# Both splits are replaced by their processed versions, so re-running this
# cell without re-running the split cell feeds processed frames back in.
train_df = pipeline(train_df)
valid_df = pipeline(valid_df)

# flush=True so each caption is emitted before the frame that follows it.
for caption, frame in (('Train set:', train_df),
                       ('Validation set:', valid_df)):
    print(caption, flush=True)
    display(frame)

 61%|######    | 783/1288 [03:50<01:58,  4.28it/s]

In [None]:
# Persist the pipeline output for both splits, one HDF5 file per split.
# mode='w' replaces each file. The default append mode would rewrite the key
# inside the existing file, and HDF5 does not reclaim the old space, so the
# files would grow every time this cell is re-run.
train_df.to_hdf(join(preprocess_path, 'bawe_train_sentences.hdf5'),
                key='bawe_train_sentences', mode='w')
valid_df.to_hdf(join(preprocess_path, 'bawe_valid_sentences.hdf5'),
                key='bawe_valid_sentences', mode='w')

In [None]:
# NOTE(review): scratch arithmetic left in the notebook; what 237192 and 20
# stand for is not stated anywhere in this notebook -- document the
# quantities (or name them as constants) or remove this cell.
237192 / 20