In [1]:
import os
import pandas as pd
import pdpipe as pdp
import spacy
import sys

from IPython.display import display
from os.path import join
from pathlib import Path

# The project root is taken to be the parent of the working directory.  It is
# appended to sys.path *before* the project-local imports below so that the
# `notebooks` package can be resolved; the order of these lines matters.
project_root = Path('..')
sys.path.append(os.path.abspath(project_root))
from notebooks.utils import init_data_dir  # noqa

from notebooks import pipes  # noqa

# NOTE(review): presumably prepares the data directory layout under the
# project root -- confirm against notebooks.utils.init_data_dir.
init_data_dir(project_root)

# Directory holding the preprocessed corpus files this notebook reads/writes.
preprocess_path = join(project_root, Path('data/preprocess'))

# spaCy English model; handed to pipes.SplitText in the sentence-split cell.
nlp = spacy.load('en_core_web_sm')

# Parsed BAWE corpus with the columns `author`, `genre` and `text`.
df = pd.read_hdf(join(preprocess_path, Path('bawe_df.hdf5')))

# Information for the British Academic Written English Corpus

This notebook inspects the BAWE dataset, splits it into training and validation sets, and segments every text into sentences. The dataset should already be parsed and stored in `data/preprocess/bawe_df.hdf5`.

In [2]:
# Preview of the parsed corpus: one row per text with its author and genre.
df

Unnamed: 0,author,genre,text
0,1,a,Racism is still a problem within our society t...
1,1,b,Official statistics are those produced by eith...
2,1,c,Since the fourteenth century the practice of m...
3,1,d,Much more reproductive choice is now available...
4,2,a,Victorian notions of women's madness were larg...
...,...,...,...
2572,6998,a,E. Warwick Slinn describes dramatic monologue ...
2573,6998,b,Hugh Blair voices an attack on the practices o...
2574,6998,c,"'The first thing to remember about Donne,' wri..."
2575,6998,d,Susan Wiseman calculated that the latest possi...


In [3]:
# Set to True to draw a new split even when cached split files already exist.
resample_splits = True
# Fixed seed so that re-running the notebook reproduces the exact same
# train/validation partition (and therefore the same files on disk).
# Change this value deliberately if a different split is wanted.
split_seed = 42

train_df_path = join(preprocess_path, 'bawe_train_df.hdf5')
valid_df_path = join(preprocess_path, 'bawe_valid_df.hdf5')

train_df_exists = os.path.exists(train_df_path)
valid_df_exists = os.path.exists(valid_df_path)

if not (train_df_exists and valid_df_exists) or resample_splits:
    print('Resampling...')

    # 80 % of the rows go to the training set; sorting by author and genre
    # restores a stable row order after the shuffle done by `sample`.
    train_df = (df.sample(frac=0.8, random_state=split_seed)
                .sort_values(by=['author', 'genre']))
    # Everything not sampled for training becomes the validation set.  This
    # must happen before the index is reset, because `drop` matches on the
    # original index labels.
    valid_df = df.drop(train_df.index)

    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    train_df.to_hdf(train_df_path, key='bawe_train_df')
    valid_df.to_hdf(valid_df_path, key='bawe_valid_df')
else:
    # Reuse the split cached by an earlier run.
    train_df = pd.read_hdf(train_df_path)
    valid_df = pd.read_hdf(valid_df_path)

print('Train Set:')
display(train_df)
print('Validation Set:')
display(valid_df)

Resampling...
Train Set:


Unnamed: 0,author,genre,text
0,1,a,Racism is still a problem within our society t...
1,1,b,Official statistics are those produced by eith...
2,1,c,Since the fourteenth century the practice of m...
3,1,d,Much more reproductive choice is now available...
4,2,a,Victorian notions of women's madness were larg...
...,...,...,...
2057,6215,e,<heading>INTRODUCTION</heading>Both Macrochloa...
2058,6215,f,<list></list><abstract><heading>Abstract</head...
2059,6998,c,"'The first thing to remember about Donne,' wri..."
2060,6998,d,Susan Wiseman calculated that the latest possi...


Validation Set:


Unnamed: 0,author,genre,text
0,3,d,"At the eve of the Versailles conference, promo..."
1,3,k,<heading>Introduction</heading>When we talk ab...
2,7,a,<abstract><heading>Summary</heading>The experi...
3,9,b,<heading>Introduction</heading>The study of gr...
4,10,d,The assumption that a gap existed between the ...
...,...,...,...
510,6212,d,<heading>Introduction</heading>The reaction be...
511,6215,c,<heading>INTRODUCTION</heading>Seed quality in...
512,6215,d,<heading>Question 1 Forest Growth Rates</headi...
513,6998,a,E. Warwick Slinn describes dramatic monologue ...


In [4]:
# Two-stage pdpipe pipeline built from the project's own stages.  Judging by
# the rendered result, the output frames are indexed by
# (author, text_id, sentence_position) and carry a single `sentence` column.
# NOTE(review): this cell rebinds train_df / valid_df to the sentence-level
# frames, so it must not be executed twice without re-running the split cell.
pipeline = pdp.PdPipeline([
    pipes.IDText(),
    pipes.SplitText(nlp, show_loading=True),
])

train_df = pipeline(train_df)
valid_df = pipeline(valid_df)

for caption, frame in (('Train set:', train_df),
                       ('Validation set:', valid_df)):
    # flush so the caption is emitted before the rich display that follows
    print(caption, flush=True)
    display(frame)

100%|##########| 2062/2062 [09:32<00:00,  2.59it/s]
  0%|          | 0/515 [00:00<?, ?it/s][A
  0%|          | 1/515 [00:00<02:53,  2.96it/s][A
  0%|          | 2/515 [00:01<05:12,  1.64it/s][A
  1%|          | 3/515 [00:01<03:24,  2.50it/s][A
  1%|          | 4/515 [00:01<02:34,  3.32it/s][A
  1%|          | 5/515 [00:01<02:43,  3.12it/s][A
  1%|1         | 6/515 [00:01<02:18,  3.68it/s][A
  1%|1         | 7/515 [00:02<01:49,  4.63it/s][A
  2%|1         | 8/515 [00:02<02:01,  4.18it/s][A
  2%|1         | 9/515 [00:02<01:52,  4.48it/s][A
  2%|1         | 10/515 [00:02<02:20,  3.58it/s][A
  2%|2         | 11/515 [00:03<02:12,  3.80it/s][A
  2%|2         | 12/515 [00:03<02:15,  3.71it/s][A
  3%|2         | 13/515 [00:03<01:58,  4.22it/s][A
  3%|2         | 14/515 [00:03<02:03,  4.04it/s][A
  3%|2         | 15/515 [00:04<01:51,  4.50it/s][A
  3%|3         | 16/515 [00:04<01:54,  4.37it/s][A
  3%|3         | 17/515 [00:04<02:04,  4.00it/s][A
  3%|3         | 18/515 [00:04

Train set:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
author,text_id,sentence_position,Unnamed: 3_level_1
1,0,0,Racism is still a problem within our society t...
1,0,1,Ethnic minorities are concentrated into certai...
1,0,2,"This can be seen in a survey, carried out in 1..."
1,0,3,"In this essay I will look at what racism is, a..."
1,0,4,In this section I will cover three areas that ...
...,...,...,...
6998,2,175,"Yerma never wants the life of a man, yet she i..."
6998,2,176,She resents her ability to exercise new freedo...
6998,2,177,"However, Hedda resents her inactivity and depe..."
6998,2,178,She rejects motherhood and matrimony in favour...


Validation set:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
author,text_id,sentence_position,Unnamed: 3_level_1
3,0,0,"At the eve of the Versailles conference, promo..."
3,0,1,"Moving with caution, Piłsudski, Head of State ..."
3,0,2,"From the outset, it appeared that Poland's ind..."
3,0,3,It was only the realisation of Soviet hostilit...
3,0,4,At nearly the same time that peace was conclud...
...,...,...,...
6998,1,70,(67-69)</quote>
6998,1,71,"By allowing the child the last word, Wordswort..."
6998,1,72,Wordsworth challenges the reader's perceptions...
6998,1,73,The poet asks if the child's cheerful percepti...


In [5]:
# Persist the sentence-level frames alongside the other preprocessed files.
train_sentences_path = join(preprocess_path, 'bawe_train_sentences.hdf5')
valid_sentences_path = join(preprocess_path, 'bawe_valid_sentences.hdf5')

train_df.to_hdf(train_sentences_path, key='bawe_train_sentences')
valid_df.to_hdf(valid_sentences_path, key='bawe_valid_sentences')

In [6]:
# NOTE(review): unexplained scratch arithmetic.  237192 is presumably a row
# (sentence) count and 20 a chunk/batch size -- confirm, then either name the
# constants or remove this cell.
237192 / 20

11859.6