In [1]:
import os
import pandas as pd
import pdpipe as pdp
import spacy
import sys

from IPython.display import display
from os.path import join
from pathlib import Path

# The notebook lives one level below the repository root; put the root on
# sys.path so the project-local `notebooks` package can be imported.
project_root = Path('..')
project_root_abs = os.path.abspath(project_root)
sys.path.append(project_root_abs)
from notebooks.utils import init_data_dir  # noqa

from notebooks import pipes  # noqa

# Project helper called with the repository root.
# NOTE(review): presumably prepares the data directory layout — confirm in
# notebooks/utils.
init_data_dir(project_root)

# Directory holding the preprocessed corpus files, kept as a plain string.
preprocess_path = str(project_root / Path('data/preprocess'))

# spaCy English model handed to the sentence-splitting pipeline stage.
nlp = spacy.load('en_core_web_sm')

# Load the parsed BAWE corpus from its HDF5 file.
bawe_df_path = join(preprocess_path, Path('bawe_df.hdf5'))
df = pd.read_hdf(bawe_df_path)

# Information for the British Academic Written English Corpus

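This notebook gathers information about the BAWE (British Academic Written English) dataset and prepares it for later use: it splits the texts into training and validation sets, breaks each text into sentences, and saves the results under `data/preprocess/`. The dataset should already be parsed and stored in `data/preprocess/bawe_df.hdf5`.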

In [2]:
# Show the loaded BAWE frame (pandas abbreviates long frames when rendering).
df

Unnamed: 0,author,genre,text
0,1,a,Racism is still a problem within our society t...
1,1,b,Official statistics are those produced by eith...
2,1,c,Since the fourteenth century the practice of m...
3,1,d,Much more reproductive choice is now available...
4,2,a,Victorian notions of women's madness were larg...
...,...,...,...
2572,6998,a,E. Warwick Slinn describes dramatic monologue ...
2573,6998,b,Hugh Blair voices an attack on the practices o...
2574,6998,c,"'The first thing to remember about Donne,' wri..."
2575,6998,d,Susan Wiseman calculated that the latest possi...


In [3]:
# Train/validation split of the BAWE frame.
#
# When `resample_splits` is True the split is redrawn and the files on disk
# are overwritten even if they already exist; set it to False to reuse the
# previously saved split.
resample_splits = True

# Fixed seed so that a redraw always yields the same 80/20 partition and the
# files on disk do not silently change between runs.
split_seed = 42

train_df_path = join(preprocess_path, 'bawe_train_df.hdf5')
valid_df_path = join(preprocess_path, 'bawe_valid_df.hdf5')

train_df_exists = os.path.exists(train_df_path)
valid_df_exists = os.path.exists(valid_df_path)

if not (train_df_exists and valid_df_exists) or resample_splits:
    print('Resampling...')

    # Draw 80% of the rows for training; every row not drawn is validation.
    # NOTE: the draw is per row (per text), so the same author can end up in
    # both sets.
    train_df = (df.sample(frac=0.8, random_state=split_seed)
                .sort_values(by=['author', 'genre']))
    valid_df = df.drop(train_df.index)

    # `drop` works on index labels, so a non-unique index would remove extra
    # rows; make sure the two sets really partition the corpus.
    assert len(train_df) + len(valid_df) == len(df), \
        'train/validation split does not partition df'

    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # mode='w' recreates the file. The default append mode rewrites the key
    # but HDF5 does not reclaim the old space, so the file would grow on
    # every resample.
    train_df.to_hdf(train_df_path, key='bawe_train_df', mode='w')
    valid_df.to_hdf(valid_df_path, key='bawe_valid_df', mode='w')
else:
    # Reuse the split saved by an earlier run.
    train_df = pd.read_hdf(train_df_path)
    valid_df = pd.read_hdf(valid_df_path)

print('Train Set:')
display(train_df)
print('Validation Set:')
display(valid_df)

Resampling...
Train Set:


Unnamed: 0,author,genre,text
0,1,a,Racism is still a problem within our society t...
1,1,c,Since the fourteenth century the practice of m...
2,2,a,Victorian notions of women's madness were larg...
3,2,b,The development of feminist thought and action...
4,3,b,"On Friday afternoon, the 26 th of November, I ..."
...,...,...,...
2057,6215,e,<heading>INTRODUCTION</heading>Both Macrochloa...
2058,6215,f,<list></list><abstract><heading>Abstract</head...
2059,6998,a,E. Warwick Slinn describes dramatic monologue ...
2060,6998,c,"'The first thing to remember about Donne,' wri..."


Validation Set:


Unnamed: 0,author,genre,text
0,1,b,Official statistics are those produced by eith...
1,1,d,Much more reproductive choice is now available...
2,3,a,"The publication of The Bell Curve in 1994, the..."
3,3,h,The emergence of the sociological school of et...
4,5,c,For too long the history of the Cold War has b...
...,...,...,...
510,6211,i,This essay is essentially a reaction to an art...
511,6211,j,The Carolingian Renaissance is simultaneously ...
512,6214,c,<heading>Forest Growth Rates</heading><table/>...
513,6998,b,Hugh Blair voices an attack on the practices o...


In [4]:
# Stages come from the project-local `pipes` module.
# NOTE(review): judging by the frames displayed below, IDText presumably adds
# a per-author text_id and SplitText splits each text into sentences with the
# spaCy model — confirm in notebooks/pipes.
sentence_stages = [
    pipes.IDText(),
    pipes.SplitText(nlp, show_loading=True),
]
pipeline = pdp.PdPipeline(sentence_stages)

# Both names are rebound to the pipeline output, so re-running this cell
# without re-running the split cell would feed the processed frames back in.
train_df, valid_df = pipeline(train_df), pipeline(valid_df)

# flush=True keeps each label ahead of its table in the cell output.
for label, frame in (('Train set:', train_df), ('Validation set:', valid_df)):
    print(label, flush=True)
    display(frame)

100%|██████████| 2062/2062 [08:22<00:00,  2.97it/s]
  0%|          | 0/515 [00:00<?, ?it/s][A
  0%|          | 1/515 [00:00<01:09,  7.43it/s][A
  0%|          | 2/515 [00:00<01:32,  5.55it/s][A
  1%|          | 3/515 [00:00<01:26,  5.90it/s][A
  1%|          | 4/515 [00:00<01:44,  4.87it/s][A
  1%|          | 5/515 [00:01<03:02,  2.79it/s][A
  1%|          | 6/515 [00:01<02:22,  3.57it/s][A
  1%|▏         | 7/515 [00:01<01:57,  4.34it/s][A
  2%|▏         | 8/515 [00:01<01:51,  4.56it/s][A
  2%|▏         | 9/515 [00:02<01:42,  4.94it/s][A
  2%|▏         | 10/515 [00:02<01:57,  4.29it/s][A
  2%|▏         | 11/515 [00:02<02:43,  3.09it/s][A
  2%|▏         | 12/515 [00:02<02:14,  3.75it/s][A
  3%|▎         | 13/515 [00:03<01:57,  4.29it/s][A
  3%|▎         | 14/515 [00:03<01:52,  4.45it/s][A
  3%|▎         | 15/515 [00:03<01:53,  4.39it/s][A
  3%|▎         | 16/515 [00:03<01:50,  4.53it/s][A
  3%|▎         | 17/515 [00:04<01:55,  4.33it/s][A
  3%|▎         | 18/515 [00:04

Train set:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
author,text_id,sentence_position,Unnamed: 3_level_1
1,0,0,Racism is still a problem within our society t...
1,0,1,Ethnic minorities are concentrated into certai...
1,0,2,"This can be seen in a survey, carried out in 1..."
1,0,3,"In this essay I will look at what racism is, a..."
1,0,4,In this section I will cover three areas that ...
...,...,...,...
6998,2,175,"Yerma never wants the life of a man, yet she i..."
6998,2,176,She resents her ability to exercise new freedo...
6998,2,177,"However, Hedda resents her inactivity and depe..."
6998,2,178,She rejects motherhood and matrimony in favour...


Validation set:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
author,text_id,sentence_position,Unnamed: 3_level_1
1,0,0,Official statistics are those produced by eith...
1,0,1,There has been a huge increase in the number o...
1,0,2,Although they are widely used there are many l...
1,0,3,In this essay I plan to talk about how the int...
1,0,4,I will also explore the uses that statistics d...
...,...,...,...
6998,1,111,"As a persuasive text, The Famous Tragedie's cl..."
6998,1,112,The writer chose to express Parliament's destr...
6998,1,113,"However, this potential is lost amongst sensat..."
6998,1,114,While it may have been necessary to produce sh...


In [5]:
# Persist the sentence-level frames next to the other preprocessed files.
# mode='w' recreates each file. The default append mode rewrites the key but
# HDF5 does not reclaim the old space, so the files would grow every time
# this cell is re-run.
train_df.to_hdf(join(preprocess_path, 'bawe_train_sentences.hdf5'),
                key='bawe_train_sentences', mode='w')
valid_df.to_hdf(join(preprocess_path, 'bawe_valid_sentences.hdf5'),
                key='bawe_valid_sentences', mode='w')

In [6]:
# NOTE(review): ad-hoc arithmetic; what 237192 and 20 stand for is not stated
# anywhere in this notebook — document the intent or remove this cell.
237192 / 20

11859.6