In [1]:
import os
import pandas as pd
import pdpipe as pdp
import spacy
import sys

from IPython.display import display
from os.path import join
from pathlib import Path

# The notebook lives one level below the repository root; put the root on
# sys.path so that the project-local `notebooks` package can be imported.
project_root = Path('..')
sys.path.append(os.path.abspath(project_root))
from notebooks.utils import init_data_dir  # noqa

from notebooks import pipes  # noqa

# Project helper, called with the repository root.
# NOTE(review): presumably creates the data directory layout - confirm in
# notebooks/utils.py.
init_data_dir(project_root)

# Directory holding the preprocessed corpus files (kept as a plain string).
preprocess_path = join(project_root, 'data', 'preprocess')

# Parsed BAWE corpus produced by an earlier preprocessing step.
df = pd.read_hdf(join(preprocess_path, 'bawe_df.hdf5'))

# Small English spaCy model; handed to the sentence-splitting pipe later on.
nlp = spacy.load('en_core_web_sm')

# Information for the British Academic Written English Corpus

This notebook gathers information about the BAWE dataset, splits it into training and validation sets, and segments each text into sentences. The dataset must already be parsed and stored in `data/preprocess/bawe_df.hdf5`.

In [2]:
# Preview the parsed corpus that was read from bawe_df.hdf5; the rendered table
# shows the columns author, genre, text and sentence_count.
df

Unnamed: 0,author,genre,text,sentence_count
0,1,a,Racism is still a problem within our society t...,48
1,1,b,Official statistics are those produced by eith...,46
2,1,c,Since the fourteenth century the practice of m...,56
3,1,d,Much more reproductive choice is now available...,67
4,2,a,Victorian notions of women's madness were larg...,58
...,...,...,...,...
2572,6998,a,E. Warwick Slinn describes dramatic monologue ...,149
2573,6998,b,Hugh Blair voices an attack on the practices o...,75
2574,6998,c,"'The first thing to remember about Donne,' wri...",115
2575,6998,d,Susan Wiseman calculated that the latest possi...,116


In [3]:
# Set to True to force a fresh train/validation split even when cached split
# files already exist on disk.
resample_splits = False
# Fixed seed for the random split: without it every resample would produce a
# different partition and the cached files could not be regenerated.
split_seed = 42

train_df_path = join(preprocess_path, 'bawe_train_df.hdf5')
valid_df_path = join(preprocess_path, 'bawe_valid_df.hdf5')

train_df_exists = os.path.exists(train_df_path)
valid_df_exists = os.path.exists(valid_df_path)

# Resample when either cached split is missing, or when explicitly requested.
if not (train_df_exists and valid_df_exists) or resample_splits:
    print('Resampling...')

    # 80% of the rows go to the training set; the remaining rows (selected by
    # index label) form the validation set.
    train_df = (df.sample(frac=0.8, random_state=split_seed)
                  .sort_values(by=['author', 'genre']))
    valid_df = df.drop(train_df.index)

    # Give both splits a fresh 0..n-1 index before caching them.
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    train_df.to_hdf(train_df_path, key='bawe_train_df')
    valid_df.to_hdf(valid_df_path, key='bawe_valid_df')
else:
    # Reuse the cached splits so that downstream results stay comparable.
    train_df = pd.read_hdf(train_df_path)
    valid_df = pd.read_hdf(valid_df_path)

print('Train Set:')
display(train_df)
print('Validation Set:')
display(valid_df)

Train Set:


Unnamed: 0,author,genre,text,sentence_count
0,1,a,Racism is still a problem within our society t...,48
1,1,b,Official statistics are those produced by eith...,46
2,1,c,Since the fourteenth century the practice of m...,56
3,1,d,Much more reproductive choice is now available...,67
4,2,a,Victorian notions of women's madness were larg...,58
...,...,...,...,...
2057,6215,e,<heading>INTRODUCTION</heading>Both Macrochloa...,55
2058,6998,b,Hugh Blair voices an attack on the practices o...,75
2059,6998,c,"'The first thing to remember about Donne,' wri...",115
2060,6998,d,Susan Wiseman calculated that the latest possi...,116


Validation Set:


Unnamed: 0,author,genre,text,sentence_count
0,3,l,The advent of the Solidarity strikes and prote...,139
1,5,a,The once fashionable belief that class analysi...,156
2,6,a,<abstract><heading>Aim / Abstract</heading>The...,62
3,7,a,<abstract><heading>Summary</heading>The experi...,56
4,8,b,<abstract><heading>Summary</heading>The purpos...,102
...,...,...,...,...
510,6210,j,<heading>Introduction:</heading>Polymers based...,66
511,6211,h,"The world of Odysseus by M. I. Finley, (hencef...",46
512,6214,e,An increasing demand for organically produced ...,79
513,6215,f,<list></list><abstract><heading>Abstract</head...,311


In [4]:
# Two project-local pipes chained with pdpipe. SplitText receives the spaCy
# model and shows a progress bar.
# NOTE(review): judging by the rendered output, the result is one row per
# sentence indexed by (author, text_id, sentence_position) - confirm in
# notebooks/pipes.py.
sentence_stages = [
    pipes.IDText(),
    pipes.SplitText(nlp, show_loading=True),
]
pipeline = pdp.PdPipeline(sentence_stages)

# NOTE: train_df / valid_df are rebound from the split frames to the pipeline
# output here, so re-running this cell feeds the pipeline its own output;
# re-run the split cell first.
train_df = pipeline(train_df)
valid_df = pipeline(valid_df)

for caption, frame in (('Train set:', train_df),
                       ('Validation set:', valid_df)):
    print(caption)
    display(frame)

100%|██████████| 2062/2062 [09:14<00:00,  3.72it/s]
100%|██████████| 515/515 [02:12<00:00,  2.92it/s]

Train set:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
author,text_id,sentence_position,Unnamed: 3_level_1
1,0,0,Racism is still a problem within our society t...
1,0,1,Ethnic minorities are concentrated into certai...
1,0,2,"This can be seen in a survey, carried out in 1..."
1,0,3,"In this essay I will look at what racism is, a..."
1,0,4,In this section I will cover three areas that ...
...,...,...,...
6998,3,175,"Yerma never wants the life of a man, yet she i..."
6998,3,176,She resents her ability to exercise new freedo...
6998,3,177,"However, Hedda resents her inactivity and depe..."
6998,3,178,She rejects motherhood and matrimony in favour...


Validation set:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
author,text_id,sentence_position,Unnamed: 3_level_1
3,0,0,The advent of the Solidarity strikes and prote...
3,0,1,Whilst it provided a powerful response of work...
3,0,2,Where from came this powerful articulation of ...
3,0,3,"To answer this question, it may be fruitful to..."
3,0,4,Economic and political developments throughout...
...,...,...,...
6998,0,144,The task of women writers is particularly rema...
6998,0,145,"For example, Dante Gabriel Rossetti uses the p..."
6998,0,146,Men and women are united in their concern for ...
6998,0,147,The dramatic monologue requires the writer to ...


In [5]:
# Persist both sentence-level frames; each goes to its own HDF5 file in the
# preprocess directory, with the HDF key matching the file stem.
sentence_frames = {
    'bawe_train_sentences': train_df,
    'bawe_valid_sentences': valid_df,
}
for hdf_key, frame in sentence_frames.items():
    frame.to_hdf(join(preprocess_path, hdf_key + '.hdf5'), key=hdf_key)

In [10]:
# NOTE(review): ad-hoc arithmetic left over from exploration; neither 237192
# nor 20 is defined or explained anywhere in the visible notebook - TODO confirm
# what they represent and name them, or remove this cell.
237192 / 20

11859.6