In [1]:
import nltk
import pylangacq
import pandas as pd
import numpy as np

from nltk.corpus.reader.bnc import BNCCorpusReader

## BNC Full: 100m, 5.7m, 2.4m, 1m

In [2]:
# download BNC Full
bnc_full_reader = BNCCorpusReader(root='./corpora/BNC/2554.zip/download/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
bnc_full = bnc_full_reader.words()

len(bnc_full)

111978070

In [3]:
# sample for 100m, 5.7m, 2.4m, 1m
# bnc_100m = np.random.choice(bnc_full, 100000000)
# bnc_5_7m = np.random.choice(bnc_full, 5700000)
# bnc_2_4m = np.random.choice(bnc_full, 2400000)
bnc_1m = np.random.choice(bnc_full, 1000000)

MemoryError: Unable to allocate 54.6 GiB for an array with shape (111978070,) and data type <U131

In [4]:
# alternative method to sample
bnc_df = pd.Series(bnc_full)


In [15]:
random_seed = 7
bnc_100m = bnc_df.sample(n=100000000, random_state=random_seed)
bnc_5_7m = bnc_df.sample(n=5700000, random_state=random_seed)
bnc_2_4m = bnc_df.sample(n=2400000, random_state=random_seed)
bnc_1m = bnc_df.sample(n=1000000, random_state=random_seed)

bnc_100m

53082915           of
35715265         week
46710596    customers
3965870        couple
96558773         have
              ...    
62046431     sections
48730368         turn
14668221       Please
9418446          1991
25462325          and
Length: 2400000, dtype: object

In [17]:
# create a word count from dictionary into dataframe
bnc_100m_word_count_dict = nltk.FreqDist(word.lower() for word in bnc_100m)
bnc_100m_word_count_df = pd.DataFrame.from_dict(bnc_100m_word_count_dict, orient='index').reset_index()
bnc_100m_word_count_df = bnc_100m_word_count_df.rename(columns={'index':'norm', 0:'BNC_100m_Count'})

bnc_5_7m_word_count_dict = nltk.FreqDist(word.lower() for word in bnc_5_7m)
bnc_5_7m_word_count_df = pd.DataFrame.from_dict(bnc_5_7m_word_count_dict, orient='index').reset_index()
bnc_5_7m_word_count_df = bnc_5_7m_word_count_df.rename(columns={'index':'norm', 0:'BNC_5_7m_Count'})

bnc_2_4m_word_count_dict = nltk.FreqDist(word.lower() for word in bnc_2_4m)
bnc_2_4m_word_count_df = pd.DataFrame.from_dict(bnc_2_4m_word_count_dict, orient='index').reset_index()
bnc_2_4m_word_count_df = bnc_2_4m_word_count_df.rename(columns={'index':'norm', 0:'BNC_2_4m_Count'})

bnc_1m_word_count_dict = nltk.FreqDist(word.lower() for word in bnc_1m)
bnc_1m_word_count_df = pd.DataFrame.from_dict(bnc_1m_word_count_dict, orient='index').reset_index()
bnc_1m_word_count_df = bnc_1m_word_count_df.rename(columns={'index':'norm', 0:'BNC_1m_Count'})

bnc_100m_word_count_df

Unnamed: 0,norm,BNC_100m_Count
0,of,2717776
1,week,28179
2,customers,5982
3,couple,10612
4,have,411432
...,...,...
605424,al-noman,1
605425,pre-historically,1
605426,sub-programmes,1
605427,mash-style,1


In [18]:
bnc_100m_word_count_df.to_csv('./sampled_count/BNC_100m_Count.csv', index=False)
bnc_5_7m_word_count_df.to_csv('./sampled_count/BNC_5_7m_Count.csv', index=False)
bnc_2_4m_word_count_df.to_csv('./sampled_count/BNC_2_4m_Count.csv', index=False)
bnc_1m_word_count_df.to_csv('./sampled_count/BNC_1m_Count.csv', index=False)

## CHILDES: 5.7m, 2.4m, 1m

In [3]:
# load sub-corporas
brown = pylangacq.read_chat("./corpora/CHILDES/Brown.zip")
belf = pylangacq.read_chat("./corpora/CHILDES/Belfast.zip")
crutt=pylangacq.read_chat("./corpora/CHILDES/Cruttenden.zip")
fletcher=pylangacq.read_chat("./corpora/CHILDES/Fletcher.zip")
forr=pylangacq.read_chat("./corpora/CHILDES/Forrester.zip")
gath=pylangacq.read_chat("./corpora/CHILDES/Gathburn.zip")
howe=pylangacq.read_chat("./corpora/CHILDES/Howe.zip")
kelly=pylangacq.read_chat("./corpora/CHILDES/KellyQuigley.zip")
korman=pylangacq.read_chat("./corpora/CHILDES/Korman.zip")
lara=pylangacq.read_chat("./corpora/CHILDES/Lara.zip")
manc=pylangacq.read_chat("./corpora/CHILDES/Manchester.zip")
nuff=pylangacq.read_chat("./corpora/CHILDES/Nuffield.zip")
quigley=pylangacq.read_chat("./corpora/CHILDES/QuigleyMcNally.zip")
sekali=pylangacq.read_chat("./corpora/CHILDES/Sekali.zip")
smith=pylangacq.read_chat("./corpora/CHILDES/Smith.zip")
tommer=pylangacq.read_chat("./corpora/CHILDES/Tommerdahl.zip")


In [4]:
# link up sub-corporas
corpora = [brown, belf, crutt, fletcher, forr, gath, howe, kelly, korman, lara, manc, nuff, quigley, sekali, smith, tommer]
reader = pylangacq.Reader()
for item in corpora:
    reader.append(item)

In [5]:
# sample for 5.7m, 2.4m, 1m
words = reader.words()
childes_5_7m = np.random.choice(words, 5700000)
childes_2_4m = np.random.choice(words, 2400000)
childes_1m = np.random.choice(words, 1000000)

len(words)

5681157

In [6]:
# create a word count from dictionary into dataframe
# addition: convert all words into low case
childes_5_7m_freq = nltk.FreqDist(word.lower() for word in childes_5_7m)
childes_5_7m_df = pd.DataFrame.from_dict(childes_5_7m_freq, orient='index').reset_index().rename(columns={0: 'CHILDES_Count'})

childes_2_4m_freq = nltk.FreqDist(word.lower() for word in childes_2_4m)
childes_2_4m_df = pd.DataFrame.from_dict(childes_2_4m_freq, orient='index').reset_index().rename(columns={0: 'CHILDES_Count'})

childes_1m_freq = nltk.FreqDist(word.lower() for word in childes_1m)
childes_1m_df = pd.DataFrame.from_dict(childes_1m_freq, orient='index').reset_index().rename(columns={0: 'CHILDES_Count'})

childes_1m_df

Unnamed: 0,index,CHILDES_Count
0,you're,2276
1,in,8836
2,is,9141
3,again,1287
4,?,52714
...,...,...
13341,medical,1
13342,ragdoll,1
13343,shop's,1
13344,lisbane,1


In [8]:
childes_5_7m_df.to_csv('./sampled_count/CHILDES_5_7m_Count.csv', index=False)
childes_2_4m_df.to_csv('./sampled_count/CHILDES_2_4m_Count.csv', index=False)
childes_1m_df.to_csv('./sampled_count/CHILDES_1m_Count.csv', index=False)

## CABNC: 2.4m, 1m

In [20]:
# download CABNC Full
CABNC_full = pylangacq.read_chat('./corpora/CABNC/CABNC.zip')
words = CABNC_full.words()

len(words)

2411853

In [21]:
# sample for 2.4m, 1m
cabnc_2_4m = np.random.choice(words, 2400000)
cabnc_1m = np.random.choice(words, 1000000)

len(cabnc_2_4m)

2400000

In [22]:
# create a word count from dictionary into dataframe
cabnc_1m_freq = nltk.FreqDist(word.lower() for word in cabnc_1m)
cabnc_1m_df = pd.DataFrame.from_dict(cabnc_1m_freq, orient='index').reset_index().rename(columns={0: 'CABNC_Count'})

cabnc_2_4m_freq = nltk.FreqDist(word.lower() for word in cabnc_2_4m)
cabnc_2_4m_df = pd.DataFrame.from_dict(cabnc_2_4m_freq, orient='index').reset_index().rename(columns={0: 'CABNC_Count'})

cabnc_2_4m_df

Unnamed: 0,index,CABNC_Count
0,rang,136
1,.,256556
2,single,143
3,'ll,9669
4,be,10164
...,...,...
20997,motto,1
20998,lan,1
20999,shuttlings,1
21000,fouling,1


In [24]:
cabnc_2_4m_df.to_csv('./sampled_count/CABNC_2_4m_Count.csv', index=False)
cabnc_1m_df.to_csv('./sampled_count/CABNC_1m_Count.csv', index=False)

## KBNC: 1m

In [16]:
# extract all child audience from BNC, known as KBNC (according to ./BNC_WORLD_INDEX.XLS)
aud_child = ['A/A7/A7A.xml','A/AB/ABX.xml','A/AC/AC5.xml','A/AC/AC4.xml','A/AC/ACB.xml','A/AC/ACV.xml','A/AE/AEB.xml','A/AL/ALS.xml','A/AM/AMB.xml','A/AP/APW.xml','A/AT/AT4.xml','B/B0/B0B.xml','B/B1/B1S.xml','B/B2/B2V.xml','B/B2/B2N.xml','B/BM/BMU.xml','B/BP/BPD.xml','C/C8/C85.xml','C/C8/C8N.xml','C/CA/CA3.xml','C/CA/CAB.xml','C/CA/CAX.xml','C/CC/CCA.xml','C/CE/CE0.xml','C/CE/CEU.xml','C/CF/CFJ.xml','C/CH/CH4.xml','C/CH/CH9.xml','C/CH/CHR.xml','E/EF/EFJ.xml','F/FE/FEH.xml','F/FN/FNS.xml','F/FP/FPT.xml','F/FP/FPV.xml','F/FS/FSL.xml','F/FU/FUB.xml','G/G2/G22.xml','G/G2/G23.xml','G/G2/G24.xml','G/G2/G25.xml','H/H9/H93.xml','H/H9/H9E.xml']
kbnc_reader = BNCCorpusReader(root='./corpora/BNC/2554.zip/download/Texts/', fileids=aud_child)
kbnc = kbnc_reader.words()

len(kbnc)

1098264

In [17]:
# sample for 1m
kbnc_1m = np.random.choice(kbnc, size=1000000, replace=False)

In [18]:
# create a word count from dictionary into dataframe
kbnc_1m_word_count_dict = nltk.FreqDist(word.lower() for word in kbnc_1m)
kbnc_1m_word_count_df = pd.DataFrame.from_dict(kbnc_1m_word_count_dict, orient='index').reset_index()
kbnc_1m_word_count_df = kbnc_1m_word_count_df.rename(columns={'index':'norm', 0:'KBNC_1m_Count'})

kbnc_1m_word_count_df

Unnamed: 0,norm,KBNC_1m_Count
0,lonely,67
1,as,4560
2,!,5683
3,night,459
4,a,17707
...,...,...
25694,saddle-brown,1
25695,fibre,1
25696,meshes,1
25697,jeweller,1


In [19]:
kbnc_1m_word_count_df.to_csv('./sampled_count/KBNC_1m_Count.csv', index=False)