In [1]:
# loading required libraries
import nltk, pickle, pprint, csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# pretty printing for readability
cp = pprint.PrettyPrinter(compact=True, sort_dicts=True)

## Learner corpus

I will utilize the COREFL, or "CORpus of English as a Foreign Language". It contains written and spoken data, but this analysis will only include spoken. The participants are adult learners with two different L1 backgrounds (Spanish and German). 

### Data import (Learner)

In [2]:
Lraw= pd.read_csv('../data/COREFL.csv', sep='\t') # importing COREFL spoken learner data

In [3]:
Lraw.head() # displaying first rows

Unnamed: 0,Subcorpus,Filename,Year data collection,Placement test score (raw),Placement test score (%),Proficiency,Sex,Age,School/University/Institution,Major,...,Proficiency (self-assessment) in additional language writing,Medium,Task number,Task title,Writing/audio details,Minutes taken to complete the task,Where the task was done,Resources used,Text,Original text
0,Learners,DE_SP_B1_26_13_13_TM,2016,30 / 60,50.0,B1 (lower intermediate),Male,26,Bremen,BiPEB: English Speaking Cultures,...,A1 (lower beginner),Spoken,13,13. Frog,spoken_offline_classroom,,Inside classroom,,One day Tommy found a frog in a forest and bro...,
1,Learners,DE_SP_B1_19_11_13_RN,2016,37 / 60,61.7,B1 (lower intermediate),Female,19,Bremen,"BA English-Speaking Cultures: English, Slavoni...",...,A1 (lower beginner),Spoken,13,13. Frog,spoken_offline_classroom,,Inside classroom,,One day a little boy called John uh with his d...,
2,Learners,DE_SP_B1_21_12_13_SE,2016,37 / 60,61.7,B1 (lower intermediate),Female,21,Bremen,"Teaching Gym, ESC, Romance Studies: French, En...",...,B2 (upper intermediate),Spoken,13,13. Frog,spoken_offline_classroom,,Inside classroom,,One day a boy was sitting in his room / uh he ...,
3,Learners,DE_SP_B1_22_15_13_LF,2016,37 / 60,61.7,B1 (lower intermediate),Female,22,Bremen,BA English Speaking Cultures: English,...,A2 (upper beginner),Spoken,13,13. Frog,spoken_offline_classroom,,Inside classroom,,Uh one day a little boy and his dog are watchi...,
4,Learners,DE_SP_B1_33_10_14_JR,2018,38 / 60,63.3,B1 (lower intermediate),Female,33,Universität Bremen,"English-Speaking Cultures, Philosophie",...,B1 (lower intermediate),Spoken,14,14. Chaplin,spoken_offline_lab,,,,Ok this story is about toch uh Charles Chaplin...,


In [4]:
Lraw.describe() # summary statistics

Unnamed: 0,Year data collection,Placement test score (%),Age,Age of exposure to English,Years studying English,Stay abroad (months),Task number,Minutes taken to complete the task,Original text
count,351.0,351.0,351.0,351.0,350.0,349.0,351.0,176.0,0.0
mean,2017.774929,76.283761,21.997151,8.039886,12.75,4.545272,12.649573,31.840909,
std,1.437477,16.461987,5.707388,3.913143,3.756704,11.132887,3.105708,78.686631,
min,2016.0,15.0,17.0,0.0,1.0,0.0,2.0,2.0,
25%,2016.0,66.7,19.0,6.0,11.0,0.0,13.0,10.0,
50%,2018.0,80.0,21.0,8.0,12.0,0.0,14.0,15.0,
75%,2019.0,88.3,23.0,10.0,15.0,6.0,14.0,25.0,
max,2021.0,100.0,61.0,45.0,49.0,114.0,14.0,600.0,


It appears that the mean age of learners is 21. The mean years of studying English is 12.75, which might be quite high for analysis of morpheme acquisition that is comparable to a child's in their L1. This analysis may have to focus specifically on learners in their first few years of study, or compile some additional learner data from another spoken and transcribed corpus.

In [5]:
Lraw.columns # displaying a list of all the columns

Index(['Subcorpus', 'Filename', 'Year data collection',
       'Placement test score (raw)', 'Placement test score (%)', 'Proficiency',
       'Sex', 'Age', 'School/University/Institution', 'Major',
       'Year at university/school', 'L1', 'Father's native language',
       'Mother's native language', 'Languages spoken at home',
       'Age of exposure to English', 'Years studying English',
       'Stay abroad in English speaking country (>= 1 month)',
       'Stay abroad (where)', 'Stay abroad (when)', 'Stay abroad (months)',
       'Language certificates (type and level)',
       'Proficiency (self-assessment) speaking',
       'Proficiency (self-assessment) listening',
       'Proficiency (self-assessment) reading',
       'Proficiency (self-assessment) writing',
       'Proficiency (self-assessment)', 'Additional foreign language(s)',
       'Proficiency (self-assessment) in additional language speaking',
       'Proficiency (self-assessment) in additional language listening',
   

There's a lot of metadata and information that isn't necessary for the purposes of this project bloating the dataframe, so it will be tidied a bit.

### Cleaning (Learner)

In [6]:
# selecting columns that are needed
Lcorpus= Lraw[['Filename', 'Proficiency', 'Age', 'L1',
              'Age of exposure to English', 'Years studying English', 'Text']]

In [7]:
Lcorpus.rename(columns={'Age of exposure to English':'Age_Exposure', 'Years studying English':'Years_Study',
                      'Task number':'Task'}, inplace=True) # renaming columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Lcorpus.rename(columns={'Age of exposure to English':'Age_Exposure', 'Years studying English':'Years_Study',


In [8]:
Lcorpus.head()

Unnamed: 0,Filename,Proficiency,Age,L1,Age_Exposure,Years_Study,Text
0,DE_SP_B1_26_13_13_TM,B1 (lower intermediate),26,German,8,13.0,One day Tommy found a frog in a forest and bro...
1,DE_SP_B1_19_11_13_RN,B1 (lower intermediate),19,German,10,11.0,One day a little boy called John uh with his d...
2,DE_SP_B1_21_12_13_SE,B1 (lower intermediate),21,German,9,12.0,One day a boy was sitting in his room / uh he ...
3,DE_SP_B1_22_15_13_LF,B1 (lower intermediate),22,German,7,15.0,Uh one day a little boy and his dog are watchi...
4,DE_SP_B1_33_10_14_JR,B1 (lower intermediate),33,German,10,10.0,Ok this story is about toch uh Charles Chaplin...


In [9]:
Lcorpus.info() # displaying info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Filename      351 non-null    object 
 1   Proficiency   351 non-null    object 
 2   Age           351 non-null    int64  
 3   L1            351 non-null    object 
 4   Age_Exposure  351 non-null    int64  
 5   Years_Study   350 non-null    float64
 6   Text          351 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 19.3+ KB


There seems to be a column with a null value for `Years_Study`. Let's investigate. 

In [10]:
Lcorpus[Lcorpus.Years_Study.isnull()]

Unnamed: 0,Filename,Proficiency,Age,L1,Age_Exposure,Years_Study,Text
222,ES_SP_B1_27_Unknown_13_ESH,B1 (lower intermediate),27,Spanish,0,,uh in the first picture uh / a kid and a dog u...


There isn't much to be done, since we can't make up a data point for the years this person has studied English. Unfortunately this entry will have to be dropped from the data frame. 

In [11]:
Lcorpus= Lcorpus.drop(222, axis=0)

In [12]:
Lcorpus.info() # no null values, looking good

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 0 to 350
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Filename      350 non-null    object 
 1   Proficiency   350 non-null    object 
 2   Age           350 non-null    int64  
 3   L1            350 non-null    object 
 4   Age_Exposure  350 non-null    int64  
 5   Years_Study   350 non-null    float64
 6   Text          350 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 21.9+ KB


In [13]:
# cleaning up Proficiency column to just keep reference level
Lcorpus['Proficiency'] = Lcorpus['Proficiency'].str.replace(r"\(.*\)","")

  Lcorpus['Proficiency'] = Lcorpus['Proficiency'].str.replace(r"\(.*\)","")


In [14]:
Lcorpus.head()

Unnamed: 0,Filename,Proficiency,Age,L1,Age_Exposure,Years_Study,Text
0,DE_SP_B1_26_13_13_TM,B1,26,German,8,13.0,One day Tommy found a frog in a forest and bro...
1,DE_SP_B1_19_11_13_RN,B1,19,German,10,11.0,One day a little boy called John uh with his d...
2,DE_SP_B1_21_12_13_SE,B1,21,German,9,12.0,One day a boy was sitting in his room / uh he ...
3,DE_SP_B1_22_15_13_LF,B1,22,German,7,15.0,Uh one day a little boy and his dog are watchi...
4,DE_SP_B1_33_10_14_JR,B1,33,German,10,10.0,Ok this story is about toch uh Charles Chaplin...


## Native corpus

I will utilize the "Frog Story Corpora", which consists of recorded narratives of 12 different native English speakers telling a wordless "frog story" from a picture book. Each speaker was recorded at several different age levels (3, 4, 5, 9, and 20). 

### Data import (Native)

The CHILDES corpus has its own annotation and analysis format, the `CHAT` format, and a dedicated Python library for language acquisiton research utilizing their database. First, installing `PyLangAcq`. 

In [15]:
pip install --upgrade pylangacq 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import pylangacq

Importing a sample of the data using `PyLangAcq`.

In [17]:
path = '../data/CHILDES/English-Slobin/'
age_03 = pylangacq.read_chat(path, '03') # creating a reader object from a CHAT file

Accessing the information stored in the `CHAT` files.

In [18]:
print(type(age_03))
print('Number of files:', age_03.n_files()) # some basic information about this object

<class 'pylangacq.chat.Reader'>
Number of files: 12


In [19]:
cp.pprint(age_03.headers()[0]) # accessing the metadata stored for the first CHAT file

{'G': '15',
 'Languages': ['eng'],
 'PID': '11312/c-00020713-1',
 'Participants': {'CHI': {'age': '3;01.',
                          'corpus': 'English-Slobin',
                          'custom': '',
                          'education': '',
                          'group': '',
                          'language': 'eng',
                          'name': 'Target_Child',
                          'role': 'Target_Child',
                          'ses': '',
                          'sex': ''}},
 'Types': 'cross, narrative, TD',
 'UTF8': ''}


In [20]:
age_03.headers()[0]['Participants']['CHI']['age'] # accesing nested dictionary information using keys

'3;01.'

In [30]:
age_03.tokens()[:20] # previewing the annotation stored in each CHAT file using the .tokens() method

[Token(word='.', pos=None, mor=None, gra=None),
 Token(word='when', pos='conj', mor='when', gra=Gra(dep=1, head=4, rel='LINK')),
 Token(word="he's", pos='pro:sub', mor='he', gra=Gra(dep=2, head=4, rel='SUBJ')),
 Token(word='CLITIC', pos='aux', mor='be&3S', gra=Gra(dep=3, head=4, rel='AUX')),
 Token(word='sleeping', pos='part', mor='sleep-PRESP', gra=Gra(dep=4, head=0, rel='ROOT')),
 Token(word=',', pos='cm', mor='cm', gra=Gra(dep=5, head=4, rel='LP')),
 Token(word='.', pos='.', mor='', gra=Gra(dep=6, head=4, rel='PUNCT')),
 Token(word='and', pos='coord', mor='and', gra=Gra(dep=1, head=4, rel='LINK')),
 Token(word='his', pos='det:poss', mor='his', gra=Gra(dep=2, head=3, rel='DET')),
 Token(word='frog', pos='n', mor='frog', gra=Gra(dep=3, head=4, rel='SUBJ')),
 Token(word='getting', pos='n:gerund', mor='get-PRESP', gra=Gra(dep=4, head=0, rel='ROOT')),
 Token(word='out', pos='prep', mor='out', gra=Gra(dep=5, head=4, rel='JCT')),
 Token(word='!', pos='!', mor='', gra=Gra(dep=6, head=4, rel

Now, compiling all of the files in the corpora into a dataframe. 

In [22]:
# initiating empty lists
file_path_list = []
participant_list = []
age_list = []
tokens_list = []

# read entire corpus into a Reader object: 
corpus = pylangacq.Reader.from_dir('../data/CHILDES/English-Slobin/')
for f in corpus:
    file_path = f.file_paths()[0].split('/')[4]
    participant = f.headers()[0]['PID']
    age = f.headers()[0]['Participants']['CHI']['age']
    tokens = f.tokens()
    # appending values to lists
    file_path_list.append(file_path)
    participant_list.append(participant)
    age_list.append(age)
    tokens_list.append(tokens)

In [23]:
# building the dataframe
Ncorpus = pd.DataFrame({'Filename':file_path_list,
                       'Participant':participant_list,
                       'Age':age_list,
                       'Tokens':tokens_list
                       })

In [24]:
Ncorpus.head()

Unnamed: 0,Filename,Participant,Age,Tokens
0,03\03a.cha,11312/c-00020713-1,3;01.,"[Token(word='.', pos=None, mor=None, gra=None)..."
1,03\03b.cha,11312/c-00020714-1,3;04.,"[Token(word=""they're"", pos='pro:sub', mor='the..."
2,03\03c.cha,11312/c-00020715-1,3;04.,"[Token(word=""there's"", pos='pro:exist', mor='t..."
3,03\03d.cha,11312/c-00020716-1,3;05.,"[Token(word='a', pos='det:art', mor='a', gra=G..."
4,03\03e.cha,11312/c-00020717-1,3;08.,"[Token(word='.', pos=None, mor=None, gra=None)..."


In [25]:
Ncorpus.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Filename     59 non-null     object
 1   Participant  59 non-null     object
 2   Age          59 non-null     object
 3   Tokens       59 non-null     object
dtypes: object(4)
memory usage: 2.0+ KB


In [26]:
# tidying up Age column 
Ncorpus['Age'] = Ncorpus['Age'].str.rstrip('\.$')

In [27]:
Ncorpus.head()

Unnamed: 0,Filename,Participant,Age,Tokens
0,03\03a.cha,11312/c-00020713-1,3;01,"[Token(word='.', pos=None, mor=None, gra=None)..."
1,03\03b.cha,11312/c-00020714-1,3;04,"[Token(word=""they're"", pos='pro:sub', mor='the..."
2,03\03c.cha,11312/c-00020715-1,3;04,"[Token(word=""there's"", pos='pro:exist', mor='t..."
3,03\03d.cha,11312/c-00020716-1,3;05,"[Token(word='a', pos='det:art', mor='a', gra=G..."
4,03\03e.cha,11312/c-00020717-1,3;08,"[Token(word='.', pos=None, mor=None, gra=None)..."


## Saving data

In [28]:
pd.to_pickle(Lcorpus, "../data/Lcorpus.pkl")
pd.to_pickle(Ncorpus, "../data/Ncorpus.pkl")

In [29]:
Lcorpus.to_csv(r'../data_samples/Lcorpus_Prog1.csv', header=True)
Ncorpus.to_csv(r'../data_samples/Ncorpus_Prog1.csv', header=True)