# Data preprocessing

Man Ho Wong | March 21, 2022


In [1]:
# Python libraries and packages required
import numpy as np
import pandas as pd
import pickle
import pylangacq
from tqdm import tqdm            # For showing progress bar
import matplotlib.pyplot as plt
plt.style.use('ggplot')          # use ggplot style for figures

# Pretty printing settings
import pprint
cp = pprint.PrettyPrinter(compact=True)

# Unpickle data
data = pickle.load(open('../data/childes/corpus_info.pkl', 'rb'))
data_idx = data[1]

`data_idx` is a `DataFrame` created by [data_curation.ipynb](Data_Science/Child-Vocab-Development/codes/data_curation.ipynb). It contains basic information about the files in the curated dataset. 

In [2]:
data_idx

Unnamed: 0,file_path,corpus,year,participants,name,age_d,age_m,sex,group,ses,mot_edu
0,../data/childes/Bates/Free20/amy.cha,Bates,,"{CHI, MOT}",Target_Child,600,20.0,female,TD,MC,unspecified
1,../data/childes/Bates/Free20/betty.cha,Bates,,"{CHI, MOT}",Betty,600,20.0,female,TD,MC,unspecified
2,../data/childes/Bates/Free20/chuck.cha,Bates,,"{CHI, MOT}",Chuck,600,20.0,male,TD,MC,unspecified
3,../data/childes/Bates/Free20/doug.cha,Bates,,"{CHI, MOT}",Doug,600,20.0,male,TD,MC,unspecified
4,../data/childes/Bates/Free20/ed.cha,Bates,,"{CHI, MOT}",Ed,600,20.0,male,TD,MC,unspecified
...,...,...,...,...,...,...,...,...,...,...,...
2606,../data/childes/VanHouten/Twos/teaching/parkt.cha,VanHouten,,"{CHI, MOT}",Matthew,840,28.0,male,MOT_Older,unspecified,unspecified
2607,../data/childes/VanHouten/Twos/teaching/pricet...,VanHouten,,"{CHI, MOT}",Peter,840,28.0,male,MOT_adolescent,unspecified,unspecified
2608,../data/childes/VanHouten/Twos/teaching/raidt.cha,VanHouten,,"{CHI, MOT}",Tommy,840,28.0,male,MOT_older,unspecified,unspecified
2609,../data/childes/VanHouten/Twos/teaching/riott.cha,VanHouten,,"{CHI, MOT}",Robert,840,28.0,male,MOT_Adolescent,unspecified,unspecified


# Data cleaning and integration

As mentioned in data_curation.ipynb, not all corpora use the same labels for some variables. For example, some corpora use school grade levels to define mother's education, and some use levels of education (e.g. 'college'):

In [3]:
mot_edu_labels = set(data_idx.mot_edu)
cp.pprint(mot_edu_labels)  # Print compactly

{'**', '10', '10 , GED', '102', '11', '11 , GED', '11+', '12', '12+', '13',
 '13+', '14', '15', '16', '6', '7', '8', '9', "Associate's_Degree", 'College',
 'College_Doctoral', 'College_J.D.', "College_Master's", 'High_School_Diploma',
 'MOT_1', 'MOT_2', 'MOT_3', 'Some_College', 'XX', 'almost 12', 'unspecified'}


Other variables with different labels used in different corpora include `group` (child's developmental group) and `ses` (socioeconomic status). I will change the labels for these variables so that all corpora use the same set of labels:

### `mot_edu`

First, let's find out which labels each corpus uses: 

In [4]:
def check_labels(var):
    labels_by_corpus = {}
    corpus_set = set(data_idx.corpus)
    for c in corpus_set:
        labels_by_corpus[c] = set(data_idx[var][data_idx.corpus==c])
    return labels_by_corpus
    
cp.pprint(check_labels('mot_edu'))

{'Bates': {'unspecified'},
 'Bernstein': {'unspecified'},
 'Brown': {'unspecified'},
 'Clark': {'unspecified'},
 'Demetras2': {'unspecified'},
 'Gleason': {'unspecified'},
 'HSLLD': {'**', '10', '10 , GED', '102', '11', '11 , GED', '11+', '12', '12+',
           '13', '13+', '14', '15', '16', '6', '7', '8', '9', 'XX', 'almost 12',
           'unspecified'},
 'Hall': {'unspecified'},
 'Hicks': {'unspecified'},
 'Nelson': {'unspecified'},
 'NewmanRatner': {"Associate's_Degree", 'College', 'College_Doctoral',
                  'College_J.D.', "College_Master's", 'High_School_Diploma',
                  'Some_College', 'unspecified'},
 'Post': {'unspecified'},
 'VanHouten': {'unspecified', 'MOT_2', 'MOT_1', 'MOT_3'}}


Check label definitions on corpus' homepage:

In [5]:
corpus_homepages = {}
corpus_set = set(data_idx.corpus)

for c in corpus_set:
    url = "https://childes.talkbank.org/access/Eng-NA/" + c + ".html"
    corpus_homepages[c] = url
corpus_homepages

{'Bernstein': 'https://childes.talkbank.org/access/Eng-NA/Bernstein.html',
 'Nelson': 'https://childes.talkbank.org/access/Eng-NA/Nelson.html',
 'Post': 'https://childes.talkbank.org/access/Eng-NA/Post.html',
 'Clark': 'https://childes.talkbank.org/access/Eng-NA/Clark.html',
 'Bates': 'https://childes.talkbank.org/access/Eng-NA/Bates.html',
 'NewmanRatner': 'https://childes.talkbank.org/access/Eng-NA/NewmanRatner.html',
 'Gleason': 'https://childes.talkbank.org/access/Eng-NA/Gleason.html',
 'Hall': 'https://childes.talkbank.org/access/Eng-NA/Hall.html',
 'HSLLD': 'https://childes.talkbank.org/access/Eng-NA/HSLLD.html',
 'VanHouten': 'https://childes.talkbank.org/access/Eng-NA/VanHouten.html',
 'Demetras2': 'https://childes.talkbank.org/access/Eng-NA/Demetras2.html',
 'Brown': 'https://childes.talkbank.org/access/Eng-NA/Brown.html',
 'Hicks': 'https://childes.talkbank.org/access/Eng-NA/Hicks.html'}

#### Corpus info:

HSLLD:  
https://childes.talkbank.org/access/Eng-NA/HSLLD.html  
https://childes.talkbank.org/access/Eng-NA/0docs/Beals1993.pdf  
https://childes.talkbank.org/access/Eng-NA/0docs/Beals1995.pdf


NewmanRatner: 
https://childes.talkbank.org/access/Eng-NA/NewmanRatner.html  
https://www.cambridge.org/core/journals/journal-of-child-language/article/input-and-uptake-at-7-months-predicts-toddler-vocabulary-the-role-of-childdirected-speech-and-infant-processing-skills-in-language-development/10B3EE0531F60826E6315380E02B222E

VanHouten
https://childes.talkbank.org/access/Eng-NA/VanHouten.html  
https://childes.talkbank.org/access/Eng-NA/0docs/VanHouten1986.pdf  
Mother’s Education: Maternal educational level. 1 = completed junior high, 2 = completed high school, 3 = some post-secondary education

New labels:  
JH-  
HS-  
HS  
HS+  

AD  
UG-  
UG  

MS  
JD  
DR  


Map `mot_edu` to new labels and update `data_idx` with new labels:

In [6]:
#' mot_edu' label mapping
EDU_DICT = {
          'JH-':['6','7','8'],
          'HS-':['9','10','11','11+','almost 12','MOT_1'],
          'HS':['10 , GED','11 , GED','12','12+','High_School_Diploma','MOT_2'],
          'HS+':['13','13+','14','15','16','MOT_3'],
          'AD':["Associate's_Degree"],
          'UG-':['Some_College'],
          'UG':['College'], 
          'MS':["College_Master's"],
          'JD':['College_J.D.'],    
          'DR':['College_Doctoral'],
          'unspecified':['**','102','XX','unspecified']
           }

# Function to update 'mot_edu'
def update_mot_edu(data):
    for label in EDU_DICT:
        if data in EDU_DICT[label]:
            return label
        else: return data  # do not update label if not found in 'edu_dict'

# Update 'mot_edu'
data_idx.mot_edu = data_idx.mot_edu.map(update_mot_edu)

# Check if update was successful
cp.pprint(check_labels('mot_edu'))

{'Bates': {'unspecified'},
 'Bernstein': {'unspecified'},
 'Brown': {'unspecified'},
 'Clark': {'unspecified'},
 'Demetras2': {'unspecified'},
 'Gleason': {'unspecified'},
 'HSLLD': {'**', '10', '10 , GED', '102', '11', '11 , GED', '11+', '12', '12+',
           '13', '13+', '14', '15', '16', '9', 'JH-', 'XX', 'almost 12',
           'unspecified'},
 'Hall': {'unspecified'},
 'Hicks': {'unspecified'},
 'Nelson': {'unspecified'},
 'NewmanRatner': {"Associate's_Degree", 'College', 'College_Doctoral',
                  'College_J.D.', "College_Master's", 'High_School_Diploma',
                  'Some_College', 'unspecified'},
 'Post': {'unspecified'},
 'VanHouten': {'unspecified', 'MOT_2', 'MOT_1', 'MOT_3'}}


### `group`

In [7]:
cp.pprint(check_labels('group'))

{'Bates': {'TD'},
 'Bernstein': {'unspecified', 'TD'},
 'Brown': {'unspecified', 'TD'},
 'Clark': {'TD'},
 'Demetras2': {'unspecified', 'TD'},
 'Gleason': {'unspecified', 'typical', 'normal', 'TD'},
 'HSLLD': {'unspecified'},
 'Hall': {'unspecified', 'White,UC', 'TD'},
 'Hicks': {'unspecified'},
 'Nelson': {'unspecified'},
 'NewmanRatner': {'TD'},
 'Post': {'TD'},
 'VanHouten': {'MOT_Adolescent', 'MOT_Adolescent_', 'MOT_Older', 'MOT_Older_',
               'MOT_adolescent', 'MOT_older', 'TD', 'unspecified'}}


### `ses`

In [8]:
cp.pprint(check_labels('ses'))

{'Bates': {'MC'},
 'Bernstein': {'unspecified', 'MC'},
 'Brown': {'unspecified', 'MC'},
 'Clark': {'UC'},
 'Demetras2': {'WC', 'unspecified'},
 'Gleason': {'unspecified', 'MC'},
 'HSLLD': {'unspecified'},
 'Hall': {'unspecified', 'White,UC', 'Black,WC', 'Black,UC', 'White,WC'},
 'Hicks': {'LI', 'unspecified'},
 'Nelson': {'unspecified', 'MC'},
 'NewmanRatner': {'unspecified'},
 'Post': {'WC'},
 'VanHouten': {'unspecified'}}
