# Data preprocessing

Man Ho Wong | March 21, 2022


In [None]:
# Python libraries and packages required
import numpy as np
import pandas as pd
import pickle
import pylangacq
from tqdm import tqdm            # For showing progress bar
import matplotlib.pyplot as plt
plt.style.use('ggplot')          # use ggplot style for figures

# Pretty printing settings
import pprint
cp = pprint.PrettyPrinter(compact=True)


# Unpickle data
data = pickle.load(open('../data/childes/corpus_info.pkl', 'rb'))
search_result = data[0]
data_idx = data[1]

`data_idx` is a `DataFrame` created by [data_curation.ipynb](Data_Science/Child-Vocab-Development/codes/data_curation.ipynb). It contains basic information about the files in the curated dataset. 

In [2]:
data_idx

Unnamed: 0,file_path,corpus,year,participants,name,age_d,age_m,sex,group,ses,mot_edu
0,../data/childes/Bates/Free20/amy.cha,Bates,,"{MOT, CHI}",Target_Child,600,20.0,female,TD,MC,unspecified
1,../data/childes/Bates/Free20/betty.cha,Bates,,"{MOT, CHI}",Betty,600,20.0,female,TD,MC,unspecified
2,../data/childes/Bates/Free20/chuck.cha,Bates,,"{MOT, CHI}",Chuck,600,20.0,male,TD,MC,unspecified
3,../data/childes/Bates/Free20/doug.cha,Bates,,"{MOT, CHI}",Doug,600,20.0,male,TD,MC,unspecified
4,../data/childes/Bates/Free20/ed.cha,Bates,,"{MOT, CHI}",Ed,600,20.0,male,TD,MC,unspecified
...,...,...,...,...,...,...,...,...,...,...,...
2606,../data/childes/VanHouten/Twos/teaching/parkt.cha,VanHouten,,"{MOT, CHI}",Matthew,840,28.0,male,MOT_Older,unspecified,unspecified
2607,../data/childes/VanHouten/Twos/teaching/pricet...,VanHouten,,"{MOT, CHI}",Peter,840,28.0,male,MOT_adolescent,unspecified,unspecified
2608,../data/childes/VanHouten/Twos/teaching/raidt.cha,VanHouten,,"{MOT, CHI}",Tommy,840,28.0,male,MOT_older,unspecified,unspecified
2609,../data/childes/VanHouten/Twos/teaching/riott.cha,VanHouten,,"{MOT, CHI}",Robert,840,28.0,male,MOT_Adolescent,unspecified,unspecified


# Data cleaning and integration

As mentioned in data_curation.ipynb, not all corpora use the same labels for some variables. For example, some corpora use school grade levels to define mother's education, and some use levels of education (e.g. 'college'):

In [15]:
mot_edu_labels = set(data_idx.mot_edu)
cp.pprint(mot_edu_labels)  # Print compactly

{'**', '10', '10 , GED', '102', '11', '11 , GED', '11+', '12', '12+', '13',
 '13+', '14', '15', '16', '6', '7', '8', '9', "Associate's_Degree", 'College',
 'College_Doctoral', 'College_J.D.', "College_Master's", 'High_School_Diploma',
 'MOT_1', 'MOT_2', 'MOT_3', 'Some_College', 'XX', 'almost 12', 'unspecified'}


Other variables with different labels used in different corpora include `group` (child's developmental group) and `ses` (socioeconomic status). I will change the labels for these variables so that all corpora use the same set of labels:

### `mot_edu`

First, let's find out which labels each corpus uses: 

In [34]:
labels_by_corpus = {}

corpus_set = set(data_idx.corpus)
for c in corpus_set:
    labels_by_corpus[c] = set(data_idx.mot_edu[data_idx.corpus==c])
    
cp.pprint(labels_by_corpus)

{'Bates': {'unspecified'},
 'Bernstein': {'unspecified'},
 'Brown': {'unspecified'},
 'Clark': {'unspecified'},
 'Demetras2': {'unspecified'},
 'Gleason': {'unspecified'},
 'HSLLD': {'**', '10', '10 , GED', '102', '11', '11 , GED', '11+', '12', '12+',
           '13', '13+', '14', '15', '16', '6', '7', '8', '9', 'XX', 'almost 12',
           'unspecified'},
 'Hall': {'unspecified'},
 'Hicks': {'unspecified'},
 'Nelson': {'unspecified'},
 'NewmanRatner': {"Associate's_Degree", 'College', 'College_Doctoral',
                  'College_J.D.', "College_Master's", 'High_School_Diploma',
                  'Some_College', 'unspecified'},
 'Post': {'unspecified'},
 'VanHouten': {'MOT_1', 'MOT_3', 'MOT_2', 'unspecified'}}



HSLLD:  
https://childes.talkbank.org/access/Eng-NA/HSLLD.html  
https://childes.talkbank.org/access/Eng-NA/0docs/Beals1993.pdf  
https://childes.talkbank.org/access/Eng-NA/0docs/Beals1995.pdf


NewmanRatner: 
https://childes.talkbank.org/access/Eng-NA/NewmanRatner.html  
https://www.cambridge.org/core/journals/journal-of-child-language/article/input-and-uptake-at-7-months-predicts-toddler-vocabulary-the-role-of-childdirected-speech-and-infant-processing-skills-in-language-development/10B3EE0531F60826E6315380E02B222E

VanHouten
https://childes.talkbank.org/access/Eng-NA/VanHouten.html  
https://childes.talkbank.org/access/Eng-NA/0docs/VanHouten1986.pdf  
Mother’s Education: Maternal educational level. 1 = completed junior high, 2 = completed high school, 3 = some post-secondary education



In [31]:
data_idx.group.unique()
data_idx.ses.unique()


{'**',
 '10',
 '10 , GED',
 '102',
 '11',
 '11 , GED',
 '11+',
 '12',
 '12+',
 '13',
 '13+',
 '14',
 '15',
 '16',
 '6',
 '7',
 '8',
 '9',
 'XX',
 'almost 12',
 'unspecified'}

In [28]:
corpus_set


{'Bates',
 'Bernstein',
 'Brown',
 'Clark',
 'Demetras2',
 'Gleason',
 'HSLLD',
 'Hall',
 'Hicks',
 'Nelson',
 'NewmanRatner',
 'Post',
 'VanHouten'}