In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# obtaining the cleaned data file:
NS = pd.read_csv('Cleaned_NS.csv')

In [3]:
# obtaining columns of interest:
children_columns = [col for col in NS.columns if col.startswith('participant_ID') or ('children' in col)]

In [4]:
# creating the dataframe:
NS_children = NS[children_columns]

In [5]:
# let's find out more about this dataset:
NS_children.describe(include = 'all')

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17
count,4054,3202,2053,2078,2081,1985
unique,4054,3,35,29,30,32
top,ns 757,yes,0,1,1,0
freq,1,2331,948,829,876,999


In [6]:
# why are there huge number of unique values for the number of children?
NS_children.children_number_under_3.unique()

array([nan, 'none', '0', '1', '2', 'one', 'no child', 'non', 'two', '3',
       'no one', 'no', 'noone', '4', '22', '6', 'zero', 'o',
       "i'm currently pregnant", '-', '5', 'n/a.', 'n0ne',
       '0 adult children with disabilities', 'no children under 3', 'h',
       's', 'nill', '15', '/', 'i don’t have', '01', 'nothing to report',
       '2 children', '2m4', 'nothing'], dtype=object)

In [7]:
NS_children['children_number_3-5'].unique()

array([nan, 'none', '0', '1', '2', 'one', 'no', 'no one', 'non', '3',
       'nil', 'two', 'o', '-', 'zero', 'n/a.', '6', '1 children', 'h',
       'nill', '1p', 'yes', '/', '4', 'i don’t have', '01', '1 child',
       'my grandson', '1$', '02'], dtype=object)

In [8]:
NS_children['children_number_6-12'].unique()

array([nan, 'none', '1', '2', '0', '4', 'one', '3', '6', 'no', 'non',
       'two', 'nil', ',0', 'zero', 'two.', '5', 'no there', 'o',
       'no children age 6-12', 'h', 'no one', 'or', '12', 'i don’t have',
       '20', '01', '2 children', '00', '03', 'nothing'], dtype=object)

In [9]:
NS_children['children_number_13-17'].unique()

array([nan, '1', '0', '2', '3', 'none', 'one', 'no child', '4', 'no one',
       'no', '6', 'three', 'nil', '5', 'zero', 'two', 'o', '-', 'n/a.',
       '10', 'yes one', 'yes', '3 children', 'h', 'j', 'non', '/', '7',
       'i don’t have', '00', 'nothing to report', 'nothing'], dtype=object)

In [10]:
# Combine unique answers from all columns into a single list
unique_answers = pd.unique(NS_children[['children_number_under_3', 
                                        'children_number_3-5', 
                                        'children_number_6-12', 
                                        'children_number_13-17']].values.ravel())

In [11]:
unique_answers

array([nan, 'none', '1', '0', '2', '3', '4', 'one', 'no child', 'non',
       'two', '6', 'no', 'no one', 'noone', '22', 'three', 'nil', '5',
       'zero', 'o', "i'm currently pregnant", '-', ',0', 'n/a.', 'two.',
       'n0ne', '10', 'no there', 'yes one', 'yes',
       '0 adult children with disabilities', 'no children under 3',
       '1 children', 'no children age 6-12', '3 children', 'h', 'j', 's',
       'nill', '1p', '15', 'or', '/', '12', '7', 'i don’t have', '20',
       '00', '01', 'nothing to report', '2 children', '1 child',
       'my grandson', '1$', '2m4', '02', '03', 'nothing'], dtype=object)

I'll be making a function to map the provided answers to acceptable answers.

In [12]:
def check_number(value):
    
    numbers_words = {'zero': 0,
                      'one': 1,
                      'two' : 2,
                      'three': 3,
                      'four': 4,
                      'five': 5,
                      'six' : 6,
                      'seven' : 7,
                      'grandson': 1
                    }
    mapping_update = {}
    
    check_data = ['o','s','j','h','or', '1$','1p' , '2m4']
    
    try:
        value = int(value)
        
        if value < 10:
            mapping_update.update({x : value})
        else:
            mapping_update.update({x : 'check data quality'})
            
    except:
      # print(f'{x} cannot be an int')
        if value in numbers_words.keys(): 
            mapping_update.update({x : numbers_words[value]})
        elif value in check_data:
            mapping_update.update({x : 'check data quality'})
        else:
            mapping_update.update({x : 0})            
            
    return mapping_update

In [13]:
mapping = {}
for x in unique_answers:
    if type(x) == float:
        continue
 
    else:
        delete = ',/.- '
        y = x.strip(delete)
        y = y.split()
        
        if len(y) > 0 and len(y) < 3:
            if y[0] == 'yes' or y[0] == 'my':
                try:
                    mapping_update = check_number(y[1])
                except:
                    mapping_update = {y[0] : 'at least 1'}
            else:
                mapping_update = check_number(y[0])
            
            mapping.update(mapping_update)

        else:
            mapping.update({x : 0})   
        
        if x not in mapping.keys():
            print(x)
            
print(mapping)

{'none': 0, '1': 1, '0': 0, '2': 2, '3': 3, '4': 4, 'one': 1, 'no child': 0, 'non': 0, 'two': 2, '6': 6, 'no': 0, 'no one': 0, 'noone': 0, '22': 'check data quality', 'three': 3, 'nil': 0, '5': 5, 'zero': 0, 'o': 'check data quality', "i'm currently pregnant": 0, '-': 0, ',0': 0, 'n/a.': 0, 'two.': 2, 'n0ne': 0, '10': 'check data quality', 'no there': 0, 'yes one': 1, 'yes': 'at least 1', '0 adult children with disabilities': 0, 'no children under 3': 0, '1 children': 1, 'no children age 6-12': 0, '3 children': 3, 'h': 'check data quality', 'j': 'check data quality', 's': 'check data quality', 'nill': 0, '1p': 'check data quality', '15': 'check data quality', 'or': 'check data quality', '/': 0, '12': 'check data quality', '7': 7, 'i don’t have': 0, '20': 'check data quality', '00': 0, '01': 1, 'nothing to report': 0, '2 children': 2, '1 child': 1, 'my grandson': 1, '1$': 'check data quality', '2m4': 'check data quality', '02': 2, '03': 3, 'nothing': 0}


In [14]:
# Apply mapping to specific columns
columns_to_map = ['children_number_under_3', 
             'children_number_3-5',
             'children_number_6-12',
             'children_number_13-17']

for col in columns_to_map:
     NS_children[col] =  NS_children[col].map(mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children[col] =  NS_children[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children[col] =  NS_children[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children[col] =  NS_children[col].map(mapping)
A value is trying to be set on a copy of a slice from a

In [15]:
NS_children.describe(include='all')

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17
count,4054,3202,2053,2078,2081,1985
unique,4054,3,8,8,8,10
top,ns 757,yes,0,0,1,0
freq,1,2331,1133,968,896,1118


In [16]:
# From the above, let's see what the answers were to each question:
NS_children.children_number_under_3.unique()

array([nan, 0, 1, 2, 3, 4, 'check data quality', 6, 5], dtype=object)

In [17]:
NS_children['children_number_3-5'].unique()

array([nan, 0, 1, 2, 3, 'check data quality', 6, 'at least 1', 4],
      dtype=object)

In [18]:
NS_children['children_number_6-12'].unique()

array([nan, 0, 1, 2, 4, 3, 6, 5, 'check data quality'], dtype=object)

In [19]:
NS_children['children_number_13-17'].unique()

array([nan, 1, 0, 2, 3, 4, 6, 5, 'check data quality', 'at least 1', 7],
      dtype=object)

In [20]:
# From the above, let's see what the answers were to have_children:
NS_children.have_children.value_counts()

yes                     2331
no                       848
prefer not to answer      23
Name: have_children, dtype: int64

In [21]:
# how many left the questions unanswered?
NS_children.isna().sum()

participant_ID                0
have_children               852
children_number_under_3    2001
children_number_3-5        1976
children_number_6-12       1973
children_number_13-17      2069
dtype: int64

# Replacing the nans:
we'll use the following rule:
* if the person did not answer any of the questions, then the nans in all columns will be replaced with "prefer not to answer".
* if the person answered any of the questions with a number, then the nan will be replaced with "0" for the "number of children" questions, and "yes" or "no" to "have children" question based on total number of children.

In [22]:
# Check all columns are not answered:
prefer_not_to_answer = NS_children[(NS_children['have_children'].isna() == True)
                                   &(NS_children['children_number_under_3'].isna() == True)
                                   &(NS_children['children_number_3-5'].isna() == True)
                                   & (NS_children['children_number_6-12'].isna() == True)
                                   & (NS_children['children_number_13-17'].isna() == True)]

In [23]:
len(prefer_not_to_answer)

848

In [24]:
# replacing all values above with prefer not to answer:
for x in prefer_not_to_answer.index:
    NS_children.loc[x].fillna('prefer not to answer', inplace = True)

In [25]:
# Check all columns are not answered, but have children is answered with no:
no_kids = NS_children[(NS_children['have_children'] == 'no')
                                   &(NS_children['children_number_under_3'].isna() == True)
                                   &(NS_children['children_number_3-5'].isna() == True)
                                   & (NS_children['children_number_6-12'].isna() == True)
                                   & (NS_children['children_number_13-17'].isna() == True)]

In [26]:
len(no_kids)

837

In [27]:
# All the above NaNs in no_kids will be replaced with 0:
for x in no_kids.index:
    NS_children.loc[x].fillna(0, inplace = True)

In [28]:
# Check all columns are not answered, but have children is indicated as 'prefer not to answer':
indicated_prefer_not_to_answer = NS_children[(NS_children['have_children'] == 'prefer not to answer')                                   
                                             &((NS_children['children_number_under_3'].isna() == True)
                                             &(NS_children['children_number_3-5'].isna() == True)
                                             & (NS_children['children_number_6-12'].isna() == True)
                                             & (NS_children['children_number_13-17'].isna() == True))]

In [29]:
len(indicated_prefer_not_to_answer)

23

In [30]:
# for the above NaNs, i'll replace all of them with prefer not to answer:
for x in indicated_prefer_not_to_answer.index:
    NS_children.loc[x].fillna('prefer not to answer', inplace = True)

In [31]:
# Check all columns are not answered but have children is yes:
yes_but_no_details = NS_children[(NS_children['have_children'] == 'yes')                                   
                                             & ((NS_children['children_number_under_3'].isna() == True)
                                             &(NS_children['children_number_3-5'].isna() == True)
                                             & (NS_children['children_number_6-12'].isna() == True)
                                             & (NS_children['children_number_13-17'].isna() == True))]

In [32]:
len(yes_but_no_details)

70

In [33]:
# replacig the nans with "unknown":
for x in yes_but_no_details.index:
    NS_children.loc[x].fillna('unknown', inplace = True)

In [34]:
NS_children.isna().sum()

participant_ID               0
have_children                4
children_number_under_3    223
children_number_3-5        198
children_number_6-12       195
children_number_13-17      291
dtype: int64

In [35]:
# Check if have children is yes and details are provided:
yes_with_details = NS_children[(NS_children['have_children'] == 'yes')                                   
                                             & ((NS_children['children_number_under_3'].isna() == True)
                                             |(NS_children['children_number_3-5'].isna() == True)
                                             | (NS_children['children_number_6-12'].isna() == True)
                                             | (NS_children['children_number_13-17'].isna() == True))]

In [36]:
for x in yes_with_details.index:
    if x not in yes_but_no_details:
        NS_children.loc[x].fillna(0, inplace = True)

In [37]:
NS_children.isna().sum()

participant_ID             0
have_children              4
children_number_under_3    0
children_number_3-5        0
children_number_6-12       0
children_number_13-17      0
dtype: int64

In [38]:
# let's find out more about the null values remaining in the have children column:

In [39]:
NS_children[NS_children.have_children.isna() == True]

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17
1314,ns 1068,,1,3,3,1
3307,ns 3749,,0,0,0,0
3391,ns 3834,,0,0,0,1
4026,ns 4022,,0,2,0,2


I'll need to replace the NaNs in the have_children column based on the sum of total number of children. let's add a column for the total number of children first.

In [40]:
NS_children['children_total'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'] = np.nan


In [41]:
# let's find out the total number of children:
for x in NS_children.index:
    if x in prefer_not_to_answer.index or x in indicated_prefer_not_to_answer.index:
        NS_children['children_total'].loc[x] = 'prefer not to answer'
        
    elif x in yes_but_no_details.index:
        NS_children['children_total'].loc[x] = 'unsure'
        
    else:
        try:
            NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
            if NS_children['children_total'].loc[x] == 4*'check data quality':
                NS_children['children_total'].loc[x] = 'check data quality'
                           
        except:
            NS_children['children_total'].loc[x] = 'check data quality'
                 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = 'prefer not to answer'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']]

In [42]:
NS_children.head()

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17,children_total
0,ns 757,prefer not to answer,prefer not to answer,prefer not to answer,prefer not to answer,prefer not to answer,prefer not to answer
1,ns 318,prefer not to answer,prefer not to answer,prefer not to answer,prefer not to answer,prefer not to answer,prefer not to answer
2,ns 328,no,0,0,0,0,0
3,ns 646,yes,0,0,0,1,1
4,ns 678,no,0,0,0,0,0


In [43]:
NS_children.describe()

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17,children_total
count,4054,4050,4054,4054,4054,4054,4054
unique,4054,3,10,10,10,12,17
top,ns 757,yes,0,0,0,0,0
freq,1,2331,2193,2003,1806,2246,873


In [44]:
NS_children.children_total.value_counts()

0                       873
prefer not to answer    871
2                       733
1                       687
3                       332
4                       221
5                       119
unsure                   70
6                        63
check data quality       26
7                        22
8                        13
9                        11
10                        5
12                        4
13                        3
11                        1
Name: children_total, dtype: int64

before we dive into checking the data quality. let's replace the nans in the have children column based on the total children data:

In [45]:
for x in NS_children[NS_children.have_children.isna() == True].index:
    NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()

    if NS_children.children_total.loc[x] > 0:
        NS_children.have_children.loc[x] = 'yes'
    else:
        NS_children.have_children.loc[x] = 'no'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children.have_children.loc[x] = 'yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
A value

In [46]:
NS_children.isna().sum()

participant_ID             0
have_children              0
children_number_under_3    0
children_number_3-5        0
children_number_6-12       0
children_number_13-17      0
children_total             0
dtype: int64

# Check data quality:

In [47]:
# Check if have children is answered with no but the number of children is not 0:
no_kids_check = NS_children[(NS_children['have_children'] == 'no')
                                   & (
                                       ((NS_children['children_number_under_3'].isna() == False) & (NS_children['children_number_under_3'] != 0))
                                     | ((NS_children['children_number_3-5'].isna() == False) & (NS_children['children_number_3-5'] != 0))
                                     | ((NS_children['children_number_6-12'].isna() == False) & (NS_children['children_number_6-12'] != 0))
                                     | ((NS_children['children_number_13-17'].isna() == False) & (NS_children['children_number_13-17'] != 0))
                                   )] 

In [48]:
no_kids_check

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17,children_total
3548,ns 3684,no,0,1,1,0,2
3560,ns 3696,no,1,0,0,0,1


In [49]:
# we'll check to see if these clients get CCB to decide whether or not to change the answer to "have children" as yes.
NS.loc[no_kids_check.index][['age','income_source_CCB']]

Unnamed: 0,age,income_source_CCB
3548,31-40,
3560,21-30,


Since CCB was not indicated as a source of income, we'll keep the "have children" as no, but will change the children_total to "check data quality".

In [50]:
for x in no_kids_check.index:
    NS_children.children_total.loc[x] = 'check data quality'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children.children_total.loc[x] = 'check data quality'


In [51]:
# find values where data quality may be a question:
check_quality = NS_children[(NS_children['children_total'] == 'check data quality')]

In [52]:
len(check_quality)

28

In [53]:
# obtain an original copy of the NS_children dataframe:
NS_children_original =  NS[children_columns]

In [54]:
# Check the answers at the indecies where check data quality is indicated:
check_quality_original = NS_children_original.loc[check_quality.index]

In [55]:
# replace the 'o' and '2m4' with 0, and '1p' and '1$' with 1:
for x in check_quality.index:
    for column in check_quality.columns:
        if column == 'children_total':
            try:
                NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
            
                if NS_children['children_total'].loc[x] == 4*'check data quality':
                    NS_children['children_total'].loc[x] = 'check data quality'
                
                elif x in no_kids_check.index:
                    NS_children.children_total.loc[x] = 'check data quality'
            
            except:
                NS_children['children_total'].loc[x] = 'check data quality'
                 
        else:
            if check_quality_original[column].loc[x] in ['o', '2m4']:
                NS_children[column].loc[x] = 0
            elif check_quality_original[column].loc[x] in ['1p', '1$']:
                NS_children[column].loc[x] = 1

print('done quality check')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = 'check data quality'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children[column].loc[x] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.p

done quality check


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children[column].loc[x] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS_children['children_total'].loc[x] = NS_children[['children_number_under_3','children_number_3-5', 'children_number_6-12', 'children_number_13-17']].loc[x].sum()
A value is trying

In [56]:
second_quality_check = NS_children[NS_children.children_total == 'check data quality']

In [57]:
len(second_quality_check)

12

In [58]:
NS_children.describe()

Unnamed: 0,participant_ID,have_children,children_number_under_3,children_number_3-5,children_number_6-12,children_number_13-17,children_total
count,4054,4054,4054,4054,4054,4054,4054
unique,4054,3,10,10,10,12,17
top,ns 757,yes,0,0,0,0,0
freq,1,2334,2203,2008,1809,2250,876


In [59]:
NS_children.isna().sum()

participant_ID             0
have_children              0
children_number_under_3    0
children_number_3-5        0
children_number_6-12       0
children_number_13-17      0
children_total             0
dtype: int64

In [60]:
# Saving the final children dataframe as a csv file.
NS_children.to_csv('NS_children_final.csv', index = False)

In [1]:
NS_children

NameError: name 'NS_children' is not defined