In this notebook, I will carry out an initial cleaning of the Neighbour Survey results:

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Neighbour Survey export into a DataFrame:
NS = pd.read_csv(filepath_or_buffer='neighbour_survey_clean-2024-06-14.csv')

In [3]:
# Preview the first rows of the freshly loaded data.
NS.head()

Unnamed: 0,id,q001,q002,q003,q004,q005,q006,q007,q008,q009,...,q041m,q041n,q041o,q041p,q041q,q041r,q041s,q041t,q041u,q042
0,NS 757,English,No,Often true,Often true,Yes,Almost every month,Yes,Yes,More than 10 years ago,...,,,,,,,,,,There should be some accountability for places...
1,NS 318,English,No,Sometimes true,Often true,Yes,Almost every month,Yes,Yes,5 to 10 years ago,...,,Private pension,,,Short/long term disability,,,,,
2,NS 328,English,No,Sometimes true,Sometimes true,No,,Yes,Don't know,5 to 10 years ago,...,Canadian Pension Plan (CPP),,,,,,,,,Not at the moment
3,NS 646,English,No,Often true,Often true,Yes,Some months but not every month,Yes,Yes,Less than 1 year ago,...,,,,,,,,,,No
4,NS 678,English,Prefer not to answer,Sometimes true,Sometimes true,Yes,Some months but not every month,Prefer not to answer,Yes,3 to 4 years ago,...,,,Old Age Security (OAS),,,,,,,Keep up the good work yhat u do


In [4]:
# Normalise every string (object) column: lower-case the text and trim the
# whitespace on both ends in a single chained pass.
for column in NS.select_dtypes(include=['object']).columns:
    NS[column] = NS[column].str.lower().str.strip()

In [5]:
# Preview again to confirm the text columns are now lower-cased.
NS.head()

Unnamed: 0,id,q001,q002,q003,q004,q005,q006,q007,q008,q009,...,q041m,q041n,q041o,q041p,q041q,q041r,q041s,q041t,q041u,q042
0,ns 757,english,no,often true,often true,yes,almost every month,yes,yes,more than 10 years ago,...,,,,,,,,,,there should be some accountability for places...
1,ns 318,english,no,sometimes true,often true,yes,almost every month,yes,yes,5 to 10 years ago,...,,private pension,,,short/long term disability,,,,,
2,ns 328,english,no,sometimes true,sometimes true,no,,yes,don't know,5 to 10 years ago,...,canadian pension plan (cpp),,,,,,,,,not at the moment
3,ns 646,english,no,often true,often true,yes,some months but not every month,yes,yes,less than 1 year ago,...,,,,,,,,,,no
4,ns 678,english,prefer not to answer,sometimes true,sometimes true,yes,some months but not every month,prefer not to answer,yes,3 to 4 years ago,...,,,old age security (oas),,,,,,,keep up the good work yhat u do


In [6]:
# to find out how many data points we have
# (describe() reports count / unique / top / freq per column here):
NS.describe()

Unnamed: 0,id,q001,q002,q003,q004,q005,q006,q007,q008,q009,...,q041m,q041n,q041o,q041p,q041q,q041r,q041s,q041t,q041u,q042
count,4057,4054,4036,4026,4027,4040,2869,4018,4019,4037,...,99,41,81,28,32,88,220,192,67,2221
unique,4051,4,3,5,5,4,6,4,4,6,...,1,1,1,1,1,1,1,1,60,658
top,ns,english,no,sometimes true,sometimes true,yes,some months but not every month,yes,yes,1 to 2 years ago,...,canadian pension plan (cpp),private pension,old age security (oas),workplace safety and insurance board (wsib),short/long term disability,other government programs,no income,prefer not to answer,self employed,no
freq,3,3387,2874,1931,1907,2902,1247,2965,2508,1148,...,99,41,81,28,32,88,220,192,5,883


In [7]:
# Remove rows that are exact copies of an earlier row (the first occurrence is kept):
NS = NS.drop_duplicates(keep='first')

In [8]:
# Summary statistics again, after removing the duplicated rows.
NS.describe()

Unnamed: 0,id,q001,q002,q003,q004,q005,q006,q007,q008,q009,...,q041m,q041n,q041o,q041p,q041q,q041r,q041s,q041t,q041u,q042
count,4055,4054,4036,4026,4027,4040,2869,4018,4019,4037,...,99,41,81,28,32,88,220,192,67,2221
unique,4051,4,3,5,5,4,6,4,4,6,...,1,1,1,1,1,1,1,1,60,658
top,ns 3980,english,no,sometimes true,sometimes true,yes,some months but not every month,yes,yes,1 to 2 years ago,...,canadian pension plan (cpp),private pension,old age security (oas),workplace safety and insurance board (wsib),short/long term disability,other government programs,no income,prefer not to answer,self employed,no
freq,2,3387,2874,1931,1907,2902,1247,2965,2508,1148,...,99,41,81,28,32,88,220,192,5,883


Why do we have only 4051 unique IDs? Let's check them out!

In [9]:
# Flag every row whose id occurs more than once (all occurrences, not just the repeats)
shares_an_id = NS['id'].duplicated(keep=False)
duplicate_ids = NS[shares_an_id]

# Show the rows that share an id
duplicate_ids

Unnamed: 0,id,q001,q002,q003,q004,q005,q006,q007,q008,q009,...,q041m,q041n,q041o,q041p,q041q,q041r,q041s,q041t,q041u,q042
32,ns 3980,english,yes,sometimes true,sometimes true,yes,some months but not every month,yes,yes,more than 10 years ago,...,,,,,,,,,,
95,ns 3983,english,no,often true,often true,yes,almost every month,yes,yes,3 to 4 years ago,...,,,,,,,,,,thank you for the food support
2122,ns 3981,english,no,often true,often true,yes,almost every month,yes,yes,3 to 4 years ago,...,,,,,,,,,,thank you
2140,ns 3982,english,no,sometimes true,often true,yes,almost every month,yes,yes,1 to 2 years ago,...,canadian pension plan (cpp),private pension,old age security (oas),,,,,,,your help is appreciated thanks fpr help
3973,ns 3980,english,yes,sometimes true,sometimes true,yes,some months but not every month,yes,yes,more than 10 years ago,...,,,,,,,,,,
3980,ns 3983,english,no,often true,often true,yes,almost every month,yes,yes,3 to 4 years ago,...,,,,,,,,,,thank you for the food support
4043,ns 3981,english,no,often true,sometimes true,yes,almost every month,yes,yes,3 to 4 years ago,...,,,,,,,,,,
4059,ns 3982,english,no,sometimes true,often true,yes,almost every month,yes,yes,1 to 2 years ago,...,canadian pension plan (cpp),private pension,old age security (oas),,,,,,,your help is appreciated thanks fpr help


In [10]:
# Bundle the duplicated rows per id so each id can be inspected as one group
grouped = duplicate_ids.groupby(by='id')

In [11]:
# Function to check if all rows in a group are identical
def check_identical(group):
    """Return True when every row of ``group`` holds the same values.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to compare (here: the records that share one id).

    Returns
    -------
    bool
        True if the largest number of distinct values found in any column is 1.
    """
    # dropna=False counts NaN as a value of its own. With the default
    # (dropna=True) NaN is ignored, so two rows that differ only by
    # "answer vs. missing" would wrongly be reported as identical.
    return group.nunique(dropna=False).max() == 1

In [12]:
# Walk through the id groups and report the ones whose rows are not all the same
for name, group in grouped:
    if check_identical(group):
        continue
    print(f"ID {name} has different values at {group.id.index}")
        

ID ns 3980 has different values at Int64Index([32, 3973], dtype='int64')
ID ns 3981 has different values at Int64Index([2122, 4043], dtype='int64')
ID ns 3982 has different values at Int64Index([2140, 4059], dtype='int64')
ID ns 3983 has different values at Int64Index([95, 3980], dtype='int64')


In [13]:
# Since the data is different, I'll keep the data and change the id numbers at the later index:

# first, I'll get the indices to change (the second row of each duplicated id):
list_to_change = []

for name, group in grouped:
    list_to_change.append((name, group.index[1]))

# then, I'll change the values at the indices.
# NS.loc[row, 'id'] writes directly into NS. The chained form `NS.id.loc[x] = ...`
# assigns to an intermediate Series, which pandas warns about and which leaves NS
# unchanged when Copy-on-Write is enabled.
for name, x in list_to_change:
    NS.loc[x, 'id'] = 'x' + name
    print(NS.loc[x, 'id'])

xns 3980
xns 3981
xns 3982
xns 3983


# Checking out null values:

In [14]:
# Count the null values in each column:
NS.isna().sum()

id          1
q001        2
q002       20
q003       30
q004       29
         ... 
q041r    3968
q041s    3836
q041t    3864
q041u    3989
q042     1835
Length: 136, dtype: int64

In order to find ways to fill the null values, I will first look into the questions.

In [15]:
# Mapping from the raw question codes (q001, q002, ...) to descriptive column names.
# Entries are kept in the same order as the question codes; lettered codes
# (e.g. q014a-q014h) share a common prefix in their new name.
columns = {
    'id': 'participant_ID',
    'q001': 'Language_of_Survey',
    'q002': 'Enough_income',
    'q003': 'Enough_food',
    'q004': 'Afford_NBM',
    'q005': 'skip_meal',
    'q006': 'frequency_in_year',
    'q007': 'eat_less',
    'q008': 'hungry',
    'q009': 'first_use_years',
    'q010': 'frequency_of_visits_to_OFB',
    'q011a': 'frequency_to_non_OFB',
    'q011b': 'frequency_to_non_OFB_other',
    'q012': 'which_food_bank',
    'q013': 'how_many_programs_per_month',
    # q014a-h -> access_difficulty_*
    'q014a': 'access_difficulty_none',
    'q014b': 'access_difficulty_language',
    'q014c': 'access_difficulty_physical_disability',
    'q014d': 'access_difficulty_safety_concerns',
    'q014e': 'access_difficulty_transportation',
    'q014f': 'access_difficulty_hours_of_operation',
    'q014g': 'access_difficulty_prefer_not_to_answer',
    'q014h': 'access_difficulty_others',
    'q015': 'time_to_get_to_most_visited_OFB',
    # q016a-f -> mode_transportation_*
    'q016a': 'mode_transportation_walk',
    'q016b': 'mode_transportation_cycle',
    'q016c': 'mode_transportation_public',
    'q016d': 'mode_transportation_private',
    'q016e': 'mode_transportation_prefer_not_to_answer',
    'q016f': 'mode_transportation_others',
    # q017a-k -> other_services_*
    'q017a': 'other_services_housing_utilities',
    'q017b': 'other_services_computers_internet',
    'q017c': 'other_services_tax_clinic_financial_services',
    'q017d': 'other_services_legal',
    'q017e': 'other_services_navigation',
    'q017f': 'other_services_employment_income_support',
    'q017g': 'other_services_education',
    'q017h': 'other_services_childcare',
    'q017i': 'other_services_none',
    'q017j': 'other_services_prefer_not_to_answer',
    'q017k': 'other_services_other',
    'q018': 'new_service_suggestions',
    'q019': 'can_exercise_regularly',
    'q020': 'if_yes_how_often_exercise',
    # q021a-f -> disease_*
    'q021a': 'disease_prefer_not_to_answer',
    'q021b': 'disease_diabetes',
    'q021c': 'disease_high_blood_pressure',
    'q021d': 'disease_heart',
    'q021e': 'disease_none',
    'q021f': 'disease_other',
    'q022': 'pregnant',
    # q023a-f -> service_enough_*
    'q023a': 'service_enough_none_of_the_above',
    'q023b': 'service_enough_yes',
    'q023c': 'service_enough_no',
    'q023d': 'service_enough_sometime',
    'q023e': 'service_enough_prefer_not_to_answer',
    'q023f': 'service_enough_other',
    # q024a-i -> food_type_*
    'q024a': 'food_type_halal',
    'q024b': 'food_type_kosher',
    'q024c': 'food_type_vegan_vegetarian',
    'q024d': 'food_type_medical_condition',
    'q024e': 'food_type_allergen_free',
    'q024f': 'food_type_country',
    'q024g': 'food_type_not_special',
    'q024h': 'food_type_prefer_not_to_answer',
    'q024i': 'food_type_other',
    'q025': 'age',
    'q026': 'number_household_members',
    'q027': 'have_children',
    # q028a-d -> breastfeeding_*
    'q028a': 'breastfeeding_yes',
    'q028b': 'breastfeeding_no',
    'q028c': 'breastfeeding_na',
    'q028d': 'breastfeeding_prefer_not_to_answer',
    # q029-q032 -> children_number_*
    'q029': 'children_number_under_3',
    'q030': 'children_number_3-5',
    'q031': 'children_number_6-12',
    'q032': 'children_number_13-17',
    # q033a-i -> education_*
    'q033a': 'education_high_school_some',
    'q033b': 'education_high_school_completed',
    'q033c': 'education_college_some',
    'q033d': 'education_college_completed',
    'q033e': 'education_trades',
    'q033f': 'education_graduate_education_some',
    'q033g': 'education_graduate_education_completed',
    'q033h': 'education_professional_degree',
    'q033i': 'education_prefer_not_to_answer',
    'q034': 'education_outside_Canada',
    'q035a': 'status_in_canada',
    'q035b': 'status_in_canada_other',
    'q036a': 'gender',
    'q036b': 'gender_other',
    'q037': 'time_in_Canada',
    # q038a-k -> ethnicity_*
    'q038a': 'ethnicity_Indigenous',
    'q038b': 'ethnicity_White/European',
    'q038c': 'ethnicity_Black_African_Caribbean',
    'q038d': 'ethnicity_Southeast_Asian',
    'q038e': 'ethnicity_East_Asian',
    'q038f': 'ethnicity_South_Asian',
    'q038g': 'ethnicity_Middle_Eastern',
    'q038h': 'ethnicity_Latin_American',
    'q038i': 'ethnicity_do_not_know',
    'q038j': 'ethnicity_prefer_not_to_answer',
    'q038k': 'ethnicity_other',
    # q039a-i -> disability_*
    'q039a': 'disability_none',
    'q039b': 'disability_physical',
    'q039c': 'disability_chronic_pain',
    'q039d': 'disability_sensory',
    'q039e': 'disability_developmental',
    'q039f': 'disability_learning',
    'q039g': 'disability_mental',
    'q039h': 'disability_prefer_not_to_answer',
    'q039i': 'disability_other',
    # q040a-b -> housing_type_*
    'q040a': 'housing_type_private',
    'q040b': 'housing_type_other',
    # q041a-u -> income_source_*
    'q041a': 'income_source_employed_35_hours_plus',
    'q041b': 'income_source_employed_less_35_hours',
    'q041c': 'income_source_ODSP',
    'q041d': 'income_source_OW',
    'q041e': 'income_source_CERB',
    'q041f': 'income_source_scholarship',
    'q041g': 'income_source_OSAP',
    'q041h': 'income_source_EI',
    'q041i': 'income_source_family_support',
    'q041j': 'income_source_spousal_support',
    'q041k': 'income_source_CCB',
    'q041l': 'income_source_OTB',
    'q041m': 'income_source_CPP',
    'q041n': 'income_source_private_pension',
    'q041o': 'income_source_OAS',
    'q041p': 'income_source_WSIB',
    'q041q': 'income_source_disability',
    'q041r': 'income_source_other_government_programs',
    'q041s': 'income_source_no_income',
    'q041t': 'income_source_prefer_not_to_answer',
    'q041u': 'income_source_other',
    'q042': 'comments',
}
              

In [16]:
# Replacing the column names with those in the dictionary above.
# rename() matches each column by its question code instead of by position, so a
# different column order in the CSV cannot silently mislabel the data;
# errors='raise' fails loudly if a code in the dictionary is missing from NS.
NS = NS.rename(columns=columns, errors='raise')

In [17]:
# Preview the data under the new descriptive column names.
NS.head()

Unnamed: 0,participant_ID,Language_of_Survey,Enough_income,Enough_food,Afford_NBM,skip_meal,frequency_in_year,eat_less,hungry,first_use_years,...,income_source_CPP,income_source_private_pension,income_source_OAS,income_source_WSIB,income_source_disability,income_source_other_government_programs,income_source_no_income,income_source_prefer_not_to_answer,income_source_other,comments
0,ns 757,english,no,often true,often true,yes,almost every month,yes,yes,more than 10 years ago,...,,,,,,,,,,there should be some accountability for places...
1,ns 318,english,no,sometimes true,often true,yes,almost every month,yes,yes,5 to 10 years ago,...,,private pension,,,short/long term disability,,,,,
2,ns 328,english,no,sometimes true,sometimes true,no,,yes,don't know,5 to 10 years ago,...,canadian pension plan (cpp),,,,,,,,,not at the moment
3,ns 646,english,no,often true,often true,yes,some months but not every month,yes,yes,less than 1 year ago,...,,,,,,,,,,no
4,ns 678,english,prefer not to answer,sometimes true,sometimes true,yes,some months but not every month,prefer not to answer,yes,3 to 4 years ago,...,,,old age security (oas),,,,,,,keep up the good work yhat u do


In [18]:
# Summary statistics again, now with the renamed columns.
NS.describe()

Unnamed: 0,participant_ID,Language_of_Survey,Enough_income,Enough_food,Afford_NBM,skip_meal,frequency_in_year,eat_less,hungry,first_use_years,...,income_source_CPP,income_source_private_pension,income_source_OAS,income_source_WSIB,income_source_disability,income_source_other_government_programs,income_source_no_income,income_source_prefer_not_to_answer,income_source_other,comments
count,4055,4054,4036,4026,4027,4040,2869,4018,4019,4037,...,99,41,81,28,32,88,220,192,67,2221
unique,4055,4,3,5,5,4,6,4,4,6,...,1,1,1,1,1,1,1,1,60,658
top,ns 757,english,no,sometimes true,sometimes true,yes,some months but not every month,yes,yes,1 to 2 years ago,...,canadian pension plan (cpp),private pension,old age security (oas),workplace safety and insurance board (wsib),short/long term disability,other government programs,no income,prefer not to answer,self employed,no
freq,1,3387,2874,1931,1907,2902,1247,2965,2508,1148,...,99,41,81,28,32,88,220,192,5,883


In [19]:
# Now, let's count the null values per column (under the new column names):
NS.isna().sum()

participant_ID                                1
Language_of_Survey                            2
Enough_income                                20
Enough_food                                  30
Afford_NBM                                   29
                                           ... 
income_source_other_government_programs    3968
income_source_no_income                    3836
income_source_prefer_not_to_answer         3864
income_source_other                        3989
comments                                   1835
Length: 136, dtype: int64

In [20]:
# For the record(s) without a participant_ID: is every single cell null?
NS[NS['participant_ID'].isna()].isna().all().all()

True

From the above, the record with no participant_ID has no data in any column. Therefore, we can drop the record.

In [21]:
# first, look up the index label of the record without a participant_ID:
NS.index[NS['participant_ID'].isna()]

Int64Index([3955], dtype='int64')

In [22]:
# then we'll drop the row. Instead of hard-coding the index label copied from a
# previous output (which breaks as soon as the input file changes), drop every
# record that is null in all columns:
NS.dropna(how='all', inplace=True)

In [23]:
# Checking null values again, after dropping the empty record:
NS.isna().sum()

participant_ID                                0
Language_of_Survey                            1
Enough_income                                19
Enough_food                                  29
Afford_NBM                                   28
                                           ... 
income_source_other_government_programs    3967
income_source_no_income                    3835
income_source_prefer_not_to_answer         3863
income_source_other                        3988
comments                                   1834
Length: 136, dtype: int64

In [24]:
# Show the record(s) where Language_of_Survey is missing:
NS.loc[NS['Language_of_Survey'].isna()]

Unnamed: 0,participant_ID,Language_of_Survey,Enough_income,Enough_food,Afford_NBM,skip_meal,frequency_in_year,eat_less,hungry,first_use_years,...,income_source_CPP,income_source_private_pension,income_source_OAS,income_source_WSIB,income_source_disability,income_source_other_government_programs,income_source_no_income,income_source_prefer_not_to_answer,income_source_other,comments
3279,ns,,,,,,,,,,...,,,,,,,,,,


We can see that the participant_ID is not null, but what about the other values for that record?

In [25]:
# Checking if there is any data at row 3279.
# 3279 is an index *label*; rows have already been dropped from NS, so positional
# .iloc[3279] is not guaranteed to be the same record. Select the row by label
# with .loc, then skip the first field (participant_ID) by position.
NS.loc[3279].iloc[1:].isna().all()

True

Since the rest of the row has no data, we'll drop the record.

In [26]:
# drop the row: remove every record with no answers at all (all columns after
# the first one, participant_ID, are null) rather than hard-coding an index
# label copied from a previous output.
NS.dropna(how='all', subset=list(NS.columns[1:]), inplace=True)

In [27]:
# Checking null values again, after dropping the record that had only an id:
NS.isna().sum()

participant_ID                                0
Language_of_Survey                            0
Enough_income                                18
Enough_food                                  28
Afford_NBM                                   27
                                           ... 
income_source_other_government_programs    3966
income_source_no_income                    3834
income_source_prefer_not_to_answer         3862
income_source_other                        3987
comments                                   1833
Length: 136, dtype: int64

In [28]:
# (rows, columns) remaining after the cleaning steps above.
NS.shape

(4054, 136)

In [29]:
# Write the cleaned frame to disk as CSV, leaving out the index column.
NS.to_csv(path_or_buf='Cleaned_NS.csv', index=False)

In [31]:
# Checking data: final summary statistics of the cleaned frame.
NS.describe()

Unnamed: 0,participant_ID,Language_of_Survey,Enough_income,Enough_food,Afford_NBM,skip_meal,frequency_in_year,eat_less,hungry,first_use_years,...,income_source_CPP,income_source_private_pension,income_source_OAS,income_source_WSIB,income_source_disability,income_source_other_government_programs,income_source_no_income,income_source_prefer_not_to_answer,income_source_other,comments
count,4054,4054,4036,4026,4027,4040,2869,4018,4019,4037,...,99,41,81,28,32,88,220,192,67,2221
unique,4054,4,3,5,5,4,6,4,4,6,...,1,1,1,1,1,1,1,1,60,658
top,ns 757,english,no,sometimes true,sometimes true,yes,some months but not every month,yes,yes,1 to 2 years ago,...,canadian pension plan (cpp),private pension,old age security (oas),workplace safety and insurance board (wsib),short/long term disability,other government programs,no income,prefer not to answer,self employed,no
freq,1,3387,2874,1931,1907,2902,1247,2965,2508,1148,...,99,41,81,28,32,88,220,192,5,883
