In [1]:
import copy
import os
import sys

import pandas as pd
import seaborn as sns

In [2]:
# Make sure the current working directory is on Python's module search path
# so that modules living next to this notebook (e.g. census_methods) can be imported.
# `sys.path` is accessed directly instead of through `os.sys`, which is an
# undocumented implementation detail of the os module.
imp_mods = sys.path
my_pkg = os.getcwd()

if my_pkg not in imp_mods:
    imp_mods.append(my_pkg)
    print('Package imported.')

In [3]:
# import class CensusDataset from a customized module census_methods
from census_methods import CensusDataset as cd

In [4]:
# Read the census data from the csv file.
# pandas is used through its own import rather than via seaborn's private
# `sns.categorical.pd` attribute, which is not part of seaborn's public API.
census_df = pd.read_csv('house_num_and_street_cleaned.csv')

In [5]:
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7581 entries, 0 to 7580
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   House Number                   7581 non-null   int64 
 1   Street                         7581 non-null   object
 2   First Name                     7579 non-null   object
 3   Surname                        7581 non-null   object
 4   Age                            7581 non-null   object
 5   Relationship to Head of House  7581 non-null   object
 6   Marital Status                 5904 non-null   object
 7   Gender                         7581 non-null   object
 8   Occupation                     7581 non-null   object
 9   Infirmity                      7581 non-null   object
 10  Religion                       5864 non-null   object
 11  Address                        7581 non-null   object
dtypes: int64(1), object(11)
memory usage: 710.8+ KB


### Gender

In [6]:
# Work on an independent copy so the original column stays untouched until cleaning is done
gend = census_df['Gender'].copy(deep=True)

In [7]:
print(gend.unique())

['Female' 'Male' 'F' 'M' 'Fe-male' 'MALE' ' ' 'FEMALE']


#### Fix inconsistent gender values

In [8]:
# Map every spelled-out or differently-cased variant onto its single-letter code
converter = {
    **dict.fromkeys(('Female', 'Fe-male', 'FEMALE'), 'F'),
    **dict.fromkeys(('MALE', 'Male'), 'M'),
}
gend = cd.transform_val(gend, converter)

In [9]:
print(gend.unique())

['F' 'M' ' ']


In [10]:
# check for nan values
cd.null_checker(gend)

0

No nan values in the Gender column

In [11]:
# check for blanks
cd.check_for_empty_str(census_df)['Gender']

True

<br><br>Gender column contains blank(s)

#### Fix blank genders

In [12]:
# Pull the full census records of the people whose gender entry is a single blank
blank_gender_idx = gend[gend.eq(' ')].index
blank_gends = census_df.loc[blank_gender_idx]
blank_gends

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
503,32,Griffiths Inlet,Wendy,Mann,34,Wife,Married,,Occupational psychologist,,Christian,"32, Griffiths Inlet"
2418,25,Green Wall,Guy,Moore,31,Son,Single,,"Scientist, research (physical sciences)",,,"25, Green Wall"


Now we can easily tell their genders from their "Relationship to Head of House" column.<br>
***Wendy Mann is a wife - likely a female.<br>
Guy Moore is a son - likely a male.***

In [13]:
# Wendy Mann: set her gender to F (female)
is_wendy_mann = blank_gends['First Name'].eq('Wendy') & blank_gends['Surname'].eq('Mann')
gend.loc[blank_gends[is_wendy_mann].index] = 'F'

In [14]:
# Guy Moore: set his gender to M (male)
is_guy_moore = blank_gends['First Name'].eq('Guy') & blank_gends['Surname'].eq('Moore')
gend.loc[blank_gends[is_guy_moore].index] = 'M'

In [15]:
gend.loc[blank_gends.index]

503     F
2418    M
Name: Gender, dtype: object

In [16]:
print(gend.unique())

['F' 'M']


#### Replace the old "Gender" column in the original census_df dataframe with cleaned series, "gend".

In [17]:
census_df['Gender'] = gend

In [18]:
print(census_df['Gender'].unique())

['F' 'M']


___

### First Name

In [19]:
# Work on an independent copy of the column; it is written back once cleaned
fnames = census_df['First Name'].copy(deep=True)

In [20]:
# check for empty or blank strings
cd.check_for_empty_str(census_df)['First Name']

False

<br><br>No blank/empty string in the First Name column

In [21]:
# check for nan values
cd.null_checker(fnames)

2

<br><br>There are 2 nan values in the First Name column

In [22]:
# The census records whose first name is missing (NaN)
missing_first_name = census_df['First Name'].isna()
null_fname = census_df[missing_first_name]
null_fname

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
81,36,Morgan Avenue,,Jade Alexander,32,Head,Divorced,F,Educational psychologist,,Muslim,"36, Morgan Avenue"
1660,23,Belle Center,,James Young,50,Husband,Married,M,"Education officer, museum",,,"23, Belle Center"


In [23]:
# For each record with a missing first name, collect everyone listed at the same address,
# keyed by that record's row label
household_mems = {
    row_label: census_df.loc[census_df['Address'] == address]
    for row_label, address in null_fname['Address'].items()
}

<br><br>For Jade Alexander

In [24]:
household_mems[81]

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
81,36,Morgan Avenue,,Jade Alexander,32,Head,Divorced,F,Educational psychologist,,Muslim,"36, Morgan Avenue"


I think that Jade Alexander erroneously filled in both her first name and last name in the surname textbox.<br>
Unfortunately, there are no other occupants in her household who could have been used to confirm her surname.<br>
**However, I am convinced that her first name is Jade and her surname is Alexander.** I shall proceed to make these changes.

In [25]:
# The Surname field of row 81 holds "<first> <last>"; split it on whitespace,
# put the first part into the cleaned first-name series and keep the second as surname
fn, ln = null_fname.loc[81, 'Surname'].split()
census_df.loc[81, 'Surname'] = ln
fnames.loc[81] = fn

In [26]:
census_df.loc[81]

House Number                                           36
Street                                      Morgan Avenue
First Name                                            NaN
Surname                                         Alexander
Age                                                    32
Relationship to Head of House                        Head
Marital Status                                   Divorced
Gender                                                  F
Occupation                       Educational psychologist
Infirmity                                            None
Religion                                           Muslim
Address                                 36, Morgan Avenue
Name: 81, dtype: object

<br><br>For James Young:

In [27]:
household_mems[1660]

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
1659,23,Belle Center,Georgia,Young,52,Head,Married,F,Water quality scientist,,,"23, Belle Center"
1660,23,Belle Center,,James Young,50,Husband,Married,M,"Education officer, museum",,,"23, Belle Center"
1661,23,Belle Center,Vanessa,Young,16,Daughter,,F,Student,,,"23, Belle Center"


From James Young's wife and daughter's surnames, I am certain that their surname is Young.<br>
**Thus, I shall assign his First name as James and Surname as Young**

In [28]:
# The Surname field of row 1660 holds "<first> <last>"; split it on whitespace,
# put the first part into the cleaned first-name series and keep the second as surname
fn, ln = null_fname.loc[1660, 'Surname'].split()
census_df.loc[1660, 'Surname'] = ln
fnames.loc[1660] = fn

In [29]:
census_df.loc[1660]

House Number                                            23
Street                                        Belle Center
First Name                                             NaN
Surname                                              Young
Age                                                     50
Relationship to Head of House                      Husband
Marital Status                                     Married
Gender                                                   M
Occupation                       Education officer, museum
Infirmity                                             None
Religion                                              None
Address                                   23, Belle Center
Name: 1660, dtype: object

In [30]:
# Show the full records of people whose first name contains a hyphen
hyphenated_idx = fnames[fnames.str.contains("-")].index
census_df.loc[hyphenated_idx]

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
1415,5,Coconutbelt Lane,Emily-Anne,Williams,47,Head,Married,F,"Engineer, building services",,Methodist,"5, Coconutbelt Lane"
1456,4,North Road,Billy-Joel,Scott,28,Son,Single,M,Unemployed,,,"4, North Road"
5711,27,Calendar Ranch,Billy-Joe,Ali,11,Son,,M,Student,,,"27, Calendar Ranch"


#### Fix in-between spaces

In [31]:
# Show the full records of people whose first name contains a space
spaced_idx = fnames[fnames.str.contains(" ")].index
census_df.loc[spaced_idx]

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
7199,48,Aethelstan Prairie,Emily Anne,Lawrence,62,Head,Single,F,"Education officer, environmental",,,"48, Aethelstan Prairie"


**NOTE:**<br>
For the sake of consistency, I shall replace any in-between spaces with a dash "-"

In [32]:
# Swap the inner space for "-" in the first names that contain one
has_space = fnames.str.contains(" ")
fnames.loc[has_space] = fnames.loc[has_space].str.replace(" ", "-")

In [33]:
fnames.loc[fnames.str.contains("-")]

1415    Emily-Anne
1456    Billy-Joel
5711     Billy-Joe
7199    Emily-Anne
Name: First Name, dtype: object

#### Replace the old "First Name" column in the original census_df dataframe with cleaned series, "fnames".

In [34]:
census_df['First Name'] = fnames

In [35]:
cd.null_checker(census_df['First Name'])

0

___

### Surname

In [36]:
# Work on an independent copy of the column; it is written back once cleaned
lnames = census_df['Surname'].copy(deep=True)

In [37]:
# check for nan values
cd.null_checker(lnames)

0

No nan values in the Surname column<br><br>

In [38]:
# check for empty or blank strings
cd.check_for_empty_str(census_df)['Surname']

True

In [39]:
# check for in-between spaces
lnames.loc[lnames.str.contains(" ")]

5058     
Name: Surname, dtype: object

<br><br>Surname column contains blank(s).<br>
They are:

In [40]:
# Records whose surname is exactly one blank character
surname_is_blank = census_df['Surname'].eq(' ')
blank_lname = census_df[surname_is_blank]
blank_lname

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
5058,114,James Views,Aaron,,34,Husband,Married,M,Data scientist,,,"114, James Views"


In [41]:
# Everyone recorded at the same address as row 5058, to compare their surnames
target_address = blank_lname.loc[5058, 'Address']
census_df[census_df['Address'].eq(target_address)]

Unnamed: 0,House Number,Street,First Name,Surname,Age,Relationship to Head of House,Marital Status,Gender,Occupation,Infirmity,Religion,Address
5057,114,James Views,Andrea,James,33,Head,Married,F,Dealer,,,"114, James Views"
5058,114,James Views,Aaron,,34,Husband,Married,M,Data scientist,,,"114, James Views"
5059,114,James Views,Samantha,James,4,Daughter,,F,Child,,,"114, James Views"


<br><br>It is clear that Aaron lives with his wife (Andrea) and daughter (Samantha), both of whose surname is James.<br>
**Hence, I shall be assigning the same surname, James, to Aaron.**

In [42]:
lnames.loc[5058] = 'James'

In [43]:
lnames.loc[lnames == ' ']

Series([], Name: Surname, dtype: object)

In [44]:
# check for hyphenated surnames (entries containing "-")
lnames.loc[lnames.str.contains("-")]

110            Williamson-Hill
111      Payne-Williamson-Hill
127           Harrison-Simpson
174              Kelly-Griffin
176              Kelly-Griffin
                 ...          
7478           Parker-Turnbull
7479           Parker-Turnbull
7480    Arnold-Parker-Turnbull
7532               Clark-Quinn
7566              Chapman-Cook
Name: Surname, Length: 271, dtype: object

#### Replace the old "Surname" column in the original census_df dataframe with cleaned series, "lnames".

In [45]:
census_df['Surname'] = lnames

In [46]:
# check for empty or blank strings
cd.check_for_empty_str(census_df)['Surname']

False

#### Save dataframe changes to filesystem as csv file

In [47]:
# Write the cleaned dataframe to a new csv file; index=False leaves the row index out of the file
fname = "Name_and_Gender_cleaned.csv"
census_df.to_csv(fname, index=False)

___

**<center> THE END</center>**

___