# Data Cleaning

Rachel Grace Treene | 4/7/23

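This notebook handles missing values in the data based on the type of data in each feature: it normalizes inconsistent county names and replaces placeholder `0` values in text columns with empty strings.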

In [68]:
import pandas as pd
import numpy as np

# Read the 1867 tax records into a DataFrame; the path is resolved relative to
# the notebook's working directory.
INPUT_CSV = "Tax_1867_Cleaned.csv"
data = pd.read_csv(INPUT_CSV)

In [69]:
# Drop 'Unnamed: 0' (presumably a row index saved into the CSV by an earlier export).
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [70]:
# Show the column labels that remain after 'Unnamed: 0' was dropped.
data.columns

Index(['EventDateYear', 'EventImageLink', 'EventLocJurisdictionCounty',
       'EventTitle', 'PersonEventRole', 'PersonGivenNames',
       'PersonNameAlternate', 'PersonNameSuffix',
       'PersonRoleGivenNamesEmployer', 'PersonRoleLocResidence',
       'PersonRoleLocSurnameEmployer', 'PersonSurname',
       'PersonTaxCommissionerRemarks', 'PersonTaxCountCarriageWagon',
       'PersonTaxCountCattle', 'PersonTaxCountClocks', 'PersonTaxCountHogs',
       'PersonTaxCountHorsesMules', 'PersonTaxCountMusicalInstruments',
       'PersonTaxCountNMalesover16', 'PersonTaxCountSheep',
       'PersonTaxCountWMalesover16', 'PersonTaxCountWatches',
       'PersonTaxLeviedLand', 'PersonTaxStateAll', 'PersonTaxTotalCountyValue',
       'PersonTaxValueAggregatePersonlProperty', 'PersonTaxValueCarriageWagon',
       'PersonTaxValueCattle', 'PersonTaxValueClocks',
       'PersonTaxValueFurnishings', 'PersonTaxValueHogs',
       'PersonTaxValueHorsesMules', 'PersonTaxValueJewelry',
       'PersonTaxValue

In [71]:
# Print per-column non-null counts and dtypes for the whole frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12363 entries, 0 to 12362
Data columns (total 50 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   EventDateYear                           12363 non-null  int64  
 1   EventImageLink                          12363 non-null  object 
 2   EventLocJurisdictionCounty              12363 non-null  object 
 3   EventTitle                              12363 non-null  object 
 4   PersonEventRole                         12363 non-null  object 
 5   PersonGivenNames                        12363 non-null  object 
 6   PersonNameAlternate                     12363 non-null  object 
 7   PersonNameSuffix                        12363 non-null  object 
 8   PersonRoleGivenNamesEmployer            12363 non-null  object 
 9   PersonRoleLocResidence                  12363 non-null  object 
 10  PersonRoleLocSurnameEmployer            12363 non-null  ob

### EventLocJurisdictionCounty

In [72]:
# Tally rows per county to spot inconsistent spellings of the same county.
data['EventLocJurisdictionCounty'].value_counts()

Buckingham    3216
Louisa        3017
Orange        2210
Fluvanna      2065
Cumberland    1795
Louisa          60
Name: EventLocJurisdictionCounty, dtype: int64

In [73]:
# Normalize county names by trimming surrounding whitespace. The tallies above
# list 'Louisa' twice because 60 rows carry a trailing space ('Louisa ');
# stripping every value merges those rows and also covers any other padded
# spelling, instead of patching that one literal.
data['EventLocJurisdictionCounty'] = data['EventLocJurisdictionCounty'].str.strip()

In [74]:
# Re-check the county tallies: 'Louisa' should now appear only once.
data['EventLocJurisdictionCounty'].value_counts()

Buckingham    3216
Louisa        3077
Orange        2210
Fluvanna      2065
Cumberland    1795
Name: EventLocJurisdictionCounty, dtype: int64

### Trying to See Links

In [75]:
# Check which source types occur in the records and how often.
data['SourceType'].value_counts()

Government Record    12363
Name: SourceType, dtype: int64

In [76]:
# Year and image link for every record whose source author is 'G W Wright',
# selecting rows and columns in a single .loc call.
data.loc[
    data['SourceAuthorName'].eq('G W Wright'),
    ['EventDateYear', 'EventImageLink'],
]

Unnamed: 0,EventDateYear,EventImageLink
10153,1867,https://onesharedstory.org/HBCP/files/original...
10154,1867,https://onesharedstory.org/HBCP/files/original...
10155,1867,https://onesharedstory.org/HBCP/files/original...
10156,1867,https://onesharedstory.org/HBCP/files/original...
10157,1867,https://onesharedstory.org/HBCP/files/original...
...,...,...
12358,1867,https://onesharedstory.org/HBCP/files/original...
12359,1867,https://onesharedstory.org/HBCP/files/original...
12360,1867,https://onesharedstory.org/HBCP/files/original...
12361,1867,https://onesharedstory.org/HBCP/files/original...


The `EventImageLink` URLs are broken, so the linked source images cannot be inspected from this notebook. Oh well.

### Replacing 0s with "" Where Appropriate

In [119]:
# Count how many alternate-name cells hold the literal string '0'.
data['PersonNameAlternate'].eq('0').sum()

11266

In [120]:
# Text columns in which the cells below blank out "0" / single-space placeholders.
zeros_present = [
    'PersonGivenNames',
    'PersonNameAlternate',
    'PersonNameSuffix',
    'PersonRoleGivenNamesEmployer',
    'PersonRoleLocResidence',
    'PersonRoleLocSurnameEmployer',
    'PersonSurname',
    'PersonTaxCommissionerRemarks',
]

In [131]:
# For each listed text column, turn cells that are exactly "0" or exactly one
# space into the empty string; every other value is left untouched.
for column in zeros_present:
    is_placeholder = data[column].eq('0') | data[column].eq(" ")
    data.loc[is_placeholder, column] = ""

In [142]:
# Confirm that no alternate-name cell still holds the literal string '0'.
data['PersonNameAlternate'].eq('0').sum()

0

In [143]:
# Inspect the remaining alternate-name values; the former '0' placeholders
# are now tallied as empty strings.
data['PersonNameAlternate'].value_counts()

               11266
William          124
George            69
John              39
James             33
               ...  
Robert D           1
Robert A           1
Robert B           1
Elizabeth A        1
Maurice            1
Name: PersonNameAlternate, Length: 298, dtype: int64

### Output

In [145]:
# Display the cleaned frame as a final visual check before export
# (pandas elides the middle rows and columns in the rendered table).
data

Unnamed: 0,EventDateYear,EventImageLink,EventLocJurisdictionCounty,EventTitle,PersonEventRole,PersonGivenNames,PersonNameAlternate,PersonNameSuffix,PersonRoleGivenNamesEmployer,PersonRoleLocResidence,...,SourceAuthorName,SourceCreator,SourceDateYearCreated,SourceLocCity,SourceLocCreatedCounty,SourceLocState,SourceSteward,SourceTitle,SourceType,_id
0,1867,https://onesharedstory.org/HBCP/files/original...,Cumberland,Personal Property Tax Recorded,taxpayer,Joseph L,,,,,...,R B Trent,Cumberland County,1867,Richmond,Cumberland,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9361e59c84387372abcaf
1,1867,https://onesharedstory.org/HBCP/files/original...,Cumberland,Personal Property Tax Recorded,taxpayer,Baldwin T,,,,,...,R B Trent,Cumberland County,1867,Richmond,Cumberland,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9361e59c84387372abcb0
2,1867,https://onesharedstory.org/HBCP/files/original...,Cumberland,Personal Property Tax Recorded,taxpayer,Daniel N,,,,,...,R B Trent,Cumberland County,1867,Richmond,Cumberland,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9361e59c84387372abcb1
3,1867,https://onesharedstory.org/HBCP/files/original...,Cumberland,Personal Property Tax Recorded,taxpayer,Thomas,,Est,,,...,R B Trent,Cumberland County,1867,Richmond,Cumberland,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9361e59c84387372abcb2
4,1867,https://onesharedstory.org/HBCP/files/original...,Cumberland,Personal Property Tax Recorded,taxpayer,Benj J,Benjamin J,,,,...,R B Trent,Cumberland County,1867,Richmond,Cumberland,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9361e59c84387372abcb3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12358,1867,https://onesharedstory.org/HBCP/files/original...,Orange,Personal Property Tax Recorded,resident and taxpayer,Randall,,,William,,...,G W Wright,Orange County,1867,Richmond,Orange,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9362759c84387372aecf5
12359,1867,https://onesharedstory.org/HBCP/files/original...,Orange,Personal Property Tax Recorded,resident and taxpayer,Richard,,,John W,,...,G W Wright,Orange County,1867,Richmond,Orange,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9362759c84387372aecf6
12360,1867,https://onesharedstory.org/HBCP/files/original...,Orange,Personal Property Tax Recorded,resident and taxpayer,Smith Lewis,,,W P,,...,G W Wright,Orange County,1867,Richmond,Orange,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9362759c84387372aecf7
12361,1867,https://onesharedstory.org/HBCP/files/original...,Orange,Personal Property Tax Recorded,resident and taxpayer,Charles,,,R S,,...,G W Wright,Orange County,1867,Richmond,Orange,Virginia,Library of Virginia,County Personal Property Taxes,Government Record,63e9362759c84387372aecf8


In [146]:
# Write the cleaned records to disk. index=False leaves the row index out of the
# file, so reloading it will not produce an extra 'Unnamed: 0' column.
# NOTE(review): the "" placeholders are written as empty fields, which
# pd.read_csv parses as NaN by default — confirm downstream readers expect that.
OUTPUT_CSV = 'Tax_Record_1867.csv'
data.to_csv(OUTPUT_CSV, index=False)