In [6]:
# This converts a fixed width voter file from Allegheny County, such as the one from 2009-07-cd, to the same format
# as the later files.
# See https://drive.google.com/file/d/1x3UZwL7gmPT7jLJ1dBx3lR0CODjdGLyW/view?usp=sharing for format info
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

In [163]:
fwidths = [1,6,3,9,15,
           12,1,1,3,5,
           1,6,3,2,2,
           4,1,1,7,6,
           4,2,2,1,6,
           25,5,24,12,
           2,2,2,3,2,
           25,4,2,8,3,
           2,2,12,90]
cnames = ['c0','c1','c2','c3','c4',
          'c5','c6','c7','c8','c9',
          'c10','c11','c12','c13','c14',
          'c15','c16','c17','c18','c19',
          'c20','c21','c22','c23','c24',
          'c25','c26','c27','c28','c29',
          'c30','c31','c32','c33','c34',
          'c35','c36','c37','c38','c39',
          'c40','c41','c42','c43'
         ]

# Modify these values for each new CD
# date_cd_burned is used for fixing up empty Date_Registered and Date_Last_Changed
file_name = 'voters/2009-07-cd/ST072209.txt'
date_cd_burned = "7/22/2009"

df = pd.read_fwf(file_name, widths = fwidths,names = cnames,dtype=np.str)

In [164]:
# Process raw data into interesting column names
voters_addr_cols=['House__', 'HouseNoSuffix', 'StreetNameComplete' ,'City','Zip_Code']

p_colnames=['ID_Number','Voter_Status','Political_Party','Date_Last_Changed','Date_Of_Birth','Date_Registered','Vote_History']

In [165]:
# Format ID number as 9 char-'02'.  I don't know where the -02 comes from, but only 4 entries from 
# the 3 extracts from 2017-2018 differ from that pattern, and in each case they're clearly the same people
# as the other versions have for the matching first 10 chars of the ID with -02.
df['ID_Number'] = df['c3'].apply(lambda x: "%s-02"%(x[0:9]))

In [166]:
# Strip any leading zeroes from c9 to generate house # column HOUSE__
df['House__'] = df['c9'].apply(lambda x: x.lstrip('0'))

In [167]:
df['HouseNoSuffix'] = df['c10'].apply(lambda x: x.strip() if x!='nan' else None)

In [168]:
df['StreetNameComplete'] = df['c25'].apply(lambda x: x.strip() if x!='nan' else None)

In [169]:
df['City'] = df['c27'].apply(lambda x: x.strip())

In [170]:
df['Zip_Code'] = df['c26']

In [171]:
df['Voter_Status'] = df['c23']

In [172]:
df['Political_Party'] = df['c16']

In [174]:
# Find all the rows where 'c19' (proto 'Date_Of_Birth') is malformed.  
# At least for 2005-02, these were all 'nan'
len(df[~df['c19'].str.contains('^\d\d\d\d\d\d$')])

0

In [176]:
# DOB is 6 chars in Mo-Da-Yr format.  Can assume year is 19YY since voters are >= 18 yo and this data is from 2009
df['Date_Of_Birth'] = df['c19'].apply(lambda x: "%s/%s/19%s"%(x[0:2],x[2:4],x[4:6]))

In [177]:
# 'Date_Registered' is 4 chars in Mo-Yr format.  Assume it's 20YY if YY<20
df['Date_Registered'] = df['c20'].apply(lambda x: None if x=='nan' else ("%s/01/19%s"%(x[0:2],x[2:4]) if int(x[2:4]>="20") else "%s/01/20%s"%(x[0:2],x[2:4])))

In [175]:
# Find all the rows where 'c1' (proto 'Date_Last_Changed') is malformed.  
# At least for 2005-02, these were all 'nan'
len(df[~df['c1'].str.contains('^\d\d\d\d\d\d$')])

0

In [178]:
# 'Date_Last_Changed' is 6 chars in Mo-Da-Yr format.  Assume it's 20YY if YY<20
df['Date_Last_Changed'] = df['c1'].apply(lambda x: "%s/%s/19%s"%(x[0:2],x[2:4],x[4:6]) if int(x[4:6])>=20 else "%s/%s/20%s"%(x[0:2],x[2:4],x[4:6]))

In [179]:
# Create 'Vote_History' column by stripping away the first 4 filler characters 
# from 'c42'.  See 'Voter List Instructions' file in Google Drive Folder 
# (https://drive.google.com/drive/folders/1Dp79JX6LUp17gFAabPmJGuY1MP6TpKKk) for details
df['Vote_History'] = df['c42'].str.replace('^0000','',regex=True)

In [159]:
# Clean up for saving
save_colnames = p_colnames + voters_addr_cols

In [160]:
df_out = df[save_colnames]

In [162]:
df_out_pathname = "voters/2009-07-cd/VOTERS07_22_2009.TXT"
df_out.to_csv(df_out_pathname,sep="\t")

In [105]:
df[['c3','ID_Number','c9','House__', 'c10','HouseNoSuffix','StreetNameComplete','City','Zip_Code','Voter_Status','c19','Date_Of_Birth','c20','Date_Registered','c1','Date_Last_Changed']][0:5]

Unnamed: 0,c3,ID_Number,c9,House__,c10,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,c19,Date_Of_Birth,c20,Date_Registered,c1,Date_Last_Changed
0,1002451,001002451-02,5847,5847,,,MERIDIAN RD,GIBSONIA PA,15044,A,32618,03/26/1918,151,01/01/1951,11205,01/12/2005
1,1010244,001010244-02,2420,2420,,,MASONIC DR,SEWICKLEY PA,15143,A,50827,05/08/1927,166,01/01/1966,12309,01/23/2009
2,1015333,001015333-02,101,101,,,BRISTOL SQ,PITTSBURGH PA,15238,A,30639,03/06/1939,171,01/01/1971,71307,07/13/2007
3,1015655,001015655-02,203,203,,,HUGEL DR,PITTSBURGH PA,15209,A,12243,01/22/1943,172,01/01/1972,101403,10/14/2003
4,1020958,001020958-02,126,126,,,SYCAMORE ST W,PITTSBURGH PA,15211,A,82278,08/22/1978,896,08/01/1996,20508,02/05/2008


In [20]:
df[0:5]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,ID_Number
0,,11205,10,1002451,FAGER,ELISABETH,R,,110.0,5847,,111111,194,0,1,0,R,U,84,32618,151,0,2,A,0,MERIDIAN RD,15044,GIBSONIA PA,RICHLAND,4,40,28,246,4,PINE-RICHLAND - REGION 2,1941,42,113,777,0,1,RICHLAND,0000000000000000000000000000000000000000000000...,,0010024-51
1,,12309,10,1010244,REID,MARJORIE,M,,,2420,,111111,101,0,1,0,R,U,84,50827,166,0,1,A,0,MASONIC DR,15143,SEWICKLEY PA,ALEPPO,4,37,44,302,67,QUAKER VALLEY - REGION 3,0,63,103,401,0,1,ALEPPO,0000000000000000000000000000000000000000000000...,,0010102-44
2,,71307,10,1015333,CLEVELAND,SANDRA,L,,,101,,111111,182,2,2,0,R,F,84,30639,171,0,3,A,0,BRISTOL SQ,15238,PITTSBURGH PA,OHARA,4,38,30,204,26,FOX CHAPEL AREA-REGION 1,0,52,19,725,2,2,OHARA,0000000000000000000000000000000000000000000000...,,0010153-33
3,,101403,10,1015655,EVANS,RICHARD,W,,,203,,111111,202,1,5,0,D,M,84,12243,172,0,3,A,0,HUGEL DR,15209,PITTSBURGH PA,SHALER,4,40,21,203,70,SHALER AREA - REGION 2,0,52,199A,809,1,5,SHALER,0000000000000000000000000000000000000000000001...,,0010156-55
4,,20508,10,1020958,FULS,KELLY,S,,1.0,126,,111111,188,19,6,0,D,U,84,82278,896,2,12,A,0,SYCAMORE ST W,15211,PITTSBURGH PA,PITTSBURGH,14,42,22,238,60,PITTSBURGH DISTRICT-6,1603,11,137,753,19,6,PITTSBURGH,0000000000000000000000000000000000000000000000...,,0010209-58


In [None]:
df[df['c10']!='nan'][['c10','HouseNoSuffix']][0:5]

In [185]:
# Sanity check 'Date_Registered' and 'Date_Of_Birth'.  Result should be empty
dob_dr_mismatch = pd.to_datetime(df['Date_Registered'])<pd.to_datetime(df['Date_Of_Birth'])
df[dob_dr_mismatch][['ID_Number','c19','Date_Of_Birth','c20','Date_Registered']]

Unnamed: 0,ID_Number,c19,Date_Of_Birth,c20,Date_Registered
2819,001520420-02,082777,08/27/1977,0866,08/01/1966
3432,001549940-02,081453,08/14/1953,0353,03/01/1953
3531,001555962-02,120470,12/04/1970,1270,12/01/1970
3956,001580159-02,062780,06/27/1980,0954,09/01/1954
5671,001908780-02,101639,10/16/1939,0822,08/01/1922
11281,001920986-02,061725,06/17/1925,0425,04/01/1925
12936,001924580-02,062243,06/22/1943,0634,06/01/1934
13529,001925869-02,052628,05/26/1928,0822,08/01/1922
14736,001928263-02,112044,11/20/1944,1037,10/01/1937
14946,001928625-02,031227,03/12/1927,0920,09/01/1920


In [184]:
df[df['Date_Registered'].str.contains("na") | df['Date_Of_Birth'].str.contains("na")][['c19','Date_Of_Birth','c20','Date_Registered']]

Unnamed: 0,c19,Date_Of_Birth,c20,Date_Registered


In [116]:
no_reg_df = df[pd.isna(df['Date_Registered'])]

In [121]:
df[df['c42'].str.contains('^00002',regex=True)]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Date_Of_Birth,Voter_Status,Political_Party,Date_Registered,Date_Last_Changed
4790,,010798,010,001907029,KEARCHER,RICHARD,J,,,00010,,111111,101,00,01,0000,R,M,84,091227,0949,00,01,A,000000,RHODES AVE,15143,SEWICKLEY PA,ALEPPO,04,37,44,302,67,QUAKER VALLEY - REGION 3,0000,63,103,401,00,01,ALEPPO,0000221121212121112112112212212121112111112112...,,001907029-02,10,,RHODES AVE,SEWICKLEY PA,15143,09/12/1927,A,R,09/01/1949,01/07/1998
4808,,011508,010,001907060,HERMES,LAVERNE,M,,1,00125,,111111,102,00,01,0000,R,F,84,030326,0260,00,03,A,000000,1ST ST,15215,PITTSBURGH PA,ASPINWALL,04,38,24,204,28,FOX CHAPEL AREA-REGION 3,0000,42,21,405,00,01,ASPINWALL,0000222111222121112111112121112121122121112121...,,001907060-02,125,,1ST ST,PITTSBURGH PA,15215,03/03/1926,A,R,02/01/1960,01/15/2008
4809,,120586,010,001907061,HUNT,CATHERINE,B,,410,00701,,111111,111,00,01,0000,D,F,84,070517,0856,00,03,A,000000,CENTER AVE,15238,PITTSBURGH PA,BLAWNOX,14,38,32,204,28,FOX CHAPEL AREA-REGION 3,0000,41,22,441,00,01,BLAWNOX,0000212121221111111111111111111111121111111211...,,001907061-02,701,,CENTER AVE,PITTSBURGH PA,15238,07/05/1917,A,D,08/01/1956,12/05/1986
4812,,022499,010,001907065,LIPINSKI,LEO,S,,208,00601,,111111,203,00,01,0000,D,M,84,050234,0856,00,03,A,000000,MAIN ST,15215,PITTSBURGH PA,SHARPSBURG,14,38,21,204,26,FOX CHAPEL AREA-REGION 1,0000,42,15,813,00,01,SHARPSBURG,0000222111211111111121111111111111111111111111...,,001907065-02,601,,MAIN ST,PITTSBURGH PA,15215,05/02/1934,A,D,08/01/1956,02/24/1999
4818,,082390,010,001907078,ERWIN,JOAN,A,,,00107,,111111,102,00,03,0000,R,F,84,011935,0964,00,03,A,000000,LEXINGTON AVE,15215,PITTSBURGH PA,ASPINWALL,04,38,24,204,28,FOX CHAPEL AREA-REGION 3,0000,42,21,405,00,03,ASPINWALL,0000222222222222222222212222212222222221222222...,,001907078-02,107,,LEXINGTON AVE,PITTSBURGH PA,15215,01/19/1935,A,R,09/01/1964,08/23/1990
4823,,030148,010,001907085,PIATKOWSKI,LEO,C,,,00112,,111111,102,00,03,0000,D,M,84,111923,0348,00,03,A,000000,EMERSON AVE,15215,PITTSBURGH PA,ASPINWALL,04,38,24,204,28,FOX CHAPEL AREA-REGION 3,0000,42,21,405,00,03,ASPINWALL,0000211111121122111112111111111112112211111121...,,001907085-02,112,,EMERSON AVE,PITTSBURGH PA,15215,11/19/1923,A,D,03/01/1948,03/01/1948
4824,,121494,010,001907086,RANII,JAMES,F,,,00311,,111111,102,00,03,0000,D,M,84,072124,0366,00,03,A,000000,MAPLE AVE,15215,PITTSBURGH PA,ASPINWALL,04,38,24,204,28,FOX CHAPEL AREA-REGION 3,0000,42,21,405,00,03,ASPINWALL,0000211121111111111112111121111111111111111111...,,001907086-02,311,,MAPLE AVE,PITTSBURGH PA,15215,07/21/1924,A,D,03/01/1966,12/14/1994
4829,,031987,010,001907095,GOBERISH,FRANK,,,,00512,,111111,121,00,02,0000,D,M,84,041628,0953,00,07,A,000000,HILL AVE,15024,CHESWICK PA,CHESWICK,04,45,33,303,01,ALLEGHENY VALLEY,0000,33,48,481,00,02,CHESWICK,0000222122222111222121111211212111211121121121...,,001907095-02,512,,HILL AVE,CHESWICK PA,15024,04/16/1928,A,D,09/01/1953,03/19/1987
4832,,092702,010,001907098,KALSEY,MARY ANNA,,,2,00231,,111111,102,00,01,0000,R,F,84,082912,0952,00,03,A,000000,5TH ST,15215,PITTSBURGH PA,ASPINWALL,04,38,24,204,28,FOX CHAPEL AREA-REGION 3,0000,42,21,405,00,01,ASPINWALL,0000212111221121112111211121122221212221222222...,,001907098-02,231,,5TH ST,PITTSBURGH PA,15215,08/29/1912,A,R,09/01/1952,09/27/2002
4834,,121007,010,001907102,SHOUP,D,E,,,00178,,111111,178,00,03,0000,R,M,84,012543,0768,00,04,I,000000,DARLENE DR,15108,CORAOPOLIS PA,N FAYETTE,18,37,44,317,79,WEST ALLEGHENY REGION 2,0000,43,111,709,00,03,N FAYETTE,0000211121211111111221111222222222211111212222...,,001907102-02,178,,DARLENE DR,CORAOPOLIS PA,15108,01/25/1943,I,R,07/01/1968,12/10/2007


In [126]:
df[~(df['c42'].str.contains('[12]',regex=True))]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Date_Of_Birth,Voter_Status,Political_Party,Date_Registered,Date_Last_Changed
119,,070909,010,001060718,VENDITTI,LINDSAY,M,,404,05435,,111111,188,07,03,0000,R,F,84,082983,1201,08,11,A,000000,CLAYBOURNE ST,15232,PITTSBURGH PA,PITTSBURGH,14,43,21,235,55,PITTSBURGH DISTRICT-1,0503,31,126,753,07,03,PITTSBURGH,0000000000000000000000000000000000000000000000...,,001060718-02,5435,,CLAYBOURNE ST,PITTSBURGH PA,15232,08/29/1983,A,R,12/01/2001,07/09/2009
168,,062209,010,001097153,TUMA,KENDRA,C,,,02359,,111111,188,02,02,0000,D,F,84,050648,0189,06,13,A,000000,RAILROAD ST,15222,PITTSBURGH PA,PITTSBURGH,14,42,20,228,56,PITTSBURGH DISTRICT-2,0100,11,129,753,02,02,PITTSBURGH,0000000000000000000000000000000000000000000000...,,001097153-02,2359,,RAILROAD ST,PITTSBURGH PA,15222,05/06/1948,A,D,01/01/1989,06/22/2009
282,,063009,010,001313643,DERAMO,VINCENT,,,103,00811,,111111,188,28,10,0000,D,M,84,110119,0443,02,12,A,000000,VILLAGE RD,15205,PITTSBURGH PA,PITTSBURGH,14,42,27,243,63,PITTSBURGH DISTRICT-9,2306,12,199A,753,28,10,PITTSBURGH,0000000000000000000000000000000000000000000000...,,001313643-02,811,,VILLAGE RD,PITTSBURGH PA,15205,11/01/1919,A,D,04/01/1943,06/30/2009
363,,063009,010,001319704,BRINZA,DANIEL,N,,,00506,,111111,141,00,03,0000,D,M,84,091978,0299,00,04,A,000000,RED DEER LN,15108,CORAOPOLIS PA,FINDLAY,18,37,44,317,80,WEST ALLEGHENY REGION 3,0000,43,99,561,00,03,FINDLAY,0000000000000000000000000000000000000000000000...,,001319704-02,506,,RED DEER LN,CORAOPOLIS PA,15108,09/19/1978,A,D,02/01/1999,06/30/2009
365,,070909,010,001319841,HOENIG,ANGELA,M,,,05606,,111111,188,31,06,0000,R,F,84,022572,0399,05,11,A,000000,CENTURY AVE,15207,PITTSBURGH PA,PITTSBURGH,14,43,36,236,59,PITTSBURGH DISTRICT-5,1300,33,134,753,31,06,PITTSBURGH,0000000000000000000000000000000000000000000000...,,001319841-02,5606,,CENTURY AVE,PITTSBURGH PA,15207,02/25/1972,A,R,03/01/1999,07/09/2009
383,,062209,010,001320386,DOURLAIN,BRIAN,K,,,00051,,111111,202,01,05,0000,R,M,84,120474,0399,00,03,A,000000,ELIZABETH ST,15209,PITTSBURGH PA,SHALER,04,40,21,203,70,SHALER AREA - REGION 2,0000,52,199A,809,01,05,SHALER,0000000000000000000000000000000000000000000000...,,001320386-02,51,,ELIZABETH ST,PITTSBURGH PA,15209,12/04/1974,A,R,03/01/1999,06/22/2009
409,,050509,010,001320827,PETTI,MICHAEL,R,,,00333,,111111,165,00,03,0000,D,M,84,040481,0499,00,02,A,000000,MARSHALL HEIGHTS DR,15090,WEXFORD PA,MARSHALL,04,40,28,212,50,NORTH ALLEGHENY,0000,31,199A,657,00,03,MARSHALL,0000000000000000000000000000000000000000000000...,,001320827-02,333,,MARSHALL HEIGHTS DR,WEXFORD PA,15090,04/04/1981,A,D,04/01/1999,05/05/2009
607,,071509,010,001326095,MARCHIONDA,CARMEN,J,,,00309,,111111,172,00,05,0000,R,F,84,072676,0500,00,01,A,000000,MORAY DR,15108,CORAOPOLIS PA,MOON,18,37,44,225,39,MOON AREA,0000,33,61,685,00,05,MOON,0000000000000000000000000000000000000000000000...,,001326095-02,309,,MORAY DR,CORAOPOLIS PA,15108,07/26/1976,A,R,05/01/2000,07/15/2009
627,,071309,010,001326490,MURR,SAMUEL,J,,,02500,,111111,222,00,13,0000,M,M,84,101069,0389,00,09,A,000000,JEFFERSON DR,15122,WEST MIFFLIN PA,WEST MIFFLIN,14,43,38,214,82,WEST MIFFLIN AREA,0000,31,48,889,00,13,WEST MIFFLIN,0000000000000000000000000000000000000000000000...,,001326490-02,2500,,JEFFERSON DR,WEST MIFFLIN PA,15122,10/10/1969,A,M,03/01/1989,07/13/2009
809,,063009,010,001330452,WENTZEL,ERIC,M,,,00768,,111111,173,04,01,0000,M,M,84,082879,1200,00,05,A,000000,SHADY DR E,15228,PITTSBURGH PA,MT LEBANON,18,37,42,219,40,MT LEBANON,0000,42,12,689,04,01,MT LEBANON,0000000000000000000000000000000000000000000000...,,001330452-02,768,,SHADY DR E,PITTSBURGH PA,15228,08/28/1979,A,M,12/01/2000,06/30/2009


In [180]:
voter_map['001062509-02']['reg_date'].strftime('%m/%d/%Y')

'02/27/2002'

In [127]:
import math

for i in range(0,len(no_reg_df)):
    vid = df['ID_Number'].iloc[i]
    if(not vid in voter_map):
        print "No known reg_date for %s [%d]" %(vid,i)
    else:
        print "Have reg_date for %s [%d] = %s" %(vid,i,voter_map[vid]['reg_date'])
    # Get voting history detail
    vh_str = df['c42'].iloc[i]        
    # Find first instance of 1 or 2
    vi = vh_str.find('1')
    di = vh_str.find('2')
    # If this person has never voted then both vi and di will be -1.  In that case 
    # set reg_date to 07/01/2009 (quite a few people with no voting record were born in 1991)
    fv_year=None
    if(vi == -1 and di==-1):
        min_i=-1
        fv_year = 2009
    elif(vi==-1):
        vi=di
    elif(di==-1):
        di=vi
        
    if(not fv_year):
        min_i = min(vi,di)
        # indices 0-3 are filler and are all zeros
        # index 4-5 are 1983, 6-7 are 1984, etc
        fv_year = math.floor((min_i-4)/2)+1983
        
    print "\tFirst non-zero vh = %d (%d)" % (min_i, fv_year)


No known reg_date for 001002451-02 [0]
	First non-zero vh = 47 (2004)
No known reg_date for 001010244-02 [1]
	First non-zero vh = 55 (2008)
Have reg_date for 001015333-02 [2] = 1971-01-01 00:00:00
	First non-zero vh = 53 (2007)
Have reg_date for 001015655-02 [3] = 1972-01-01 00:00:00
	First non-zero vh = 45 (2003)
No known reg_date for 001020958-02 [4]
	First non-zero vh = 50 (2006)
Have reg_date for 001022672-02 [5] = 1996-09-24 00:00:00
	First non-zero vh = 52 (2007)
Have reg_date for 001022781-02 [6] = 1996-09-26 00:00:00
	First non-zero vh = 52 (2007)
Have reg_date for 001023431-02 [7] = 1996-10-09 00:00:00
	First non-zero vh = 52 (2007)
No known reg_date for 001023832-02 [8]
	First non-zero vh = 54 (2008)
Have reg_date for 001023977-02 [9] = 1996-10-14 00:00:00
	First non-zero vh = 49 (2005)
No known reg_date for 001024171-02 [10]
	First non-zero vh = 46 (2004)
Have reg_date for 001024395-02 [11] = 1996-10-24 00:00:00
	First non-zero vh = 50 (2006)
Have reg_date for 001024474-02 [

In [147]:
# Given the value of 'ID_Number', original value of 'Date_Registered', and 'c42' (voting history), return a new value for 'Date_Registered'
def vid_to_reg_date(vid, reg_date_init, vh_str):
    # If reg_date_init is non-null, return it
    if(reg_date_init):
        return reg_date_init
    # Don't have it.  Check to see if vid is in voter_map.  If so, return reg_date
    if(vid in voter_map):
        return voter_map[vid]['reg_date'].strftime('%m/%d/%Y')
    
    # Estimate registration date from voting history detail
    # Find first instance of 1 or 2
    vi = vh_str.find('1')
    di = vh_str.find('2')
    # If this person has never voted then both vi and di will be -1.  In that case 
    # set reg_date to 07/01/2009 (quite a few people with no voting record were born in 1991)
    fv_year=None
    if(vi == -1 and di==-1):
        return("07/01/2009")
    elif(vi==-1):
        vi=di
    elif(di==-1):
        di=vi
        
    min_i = min(vi,di)
    # indices 0-3 are filler and are all zeros
    # index 4-5 are 1983, 6-7 are 1984, etc
    fv_year = int(math.floor((min_i-4)/2)+1983)
    return("01/01/{:04d}".format(fv_year))

In [181]:
# Clean up 
df['Date_Registered_orig']=df['Date_Registered']

In [182]:
df['Date_Registered']=df[['ID_Number','Date_Registered','c42']].apply(lambda x: vid_to_reg_date(x[0],x[1],x[2]),axis=1)

In [63]:
import dateparser

date_reg_str = "10/1/2002"
date_reg_dp = dateparser.parse(date_reg_str)


In [114]:
df[df['ID_Number']=='002748431-02']

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Date_Of_Birth,Voter_Status,Political_Party,Date_Registered,Date_Last_Changed
470856,,31892,10,2748431,FELIX,ROBERT,E,J,,3050,,111111,205,0,1,0,D,M,84,22243,944,0,6,A,0,FELIX DR,15129,SOUTH PARK PA,SOUTH PARK,18,37,39,216,74,SOUTH PARK,0,33,59,821,0,1,SOUTH PARK,0000121121211211222222112221212121222121212211...,,002748431-02,3050,,FELIX DR,SOUTH PARK PA,15129,02/22/1943,A,D,09/01/1944,03/18/1992


In [92]:
df[df['Date_Of_Birth']=='02/22/1943']

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Date_Of_Birth,Voter_Status,Political_Party,Date_Registered,Date_Last_Changed
5719,,90166,10,1908895,SIMONI,MARY,C,,,3123,,111111,116,0,3,0,D,F,84,22243,966,0,6,A,0,ELROY AVE,15227,PITTSBURGH PA,BRENTWOOD,18,42,36,218,8,BRENTWOOD,0,33,48,461,0,3,BRENTWOOD,0000111111111121111111111111111111111111111111...,,0019088-95,3123,,ELROY AVE,PITTSBURGH PA,15227,02/22/1943,A,D,09/01/1966,09/01/1966
6528,,30205,10,1910569,LESKOVITZ,MILDRED,,,,112,,111111,224,0,1,0,D,F,84,22243,964,0,8,A,0,DORA ST,15120,HOMESTEAD PA,WHITAKER,14,43,35,214,82,WEST MIFFLIN AREA,0,31,48,897,0,1,WHITAKER,0000121121121111111212112211111111111111111111...,,0019105-69,112,,DORA ST,HOMESTEAD PA,15120,02/22/1943,A,D,09/01/1964,03/02/2005
9094,,82202,10,1916041,MCGRAW,ELLEN,E,,,1426,,111111,215,1,2,0,D,F,84,22243,364,0,5,A,0,DEEP WOOD DR,15241,PITTSBURGH PA,UP ST CLAIR,18,37,40,220,77,UPPER ST CLAIR,0,43,32,861,1,2,UP ST CLAIR,0000221112222221222222212221222122222221222122...,,0019160-41,1426,,DEEP WOOD DR,PITTSBURGH PA,15241,02/22/1943,A,D,03/01/1964,08/22/2002
21160,,72387,10,1938641,GLOVER,JOSEPH,N,,,523,,111111,228,3,6,0,D,M,84,22243,476,0,10,A,0,NORTH AVE,15221,PITTSBURGH PA,WILKINSBURG,14,38,24,210,83,WILKINSBURG,0,11,33,913,3,6,WILKINSBURG,0000111111211111112121212121222111212121112222...,,0019386-41,523,,NORTH AVE,PITTSBURGH PA,15221,02/22/1943,A,D,04/01/1976,07/23/1987
69951,,100400,10,2024250,HELD,DEANNA,,,,81,,111111,104,0,16,0,D,F,84,22243,964,0,6,A,0,WALTON RD,15236,PITTSBURGH PA,BALDWIN BR,14,43,36,218,6,BALDWIN-WHITEHALL,0,32,48,413,0,16,BALDWIN BR,0000111111211221122111111111112111211121121111...,,0020242-50,81,,WALTON RD,PITTSBURGH PA,15236,02/22/1943,A,D,09/01/1964,10/04/2000
76551,,120105,10,2038396,THEINER,BONNIE,P,,26.0,1660,,111111,188,14,4,0,D,F,84,22243,968,8,11,A,0,MURRAY AVE,15217,PITTSBURGH PA,PITTSBURGH,14,43,23,235,58,PITTSBURGH DISTRICT-4,1100,22,127,753,14,4,PITTSBURGH,0000112122222111221221212121212122111111111211...,,0020383-96,1660,,MURRAY AVE,PITTSBURGH PA,15217,02/22/1943,A,D,09/01/1968,12/01/2005
97618,,10606,10,2073971,COLEMAN,W,S,,,222,,111111,134,0,1,0,D,M,84,22243,1084,0,8,A,0,ELM ST,15218,PITTSBURGH PA,EDGEWOOD,14,43,34,208,44,WOODLAND HILLS - REG. 4,0,33,121,533,0,1,EDGEWOOD,0000221122222221222121212221222121212121221121...,,0020739-71,222,,ELM ST,PITTSBURGH PA,15218,02/22/1943,A,D,10/01/1984,01/06/2006
111604,,12309,10,2103270,ALTMAN,JOSEPH,H,,,503,,111111,165,0,1,0,R,M,84,22243,394,0,2,A,0,CAMBERLY CT,16046,MARS PA,MARSHALL,4,40,28,212,50,NORTH ALLEGHENY,0,31,58,657,0,1,MARSHALL,0000000000000000000000000021222122222121222222...,,0021032-70,503,,CAMBERLY CT,MARS PA,16046,02/22/1943,A,R,03/01/1994,01/23/2009
143748,,80990,10,2160136,MCKEE,CAROL,M,,,431,,111111,189,0,10,0,R,F,84,22243,869,0,6,A,0,TORWOOD LN,15236,PITTSBURGH PA,PLEASANT HL,18,37,38,216,81,WEST JEFFERSON HILLS,0,31,199A,757,0,10,PLEASANT HL,0000222222222222222121112121112121111211211111...,,0021601-36,431,,TORWOOD LN,PITTSBURGH PA,15236,02/22/1943,A,R,08/01/1969,08/09/1990
175370,,120999,10,2218331,GILBERT,RONALD,,,,448,,111111,123,4,2,0,D,M,84,22243,393,0,6,A,0,ST CLAIR AVE,15025,CLAIRTON PA,CLAIRTON,14,45,39,309,88,CLAIRTON - REGION 4,0,31,199A,489,4,2,CLAIRTON,0000000000000000000000001112211112111111112112...,,0022183-31,448,,ST CLAIR AVE,CLAIRTON PA,15025,02/22/1943,A,D,03/01/1993,12/09/1999


In [96]:
df.iloc[470856]

c0                                                                  nan
c1                                                               031892
c2                                                                  010
c3                                                            002748431
c4                                                                FELIX
c5                                                               ROBERT
c6                                                                    E
c7                                                                    J
c8                                                                  nan
c9                                                                03050
c10                                                                 nan
c11                                                              111111
c12                                                                 205
c13                                                             

In [109]:
df[(df['c5'].str.contains('RAY')) & (df['c4']=='SIMS')]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Date_Of_Birth,Voter_Status,Political_Party,Date_Registered,Date_Last_Changed
154882,,13002,10,2178167,SIMS,RAYMOND,,,,3632,,111111,188,4,2,0,D,M,84,20860,1079,6,10,A,0,FRAZIER ST,15213,PITTSBURGH PA,PITTSBURGH,14,43,19,227,59,PITTSBURGH DISTRICT-5,300,22,141,753,4,2,PITTSBURGH,0000222222222221222222212222222222222222222222...,,002178167-02,3632,,FRAZIER ST,PITTSBURGH PA,15213,02/08/1960,A,D,10/01/1979,01/30/2002
257443,,72208,10,2365197,SIMS,RAYNELL,L,,,712,,111111,188,10,16,0,D,F,84,91776,1000,9,13,A,0,ATLANTIC AVE N,15224,PITTSBURGH PA,PITTSBURGH,14,38,21,231,57,PITTSBURGH DISTRICT-3,700,31,129,753,10,16,PITTSBURGH,0000000000000000000000000000000000000002222222...,,002365197-02,712,,ATLANTIC AVE N,PITTSBURGH PA,15224,09/17/1976,A,D,10/01/1900,07/22/2008


In [113]:
voter_map['002178167-02']

{'DOB': '2/8/1960',
 'addresses': {'2017-11': '3632 FRAZIER ST PITTSBURGH PA 15213',
  '2018-03': '3632 FRAZIER ST PITTSBURGH PA 15213',
  '2018-08': '3632 FRAZIER ST PITTSBURGH PA 15213'},
 'reg_date': datetime.datetime(1979, 10, 1, 0, 0),
 'reg_info': {'2017-11': {'party': 'D', 'status': 'A'},
  '2018-03': {'party': 'D', 'status': 'A'},
  '2018-08': {'party': 'D', 'status': 'A'}}}

In [64]:
isinstance(date_reg_str, basestring)

True

In [65]:
isinstance(date_reg_dp, basestring)

False

In [75]:
reg = date_reg_str

In [76]:
%%timeit
if(isinstance(reg, basestring)):
    t = dateparser.parse(reg)
else:
    t=reg

1000 loops, best of 3: 1.52 ms per loop


In [71]:
reg = date_reg_dp

In [73]:
%%timeit
if(isinstance(reg, basestring)):
    t = dateparser.parse(reg)
else:
    t=reg

The slowest run took 28.97 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 206 ns per loop


In [89]:
m={'reg':date_reg_str}

In [90]:
%%timeit
if(isinstance(m['reg'], basestring)):
    t = dateparser.parse(m['reg'])
    m['reg'] = t
else:
    t=m['reg']

The slowest run took 15407.61 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 236 ns per loop


In [95]:
all_vids=sorted(voter_map.keys())

In [99]:
for vid in voter_map:
    if(vid[10:12]!="02"):
        print "%s: (%s)-(%s)" % (vid,vid[0:9],vid[10:12])

001518948-10: (001518948)-(10)
021615725-63: (021615725)-(63)
020136060-63: (020136060)-(63)
020653345-25: (020653345)-(25)


In [100]:
for vid in voter_map:
    if(vid[10:12]!="02"):
        print "%s: (%s)-(%s)\n\t%r" % (vid,vid[0:9],vid[10:12],voter_map[vid])

001518948-10: (001518948)-(10)
	{'DOB': '10/31/1955', 'reg_date': '7/15/1980', 'addresses': {'2018-03': '155 DARLINGTON LN SEWICKLEY PA 15143', '2017-11': '155 DARLINGTON LN SEWICKLEY PA 15143'}, 'reg_info': {'2018-03': {'status': 'A', 'party': 'R'}, '2017-11': {'status': 'A', 'party': 'R'}}}
021615725-63: (021615725)-(63)
	{'DOB': '2/16/1989', 'reg_date': '9/28/2008', 'addresses': {'2018-03': '222 BUTTERNUT DR MC DONALD PA 15057', '2017-11': '222 BUTTERNUT DR MC DONALD PA 15057'}, 'reg_info': {'2018-03': {'status': 'I', 'party': 'NF'}, '2017-11': {'status': 'I', 'party': 'NF'}}}
020136060-63: (020136060)-(63)
	{'DOB': '10/6/1985', 'reg_date': '6/14/2004', 'addresses': {'2018-03': '706 WHEATLAND CIR BRIDGEVILLE PA 15017', '2017-11': '706 WHEATLAND CIR BRIDGEVILLE PA 15017'}, 'reg_info': {'2018-03': {'status': 'A', 'party': 'D'}, '2017-11': {'status': 'A', 'party': 'D'}}}
020653345-25: (020653345)-(25)
	{'DOB': '6/6/1987', 'reg_date': '1/19/2006', 'addresses': {'2018-03': '5483 YOUNGRID

In [101]:
prefixes = ['001518948','021615725','020136060','020653345']
for vid in voter_map:
    if(vid[0:9] in prefixes):
        print "%s: (%s)-(%s)\n\t%r" % (vid,vid[0:9],vid[10:12],voter_map[vid])

021615725-02: (021615725)-(02)
	{'DOB': '2/16/1989', 'reg_date': '9/28/2008', 'addresses': {'2018-08': '222 BUTTERNUT DR MC DONALD PA 15057'}, 'reg_info': {'2018-08': {'status': 'I', 'party': 'NF'}}}
001518948-10: (001518948)-(10)
	{'DOB': '10/31/1955', 'reg_date': '7/15/1980', 'addresses': {'2018-03': '155 DARLINGTON LN SEWICKLEY PA 15143', '2017-11': '155 DARLINGTON LN SEWICKLEY PA 15143'}, 'reg_info': {'2018-03': {'status': 'A', 'party': 'R'}, '2017-11': {'status': 'A', 'party': 'R'}}}
021615725-63: (021615725)-(63)
	{'DOB': '2/16/1989', 'reg_date': '9/28/2008', 'addresses': {'2018-03': '222 BUTTERNUT DR MC DONALD PA 15057', '2017-11': '222 BUTTERNUT DR MC DONALD PA 15057'}, 'reg_info': {'2018-03': {'status': 'I', 'party': 'NF'}, '2017-11': {'status': 'I', 'party': 'NF'}}}
020136060-63: (020136060)-(63)
	{'DOB': '10/6/1985', 'reg_date': '6/14/2004', 'addresses': {'2018-03': '706 WHEATLAND CIR BRIDGEVILLE PA 15017', '2017-11': '706 WHEATLAND CIR BRIDGEVILLE PA 15017'}, 'reg_info': {'

In [110]:
# Fixup the entries with non-conforming suffixes by merging with their original -02 identities
prefixes = ['001518948','021615725','020136060','020653345']
del_keys=[]
for vid in voter_map:
    if(vid[0:9] in prefixes and vid[10:12]!="02"):
        orig_vid = "%s-02"%(vid[0:9])
        for month_str in voter_map[vid]['addresses']:
            voter_map[orig_vid]['addresses'][month_str] = voter_map[vid]['addresses'][month_str]
        for month_str in voter_map[vid]['reg_info']:
            voter_map[orig_vid]['reg_info'][month_str] = voter_map[vid]['reg_info'][month_str]
        # Queue duplicate key for removal
        del_keys.append(vid)
        print "%s: %r" % (orig_vid,voter_map[orig_vid])
        

001518948-02: {'DOB': '10/31/1955', 'reg_date': datetime.datetime(1980, 7, 15, 0, 0), 'addresses': {'2018-08': '155 DARLINGTON LN SEWICKLEY PA 15143', '2018-03': '155 DARLINGTON LN SEWICKLEY PA 15143', '2017-11': '155 DARLINGTON LN SEWICKLEY PA 15143'}, 'reg_info': {'2018-08': {'status': 'A', 'party': 'R'}, '2018-03': {'status': 'A', 'party': 'R'}, '2017-11': {'status': 'A', 'party': 'R'}}}
021615725-02: {'DOB': '2/16/1989', 'reg_date': datetime.datetime(2008, 9, 28, 0, 0), 'addresses': {'2018-08': '222 BUTTERNUT DR MC DONALD PA 15057', '2018-03': '222 BUTTERNUT DR MC DONALD PA 15057', '2017-11': '222 BUTTERNUT DR MC DONALD PA 15057'}, 'reg_info': {'2018-08': {'status': 'I', 'party': 'NF'}, '2018-03': {'status': 'I', 'party': 'NF'}, '2017-11': {'status': 'I', 'party': 'NF'}}}
020136060-02: {'DOB': '10/6/1985', 'reg_date': datetime.datetime(2004, 6, 14, 0, 0), 'addresses': {'2018-08': '706 WHEATLAND CIR BRIDGEVILLE PA 15017', '2018-03': '706 WHEATLAND CIR BRIDGEVILLE PA 15017', '2017-11

In [111]:
del_keys

['001518948-10', '021615725-63', '020136060-63', '020653345-25']

In [112]:
for vid in del_keys:
    del voter_map[vid]

In [102]:
# Try to save voter_map originally created by 
import pickle
voter_map_file_path = 'voters/voter_map_17_18_18_b.pickle'

In [152]:
# Save out voter_map
with open(voter_map_file_path, 'wb') as handle:
    pickle.dump(voter_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [103]:
# Load in voter_map
with open(voter_map_file_path, 'rb') as handle:
    voter_map = pickle.load(handle)