In [119]:
# This converts a fixed-width voter file from Allegheny County (the 2005-02 CD) to the same format
# as the later files.
# See https://drive.google.com/file/d/1x3UZwL7gmPT7jLJ1dBx3lR0CODjdGLyW/view?usp=sharing for format info
import pandas as pd
import numpy as np
import math

# Let wide frames display up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

In [30]:
# Character widths of the fixed-width fields, in file order (see the format
# document linked at the top of the notebook).
fwidths = [1,6,3,9,15,
           12,1,1,3,5,
           1,6,3,2,2,
           4,1,1,7,6,
           4,2,2,1,6,
           25,5,24,12,
           2,2,2,3,2,
           25,4,2,8,3,
           2,2,12,90]

# Placeholder names c0..c43 for the raw fields; the cells below derive the
# meaningful columns from them.
# NOTE(review): there are 44 names but only 43 widths, and c43 shows up empty
# in the displayed rows further down -- confirm the extra name is intentional.
cnames = ['c%d' % col_no for col_no in range(44)]

# Modify these values for each new CD
# date_cd_burned is used for fixing up empty Date_Registered and Date_Last_Changed
file_name = 'voters/2005-02-cd/voter_file_020405.txt'
date_cd_burned = "2/7/2005"

# Read every field as text; later cells slice these strings and strip their
# leading zeros.  The builtin `str` is used because the `np.str` alias has
# been removed from current NumPy releases (it was only ever an alias of `str`).
df = pd.read_fwf(file_name, widths=fwidths, names=cnames, dtype=str)

In [40]:
# Names of the derived columns that end up in the output file.

# Address-related columns
voters_addr_cols = [
    'House__',
    'HouseNoSuffix',
    'StreetNameComplete',
    'City',
    'Zip_Code',
]

# Per-voter columns
p_colnames = [
    'ID_Number',
    'Voter_Status',
    'Political_Party',
    'Date_Last_Changed',
    'Date_Of_Birth',
    'Date_Registered',
    'Vote_History',
]

In [41]:
# ID_Number is the first 9 characters of c3 followed by '-02'.
# I don't know where the -02 comes from, but only 4 entries from the 3 extracts
# from 2017-2018 differ from that pattern, and in each case they're clearly the
# same people as the other versions have for the matching leading characters
# of the ID with -02.
df['ID_Number'] = df['c3'].map(lambda raw_id: raw_id[:9] + "-02")

In [42]:
# House__ is the house number from c9 with its leading zeroes removed.
df['House__'] = df['c9'].map(lambda house_no: house_no.lstrip('0'))

In [43]:
# HouseNoSuffix is c10 with surrounding whitespace removed; the string 'nan'
# (an empty field) becomes None.
df['HouseNoSuffix'] = df['c10'].map(lambda suffix: None if suffix == 'nan' else suffix.strip())

In [44]:
# StreetNameComplete is c25 with surrounding whitespace removed; the string
# 'nan' (an empty field) becomes None.
df['StreetNameComplete'] = df['c25'].map(lambda street: None if street == 'nan' else street.strip())

In [45]:
# City is c27 with surrounding whitespace removed (no 'nan' handling here).
df['City'] = df['c27'].map(lambda city: city.strip())

In [46]:
# Zip_Code is a straight copy of raw column c26 (no trimming or reformatting).
df['Zip_Code'] = df['c26']

In [47]:
# Voter_Status is a straight copy of raw column c23.
df['Voter_Status'] = df['c23']

In [48]:
# Political_Party is a straight copy of raw column c16.
df['Political_Party'] = df['c16']

In [59]:
# Count the rows where 'c19' (proto 'Date_Of_Birth') is malformed, i.e. is not
# exactly six digits.  At least for 2005-02, these were all 'nan'.
# The pattern is a raw string so that \d is not treated as a string escape, and
# na=False makes a missing value count as malformed instead of breaking the mask.
len(df[~df['c19'].str.contains(r'^\d{6}$', na=False)])

1

In [60]:
# Show the rows whose raw DOB field (c19) is not exactly six digits.
# Raw-string pattern avoids the invalid '\d' string escape; na=False treats a
# missing value as malformed instead of breaking the boolean mask.
df[~df['c19'].str.contains(r'^\d{6}$', na=False)]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,Political_Party,Date_Of_Birth,Date_Registered
879,,12805,10,1402317,FREDERICK,VALERIE,,,,918,,111111,125,2,0,0,D,,84,,495,0,1,A,0,FIRST AVE,15108,CORAOPOLIS PA,CORAOPOLIS,14,42,45,225,12,CORNELL,0,63,52,497,2,0,CORAOPOLIS,0000000000000000000000000000000000000000000000...,,01/28/2005,0000000000000000000000000000000000000000000000...,001402317-02,918,,FIRST AVE,CORAOPOLIS PA,15108,A,D,na/n/19,04/01/1995


In [61]:
# Don't find any record of this person later on.  Just set Date_Of_Birth arbitrarily.
# default_DOB is what the Date_Of_Birth cell substitutes when the raw DOB field
# (c19) is 'nan' or empty.
default_DOB = '01/01/1987'

In [62]:
# DOB is 6 chars in Mo-Da-Yr format.  The year is always expanded to 19YY:
# voters are >= 18 yo and this extract is from 2005.
def _dob_from_mmddyy(raw):
    # Missing DOB ('nan' or empty) falls back to default_DOB
    if raw == 'nan' or raw == '':
        return default_DOB
    return "%s/%s/19%s" % (raw[0:2], raw[2:4], raw[4:6])

df['Date_Of_Birth'] = df['c19'].apply(_dob_from_mmddyy)

In [50]:
# 'Date_Registered' is 4 chars in Mo-Yr format; the day is always set to 01.
# Assume it's 20YY if YY<20, otherwise 19YY.
def _reg_date_from_mmyy(raw):
    # Missing value ('nan' or empty) -> None, to be filled in by the cleanup
    # section further down.
    if raw == 'nan' or raw == '':
        return None
    # Compare the year numerically.  The original wrapped a *string* comparison
    # in int() (misplaced parenthesis), which only gave the right answer by
    # accident for two-digit values.
    century = "19" if int(raw[2:4]) >= 20 else "20"
    return "%s/01/%s%s" % (raw[0:2], century, raw[2:4])

df['Date_Registered'] = df['c20'].apply(_reg_date_from_mmyy)

In [51]:
# Count the rows where 'c1' (proto 'Date_Last_Changed') is malformed, i.e. is
# not exactly six digits.  At least for 2005-02, these were all 'nan'.
# Raw-string pattern avoids the invalid '\d' string escape; na=False treats a
# missing value as malformed instead of breaking the boolean mask.
len(df[~df['c1'].str.contains(r'^\d{6}$', na=False)])

30941

In [52]:
# 'Date_Last_Changed' is 6 chars in Mo-Da-Yr format.  Assume it's 20YY if YY<20,
# otherwise 19YY.  A missing value ('nan' or empty) gets the date of the CD
# instead -- see 'date_cd_burned' above, and be sure to change that date when
# processing other CDs.
def _last_changed_from_mmddyy(raw):
    if raw == 'nan' or raw == '':
        return date_cd_burned
    month, day, yy = raw[0:2], raw[2:4], raw[4:6]
    if int(yy) >= 20:
        return "%s/%s/19%s" % (month, day, yy)
    return "%s/%s/20%s" % (month, day, yy)

df['Date_Last_Changed'] = df['c1'].apply(_last_changed_from_mmddyy)

In [53]:
# Make sure it worked: show raw and converted values for the first two rows
# whose raw 'c1' is not exactly six digits.
# Raw-string pattern avoids the invalid '\d' string escape; na=False treats a
# missing value as malformed instead of breaking the boolean mask.
df[~df['c1'].str.contains(r'^\d{6}$', na=False)][['c1','Date_Last_Changed']][0:2]

Unnamed: 0,c1,Date_Last_Changed
2047,,2/7/2005
2049,,2/7/2005


In [54]:
# Rows whose 'c42' (proto 'Vote_History') does not start with the four filler
# zeros followed by at least one 0/1/2 character.  Expected to be empty.
df.loc[~df['c42'].str.contains('^0000[012]+')]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,Political_Party,Date_Of_Birth,Date_Registered


In [200]:
# Rows whose 'c42' (proto 'Vote_History') has a '1' right after the four filler
# zeros and only '0'/'2' characters from there to the end.
df.loc[df['c42'].str.contains('^00001[02]+$')]

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,Political_Party,Date_Of_Birth,Date_Registered,Date_Registered_orig,del_vid,vm_reg_date_bad,vm_reg_date,dob_dr_mismatch
3218,,80901,10,1908830,WILLKOMM,RUTH,E,,105.0,8,,111111,116,0,2,0,D,F,84,80820,155,0,6,A,0,SCENERIDGE AVE,15227,PITTSBURGH PA,BRENTWOOD,18,42,36,218,8,BRENTWOOD,0,33,48,461,0,2,BRENTWOOD,0000122222222222222222222222222222222222222222...,,08/09/2001,1222222222222222222222222222222222222222222200...,001908830-02,8,,SCENERIDGE AVE,PITTSBURGH PA,15227,A,D,08/08/1920,01/01/1955,01/01/1955,False,False,01/01/1955,False
9084,,91100,10,1918216,GULLA,MARY,J,,,231,,111111,171,1,4,0,D,F,84,21621,956,0,8,A,0,SPRING RUN DR,15146,MONROEVILLE PA,MONROEVILLE,18,45,25,207,29,GATEWAY,0,51,199A,681,1,4,MONROEVILLE,0000122222222222222222222222222222222222222222...,,09/11/2000,1222222222222222222222222222222222222222222200...,001918216-02,231,,SPRING RUN DR,MONROEVILLE PA,15146,A,D,02/16/1921,09/01/1956,09/01/1956,False,False,,False
22043,,91100,10,1937688,KASPER,JOSEPH,,J,,803,,111111,222,0,5,0,N,M,84,32625,475,0,9,A,0,EASTMAN ST,15122,WEST MIFFLIN PA,WEST MIFFLIN,14,43,38,214,82,WEST MIFFLIN AREA,0,31,48,889,0,5,WEST MIFFLIN,0000122222222222222222222222222222222222222222...,,09/11/2000,1222222222222222222222222222222222222222222200...,001937688-02,803,,EASTMAN ST,WEST MIFFLIN PA,15122,A,N,03/26/1925,04/01/1975,04/01/1975,False,False,04/01/1975,False
103511,,92702,10,2060020,JACKSON,LESLIE,K,,,5341,,111111,188,10,17,0,D,F,84,120860,279,9,13,A,0,ROSETTA ST,15224,PITTSBURGH PA,PITTSBURGH,14,38,21,231,57,PITTSBURGH DISTRICT-3,700,31,129,753,10,17,PITTSBURGH,0000122222222222222222222222222222222222222222...,,09/27/2002,1222222222222222222222222222222222222222222200...,002060020-02,5341,,ROSETTA ST,PITTSBURGH PA,15224,A,D,12/08/1960,02/01/1979,02/01/1979,False,False,02/01/1979,False
106835,,91100,10,2064742,GOTTRON,MARLENE,M,,,795,,111111,179,6,1,0,N,F,84,30641,1281,0,8,A,0,DICHICCO DR,15137,NORTH VERSAILLES PA,N VERSAILLES,14,45,34,211,19,EAST ALLEGHENY-REGION 5,0,31,199A,713,6,1,N VERSAILLES,0000122222222222222222222222222222222222222222...,,09/11/2000,1222222222222222222222222222222222222222222200...,002064742-02,795,,DICHICCO DR,NORTH VERSAILLES PA,15137,A,N,03/06/1941,12/01/1981,12/01/1981,False,False,12/01/1981,False
176288,,91100,10,2171000,ESPOSITO,JOSEPH,G,,,6601,,111111,188,14,30,0,D,M,84,71115,473,5,11,A,0,LANDVIEW RD,15217,PITTSBURGH PA,PITTSBURGH,14,43,23,235,59,PITTSBURGH DISTRICT-5,1100,22,132,753,14,30,PITTSBURGH,0000122222222222222222222222222222222222222222...,,09/11/2000,1222222222222222222222222222222222222222222200...,002171000-02,6601,,LANDVIEW RD,PITTSBURGH PA,15217,A,D,07/11/1915,04/01/1973,04/01/1973,False,False,04/01/1973,False
181671,,70301,10,2178174,JORDAN,DONALD,,,,10,,111111,188,22,3,0,D,M,84,31549,1079,6,13,A,0,NORTH AVE E,15212,PITTSBURGH PA,PITTSBURGH,14,42,19,240,62,PITTSBURGH DISTRICT-8,2000,11,141,753,22,3,PITTSBURGH,0000122222222222222222222222222222222222222222...,,07/03/2001,1222222222222222222222222222222222222222222200...,002178174-02,10,,NORTH AVE E,PITTSBURGH PA,15212,A,D,03/15/1949,10/01/1979,10/01/1979,False,False,10/01/1979,False
243942,,80801,10,2273505,MILINSKI,ELIZABETH,J,,,340,,111111,104,0,12,0,R,F,84,71125,958,0,6,A,0,JOHN ST,15227,PITTSBURGH PA,BALDWIN BR,14,43,36,218,6,BALDWIN-WHITEHALL,0,32,48,413,0,12,BALDWIN BR,0000122222222222222222222222222222222222222222...,,08/08/2001,1222222222222222222222222222222222222222222200...,002273505-02,340,,JOHN ST,PITTSBURGH PA,15227,A,R,07/11/1925,09/01/1958,09/01/1958,False,False,09/01/1958,False
256968,,61902,10,2292168,MADDEN,RONALD,,,,327,,111111,188,6,3,0,D,M,84,51636,477,7,13,A,0,THIRTY-EIGHTH ST,15201,PITTSBURGH PA,PITTSBURGH,14,38,20,310,56,PITTSBURGH DISTRICT-2,400,11,199A,753,6,3,PITTSBURGH,0000122222222222222222222222222222222222222222...,,06/19/2002,1222222222222222222222222222222222222222222200...,002292168-02,327,,THIRTY-EIGHTH ST,PITTSBURGH PA,15201,A,D,05/16/1936,04/01/1977,04/01/1977,False,False,,False
259284,,83104,10,2295395,MCELROY,PATRICIA,A,,241.0,600,,111111,179,5,2,0,D,F,84,50532,380,0,8,A,0,LINCOLN HWY,15137,NORTH VERSAILLES PA,N VERSAILLES,14,45,35,211,23,EAST ALLEGHENY-REGION 9,0,31,199A,713,5,2,N VERSAILLES,0000122222222222222222222222222222222222222222...,,08/31/2004,1222222222222222222222222222222222222222222200...,002295395-02,600,,LINCOLN HWY,NORTH VERSAILLES PA,15137,A,D,05/05/1932,03/01/1980,03/01/1980,False,False,,False


In [55]:
# Create 'Vote_History' column by stripping away the first 4 filler characters
# from 'c42'.  See 'Voter List Instructions' file in Google Drive Folder
# (https://drive.google.com/drive/folders/1Dp79JX6LUp17gFAabPmJGuY1MP6TpKKk) for details
# Only a leading '0000' is removed: the pattern is anchored with '^'.
df['Vote_History'] = df['c42'].str.replace('^0000','',regex=True)

In [188]:
# Go deal with the "Additional cleanup below" section and come back here when done

In [189]:
# Clean up for saving
# Columns to write out: the per-voter columns followed by the address columns.
save_colnames = p_colnames + voters_addr_cols

In [190]:
# Keep only the columns destined for the output file.
df_out = df.loc[:, save_colnames]

In [191]:
# Write the selected columns as a tab-separated file.  The DataFrame index is
# written as the first column, since index=False is not passed.
df_out_pathname = "voters/2005-02-cd/VOTERS02_07_2005.TXT"
df_out.to_csv(df_out_pathname,sep="\t")

In [None]:
# Additional cleanup below

In [56]:
# Side-by-side preview of raw fields and the columns derived from them (first 5 rows).
df.loc[:, ['c3', 'ID_Number',
           'c9', 'House__',
           'c10', 'HouseNoSuffix',
           'StreetNameComplete', 'City', 'Zip_Code', 'Voter_Status',
           'c19', 'Date_Of_Birth',
           'c20', 'Date_Registered',
           'c1', 'Date_Last_Changed']].head(5)

Unnamed: 0,c3,ID_Number,c9,House__,c10,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,c19,Date_Of_Birth,c20,Date_Registered,c1,Date_Last_Changed
0,1002451,001002451-02,5847,5847,,,MERIDIAN RD,GIBSONIA PA,15044,A,32618,03/26/1918,151,01/01/1951,11205,01/12/2005
1,1015655,001015655-02,203,203,,,HUGEL DR,PITTSBURGH PA,15209,A,12243,01/22/1943,172,01/01/1972,101403,10/14/2003
2,1023473,001023473-02,3111,3111,,,SQUIRES MANOR LN,SOUTH PARK PA,15129,A,11170,01/11/1970,1096,10/01/1996,101204,10/12/2004
3,1024003,001024003-02,1927,1927,,,MURRAY AVE,PITTSBURGH PA,15217,A,21077,02/10/1977,1096,10/01/1996,100504,10/05/2004
4,1024171,001024171-02,106,106,,,LINDEN AVE S,PITTSBURGH PA,15208,A,90453,09/04/1953,1096,10/01/1996,40704,04/07/2004


In [39]:
# First five rows, all columns.
df.head(5)

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History
0,,11205,10,1002451,FAGER,ELISABETH,R,,110.0,5847,,111111,194,0,1,0,R,,84,32618,151,0,2,A,0,MERIDIAN RD,15044,GIBSONIA PA,RICHLAND,4,40,28,246,4,PINE-RICHLAND - REGION 2,1941,42,113,777,0,1,RICHLAND,0000000000000000000000000000000000000000000000...,,01/12/2005,0000000000000000000000000000000000000000000100...
1,,101403,10,1015655,EVANS,RICHARD,W,,,203,,111111,202,1,5,0,D,M,84,12243,172,0,3,A,0,HUGEL DR,15209,PITTSBURGH PA,SHALER,4,40,21,203,70,SHALER AREA - REGION 2,0,52,199A,809,1,5,SHALER,0000000000000000000000000000000000000000000001...,,10/14/2003,0000000000000000000000000000000000000000012100...
2,,101204,10,1023473,WOLFE,CHARLES,R,,,3111,,111111,205,0,9,0,R,M,84,11170,1096,0,6,A,0,SQUIRES MANOR LN,15129,SOUTH PARK PA,SOUTH PARK,18,37,39,216,74,SOUTH PARK,0,33,199A,821,0,9,SOUTH PARK,0000000000000000000000000000000000000000000000...,,10/12/2004,0000000000000000000000000000000000000000002100...
3,,100504,10,1024003,KLEIN,ERICH,M,,26.0,1927,,111111,188,14,34,0,N,M,84,21077,1096,5,11,A,0,MURRAY AVE,15217,PITTSBURGH PA,PITTSBURGH,14,43,23,235,58,PITTSBURGH DISTRICT-4,1100,22,199A,753,14,34,PITTSBURGH,0000000000000000000000000000000000000000000000...,,10/05/2004,0000000000000000000000000000000000000000000100...
4,,40704,10,1024171,MCNULTY,SALLY,B,,,106,,111111,188,14,10,0,D,,84,90453,1096,8,11,A,0,LINDEN AVE S,15208,PITTSBURGH PA,PITTSBURGH,14,43,23,235,55,PITTSBURGH DISTRICT-1,1200,22,132,753,14,10,PITTSBURGH,0000000000000000000000000000000000000000000000...,,04/07/2004,0000000000000000000000000000000000000000001100...


In [57]:
# First five rows that actually have a house-number suffix, raw vs. derived.
df.loc[df['c10'] != 'nan', ['c10', 'HouseNoSuffix']].head(5)

Unnamed: 0,c10,HouseNoSuffix
116,J,J
170,5,5
263,D,D
288,5,5
329,5,5


In [85]:
# Sanity check 'Date_Registered' against 'Date_Of_Birth': flag every row whose
# registration date is earlier than the birth date.  Result should be empty.
dob_dr_mismatch = pd.to_datetime(df['Date_Registered']).lt(pd.to_datetime(df['Date_Of_Birth']))
df.loc[dob_dr_mismatch, ['ID_Number', 'c19', 'Date_Of_Birth', 'c20', 'Date_Registered']]

Unnamed: 0,ID_Number,c19,Date_Of_Birth,c20,Date_Registered
712,001381155-02,060372,06/03/1972,0566,05/01/1966
2139,001907192-02,082546,08/25/1946,0846,08/01/1946
3185,001908780-02,101639,10/16/1939,0822,08/01/1922
8284,001916973-02,012599,01/25/1999,0949,09/01/1949
10702,001920986-02,061725,06/17/1925,0425,04/01/1925
12919,001924580-02,062243,06/22/1943,0634,06/01/1934
13477,001925489-02,041999,04/19/1999,0852,08/01/1952
13734,001925869-02,052628,05/26/1928,0822,08/01/1922
15340,001928263-02,112044,11/20/1944,1037,10/01/1937
15593,001928625-02,031227,03/12/1927,0920,09/01/1920


In [84]:
# NOTE(review): len() of a boolean mask is the total number of rows, not the
# number of mismatches; dob_dr_mismatch.sum() would count the True entries --
# confirm which was intended.
len(dob_dr_mismatch)

876786

In [66]:
# Sanity check: look for the text "na" left over in either formatted date
# column.  Result should be empty.
df.loc[df['Date_Registered'].str.contains("na") | df['Date_Of_Birth'].str.contains("na"),
       ['c19', 'Date_Of_Birth', 'c20', 'Date_Registered']]

Unnamed: 0,c19,Date_Of_Birth,c20,Date_Registered


In [67]:
# Choose which voter_map pickle to load.  Known versions:
#   voter_map_17_18_18.pickle has 'reg_date' as a string
#   voter_map_17_18_18_b.pickle has 'reg_date' converted to datetime
#   voter_map_09_17_18_18_c.pickle has data from 2009-07
#   voter_map_09_17_18_18_d.pickle has census_block info (full for 15213, partial for all, need to do TODO addresses)
#   voter_map_09_17_18_18_e.pickle has census_block info for all
#   voter_map_09_17_18_18_f.pickle has census_block latlon info for all

import pickle
voter_map_file_path = 'voters/voter_map_09_17_18_18_f.pickle'

In [68]:
# Load in voter_map
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this project.
with open(voter_map_file_path, 'rb') as handle:
    voter_map = pickle.load(handle)

In [69]:
# Rows that have no registration date at all.
no_reg_df = df[df['Date_Registered'].isna()]

In [70]:
# Spot-check one voter: format the voter_map registration date as MM/DD/YYYY.
voter_map['020208175-02']['reg_date'].strftime('%m/%d/%Y')

'04/16/1972'

In [71]:
# Show the full voter_map entry for voter 020208175-02.
voter_map['020208175-02']

{'DOB': '4/6/1972',
 'addr_arr': [{'address': '2007 TERMON AVE PITTSBURGH PA 15212',
   'census_block': u'420032701002021',
   'date': datetime.datetime(1972, 4, 16, 0, 0),
   'latlon': <shapely.geometry.point.Point at 0x7f1e82fbd3d0>},
  {'address': '2005 TERMON AVE PITTSBURGH PA 15212',
   'census_block': u'420032701002021',
   'date': datetime.datetime(2017, 11, 30, 0, 0),
   'latlon': <shapely.geometry.point.Point at 0x7f1e82fbd490>}],
 'addresses': {'2009-07': '2007 TERMON AVE PITTSBURGH PA 15212',
  '2017-11': '2005 TERMON AVE PITTSBURGH PA 15212',
  '2018-03': '2005 TERMON AVE PITTSBURGH PA 15212',
  '2018-08': '2005 TERMON AVE PITTSBURGH PA 15212'},
 'reg_date': datetime.datetime(1972, 4, 16, 0, 0),
 'reg_info': {'2009-07': {'party': 'M', 'status': 'A'},
  '2017-11': {'party': 'D', 'status': 'A'},
  '2018-03': {'party': 'D', 'status': 'A'},
  '2018-08': {'party': 'D', 'status': 'A'}}}

In [86]:
# Mismatching rows as their own frame; reset_index() keeps the original row
# label in an 'index' column and renumbers the rows from 0.
df_ddm = df.loc[dob_dr_mismatch].reset_index()

In [125]:
import dateparser

# IDs from df_ddm that have no entry in voter_map
not_in_vid = set()
# Every ID seen in df_ddm; a later cell uses this set to flag rows of df.
# (The assignment had lost its left-hand side and was not valid Python.)
dob_dr_mismatch_set = set()

# Check each entry in dob_dr_mismatch against voter_map.  Columns printed:
#   ID, voter_map DOB, file DOB, MATCH?, voter_map reg date, file reg date, MATCH?
# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
for i in range(0,len(df_ddm)):
    vid = df_ddm['ID_Number'].iloc[i]
    dob_dr_mismatch_set.add(vid)
    if(vid in voter_map):
        s_reg_date = voter_map[vid]['reg_date'].strftime('%m/%d/%Y')
        print("%s: %10s %10s %5s %10s %10s %5s" % (vid,voter_map[vid]['DOB'], df_ddm['Date_Of_Birth'].iloc[i],
                                   'MATCH' if (dateparser.parse(voter_map[vid]['DOB']) == dateparser.parse(df_ddm['Date_Of_Birth'].iloc[i])) else '',
                                   s_reg_date, df_ddm['Date_Registered'].iloc[i],
                                   'MATCH' if (voter_map[vid]['reg_date'] == dateparser.parse(df_ddm['Date_Registered'].iloc[i])) else ''
                                  ))
    else:
        not_in_vid.add(vid)

# Now list the voters that are missing from voter_map, each with its own dates.
# The rows are looked up by ID: the previous version indexed df_ddm with the
# stale loop variable `i`, so every line showed the dates of the last row.
missing_rows = df_ddm[df_ddm['ID_Number'].isin(not_in_vid)]
for i in range(0,len(missing_rows)):
    vid = missing_rows['ID_Number'].iloc[i]
    print("%s: %10s %10s %5s %10s %10s %5s" % (vid,'', missing_rows['Date_Of_Birth'].iloc[i],
                               '',
                               '', missing_rows['Date_Registered'].iloc[i],
                               ''
                              ))


001908780-02: 10/16/1939 10/16/1939 MATCH 08/01/1922 08/01/1922 MATCH
001920986-02: 06/17/1925 06/17/1925 MATCH 04/01/1925 04/01/1925 MATCH
001924580-02:  6/22/1943 06/22/1943 MATCH 06/01/1934 06/01/1934 MATCH
001925869-02: 05/26/1928 05/26/1928 MATCH 08/01/1922 08/01/1922 MATCH
001928263-02: 11/20/1944 11/20/1944 MATCH 10/01/1937 10/01/1937 MATCH
001928625-02: 03/12/1927 03/12/1927 MATCH 09/01/1920 09/01/1920 MATCH
001931915-02:  6/13/1973 06/13/1973 MATCH 07/01/1958 07/01/1958 MATCH
001957693-02: 11/12/1984 11/12/1984 MATCH 10/01/1984 10/01/1984 MATCH
002007374-02:   6/5/1899 06/05/1999       04/01/1999 04/01/1999 MATCH
002024372-02: 10/15/1978 10/15/1978 MATCH 08/01/1944 08/01/1944 MATCH
002032315-02:  3/15/1949 03/15/1949 MATCH 07/01/1939 07/01/1939 MATCH
002035252-02:  8/27/1937 08/27/1937 MATCH 08/01/1932 08/01/1932 MATCH
002047283-02:  3/28/1942 03/28/1942 MATCH 08/01/1923 08/01/1923 MATCH
002061624-02:  8/26/1983 08/26/1983 MATCH 04/01/1980 04/01/1980 MATCH
002107655-02:  9/21/

003047669-02: 12/23/1998 12/23/1998 MATCH 05/01/1996 05/01/1996 MATCH
003049257-02: 11/12/1973 11/12/1996       09/01/1996 09/01/1996 MATCH
003056588-02:   4/1/1979 09/01/1999       02/01/1998 02/01/1998 MATCH
004101054-02: 10/25/1970 10/25/1970 MATCH 10/25/1970 10/01/1970      
004744449-02: 02/26/1978 02/26/1978 MATCH 02/01/1961 02/01/1961 MATCH
004757117-02: 05/10/1980 05/10/1980 MATCH 05/01/1980 05/01/1980 MATCH
011736897-02:  11/1/1980 11/01/1980 MATCH 01/01/1979 01/01/1979 MATCH
013351098-02:   4/6/1973 04/06/1973 MATCH 04/01/1921 04/01/1921 MATCH
013586018-02:   2/3/1972 02/03/1972 MATCH 02/07/2005 09/01/1950      
020006730-02:  5/10/1979 05/10/1979 MATCH 05/10/1979 05/01/1979      
020045391-02: 06/12/1985 06/12/1985 MATCH 01/01/1985 01/01/1985 MATCH
020050543-02: 10/19/1983 10/19/1983 MATCH 10/01/1983 10/01/1983 MATCH
020078476-02:  3/15/1985 03/15/1985 MATCH 02/19/1985 02/01/1985      
020094327-02: 11/17/1968 11/17/1967       11/27/1967 11/01/1967      
020189355-02: 12/14/

In [88]:
# Look up one voter in the mismatch table by ID.
df_ddm[df_ddm['ID_Number']=='013351098-02']

Unnamed: 0,index,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,Political_Party,Date_Of_Birth,Date_Registered
203,781905,,91304,10,13351098,CASS,REBECCA,A,,,17,,111111,149,0,3,0,R,F,84,40673,421,0,12,A,0,POCONO DR,15220,PITTSBURGH PA,GREENTREE,18,42,42,222,36,KEYSTONE OAKS - REGION 3,0,41,16,593,0,3,GREENTREE,0000000000000000000000000000000000000000000000...,,09/13/2004,0000000000000000000000000000000000000000000100...,013351098-02,17,,POCONO DR,PITTSBURGH PA,15220,A,R,04/06/1973,04/01/1921


In [89]:
# Rows at positions 203 and 204 of the mismatch table.
df_ddm[203:205]

Unnamed: 0,index,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,Political_Party,Date_Of_Birth,Date_Registered
203,781905,,91304,10,13351098,CASS,REBECCA,A,,,17,,111111,149,0,3,0,R,F,84,40673,421,0,12,A,0,POCONO DR,15220,PITTSBURGH PA,GREENTREE,18,42,42,222,36,KEYSTONE OAKS - REGION 3,0,41,16,593,0,3,GREENTREE,0000000000000000000000000000000000000000000000...,,09/13/2004,0000000000000000000000000000000000000000000100...,013351098-02,17,,POCONO DR,PITTSBURGH PA,15220,A,R,04/06/1973,04/01/1921
204,782145,,81604,10,13586018,REED,KATHLEEN,A,,,1525,,111111,145,3,3,0,R,,84,20372,950,0,2,A,0,ROBERTSON RD,15237,PITTSBURGH PA,FRANKLIN PK,4,40,28,212,50,NORTH ALLEGHENY,0,11,199A,577,3,3,FRANKLIN PK,0000000000000000000000000000000000000000000000...,,08/16/2004,0000000000000000000000000000000000000000000100...,013586018-02,1525,,ROBERTSON RD,PITTSBURGH PA,15237,A,R,02/03/1972,09/01/1950


In [90]:
# Show the full voter_map entry for voter 013586018-02.
voter_map['013586018-02']

{'DOB': '2/3/1972',
 'addr_arr': [{'address': '1525 ROBERTSON RD PITTSBURGH PA 15237',
   'census_block': u'420034120021000',
   'date': datetime.datetime(1850, 9, 9, 0, 0),
   'latlon': <shapely.geometry.point.Point at 0x7f1f7325bd50>}],
 'addresses': {'2009-07': '1525 ROBERTSON RD PITTSBURGH PA 15237',
  '2017-11': '1525 ROBERTSON RD PITTSBURGH PA 15237',
  '2018-03': '1525 ROBERTSON RD PITTSBURGH PA 15237',
  '2018-08': '1525 ROBERTSON RD PITTSBURGH PA 15237'},
 'reg_date': datetime.datetime(1850, 9, 9, 0, 0),
 'reg_info': {'2009-07': {'party': 'R', 'status': 'A'},
  '2017-11': {'party': 'R', 'status': 'A'},
  '2018-03': {'party': 'R', 'status': 'A'},
  '2018-08': {'party': 'R', 'status': 'A'}}}

In [92]:
# Patch up clearly broken registration date for '013586018-02' to be date_cd_burned
dcb = dateparser.parse(date_cd_burned)
voter_map['013586018-02']['reg_date'] = dcb
voter_map['013586018-02']['addr_arr'][0]['date']=dcb

In [135]:
from datetime import timedelta

# Seventeen years expressed as 17 * 365 days (leap days are not counted).
td_17yr = timedelta(days=17 * 365)
td_17yr

datetime.timedelta(6205)

In [132]:
# Interval between the voter_map DOB and registration date for this voter,
# shown in whole 365-day years.
reg_td = (voter_map['013586018-02']['reg_date']
          - dateparser.parse(voter_map['013586018-02']['DOB']))
reg_td.days // 365

33

In [150]:
# Number of IDs currently in bad_reg_date_set.
# NOTE(review): bad_reg_date_set is built by a cell that appears further down
# in this notebook; on a fresh kernel that cell has to run before this one.
len(bad_reg_date_set)

63

In [151]:
# Look up voter 001908780-02 in the mismatch table.
df_ddm[df_ddm['ID_Number']=='001908780-02']

Unnamed: 0,index,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,Date_Last_Changed,Vote_History,ID_Number,House__,HouseNoSuffix,StreetNameComplete,City,Zip_Code,Voter_Status,Political_Party,Date_Of_Birth,Date_Registered
2,3185,,,10,1908780,RUDMAN,JOSEPH,,,,1260,,111111,114,0,2,0,D,M,84,101639,822,0,8,A,0,BRINTON RD,15221,PITTSBURGH PA,BRADDOCK HL,14,43,34,209,44,WOODLAND HILLS - REG. 4,0,43,121,453,0,2,BRADDOCK HL,0000111111111111111111112111112111111211111111...,,2/7/2005,1111111111111111111121111121111112111111112100...,001908780-02,1260,,BRINTON RD,PITTSBURGH PA,15221,A,D,10/16/1939,08/01/1922


In [154]:
import math

# IDs from df_ddm whose registration date cannot be trusted: either the voter
# is not in voter_map at all, or voter_map's reg_date is not more than 17
# (365-day) years after voter_map's DOB.
bad_reg_date_set = set()

#for i in range(0,len(no_reg_df)):
for i, vid in enumerate(df_ddm['ID_Number']):
    if vid in voter_map:
        dob = dateparser.parse(voter_map[vid]['DOB'])
        reg_td = voter_map[vid]['reg_date'] - dob
        bad_reg_date = not (reg_td > td_17yr)
        print("%s: Have reg_date for %s [%d] = %s (DOB=%s, age=%d)" % (
            'BAD' if bad_reg_date else 'OK ',
            vid, i, voter_map[vid]['reg_date'], voter_map[vid]['DOB'], reg_td.days // 365))
    else:
        bad_reg_date = True
        print("No known reg_date for %s [%d]" % (vid, i))

    if bad_reg_date:
        bad_reg_date_set.add(vid)

    # Voting history detail: position of the first '1' and of the first '2'
    vh_str = df_ddm['c42'].iloc[i]
    vi = vh_str.find('1')
    di = vh_str.find('2')
    if vi == -1 and di == -1:
        # Never voted: report index -1 and fall back to 2004
        min_i = -1
        fv_year = 2004
    else:
        # If only one of the two characters occurs, use its position for both
        if vi == -1:
            vi = di
        elif di == -1:
            di = vi
        min_i = min(vi, di)
        # indices 0-3 are filler and are all zeros
        # index 4-5 are 1983, 6-7 are 1984, etc
        fv_year = math.floor((min_i - 4) // 2) + 1983

    print("\tFirst non-zero vh = %d (%d)" % (min_i, fv_year))


No known reg_date for 001381155-02 [0]
	First non-zero vh = -1 (2004)
No known reg_date for 001907192-02 [1]
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 001908780-02 [2] = 1922-08-01 00:00:00 (DOB=10/16/1939, age=-18)
	First non-zero vh = 4 (1983)
No known reg_date for 001916973-02 [3]
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 001920986-02 [4] = 1925-04-01 00:00:00 (DOB=06/17/1925, age=-1)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 001924580-02 [5] = 1934-06-01 00:00:00 (DOB=6/22/1943, age=-10)
	First non-zero vh = 4 (1983)
No known reg_date for 001925489-02 [6]
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 001925869-02 [7] = 1922-08-01 00:00:00 (DOB=05/26/1928, age=-6)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 001928263-02 [8] = 1937-10-01 00:00:00 (DOB=11/20/1944, age=-8)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 001928625-02 [9] = 1920-09-01 00:00:00 (DOB=03/12/1927, age=-7)
	First non-zero vh = 4 (1983)
BAD: Have reg_date f

BAD: Have reg_date for 002508899-02 [109] = 1944-09-01 00:00:00 (DOB=08/23/1952, age=-8)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 002508908-02 [110] = 1934-09-01 00:00:00 (DOB=2/14/1943, age=-9)
	First non-zero vh = 4 (1983)
No known reg_date for 002509635-02 [111]
	First non-zero vh = 4 (1983)
OK : Have reg_date for 002509728-02 [112] = 1932-02-01 00:00:00 (DOB=10/28/1899, age=32)
	First non-zero vh = 4 (1983)
OK : Have reg_date for 002510346-02 [113] = 1957-09-01 00:00:00 (DOB=08/31/1933, age=24)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 002511740-02 [114] = 1937-03-01 00:00:00 (DOB=07/13/1939, age=-3)
	First non-zero vh = 4 (1983)
No known reg_date for 002517678-02 [115]
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 002525693-02 [116] = 1976-10-01 00:00:00 (DOB=10/26/1976, age=-1)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 002529045-02 [117] = 1979-10-01 00:00:00 (DOB=11/9/1979, age=-1)
	First non-zero vh = 4 (1983)
BAD: Have reg_date for 0025

BAD: Have reg_date for 020193284-02 [218] = 1985-08-13 00:00:00 (DOB=8/3/1985, age=0)
	First non-zero vh = 47 (2004)
BAD: Have reg_date for 020193733-02 [219] = 1965-08-12 00:00:00 (DOB=8/2/1965, age=0)
	First non-zero vh = 47 (2004)
BAD: Have reg_date for 020193967-02 [220] = 1985-09-01 00:00:00 (DOB=09/06/1985, age=-1)
	First non-zero vh = 47 (2004)
No known reg_date for 020194239-02 [221]
	First non-zero vh = 47 (2004)
BAD: Have reg_date for 020199681-02 [222] = 1984-06-19 00:00:00 (DOB=10/3/1984, age=-1)
	First non-zero vh = 47 (2004)
BAD: Have reg_date for 020208175-02 [223] = 1972-04-16 00:00:00 (DOB=4/6/1972, age=0)
	First non-zero vh = 47 (2004)


In [174]:
# Hand-picked IDs: for the first set the registration date comes from voter_map
# rather than from this file; the second set is removed from the data entirely.
use_voter_map_reg_date_set = {
    '002023120-02',
    '002023124-02',
    '013586018-02',
    '002157833-02',
}
del_vid_set = {
    '002965596-02',
    '003018966-02',
    '003035174-02',
}

In [106]:
# Remove bad vids from voter_map and flag them in df
for vid in del_vid_set:
    # pop() with a default is a no-op when the ID is not in voter_map
    voter_map.pop(vid, None)

# Vectorised membership test.  This replaces a row-by-row apply that fetched
# the ID with positional x[0] on a label-indexed row.
df['del_vid'] = df['ID_Number'].isin(del_vid_set)

In [194]:
# Rows currently flagged for deletion.
df.loc[df['del_vid']]

Series([], Name: Vote_History, dtype: object)

In [108]:
# Drop the flagged rows; .copy() gives df its own data rather than a view.
df = df.loc[~df['del_vid']].copy()

In [114]:
# Collect (and print) every voter_map entry whose registration date is still
# earlier than 1/1/1900.
date_1900 = dateparser.parse("1/1/1900")
pre_1900_set = set()

for vid in voter_map:
    if not (voter_map[vid]['reg_date'] < date_1900):
        continue
    print("%s: DOB=%s, reg_date=%r" % (vid, voter_map[vid]['DOB'], voter_map[vid]['reg_date']))
    pre_1900_set.add(vid)
    

013702487-02: DOB=5/14/1943, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013702490-02: DOB=4/30/1944, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013704819-02: DOB=5/24/1944, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013700904-02: DOB=10/16/1944, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
005349657-02: DOB=7/1/1941, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013703987-02: DOB=8/8/1946, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
004153846-02: DOB=8/29/1947, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013577559-02: DOB=8/3/1972, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
005484528-02: DOB=5/11/1969, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
004751175-02: DOB=4/25/1979, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013845748-02: DOB=10/7/1952, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
013701649-02: DOB=11/6/1947, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
011009132-02: DOB=4/29/1949, reg_date=datetime.datetime(1800, 1, 1, 0, 0)
005522958-02: DOB=10/29/1982, reg_date=d

In [115]:
# Flag the rows whose voter_map registration date is pre-1900.
# Vectorised membership test.  This replaces a row-by-row apply that fetched
# the ID with positional x[0] on a label-indexed row.
df['vm_reg_date_bad'] = df['ID_Number'].isin(pre_1900_set)

In [122]:
# Raw and formatted dates for the rows flagged as having a pre-1900 voter_map reg_date.
df.loc[df['vm_reg_date_bad'], ['ID_Number', 'c19', 'Date_Of_Birth', 'c20', 'Date_Registered']]

Unnamed: 0,ID_Number,c19,Date_Of_Birth,c20,Date_Registered
780499,005349657-02,70141,07/01/1941,100.0,01/01/2004
780500,005377644-02,120340,12/03/1940,100.0,01/01/2004
780501,005378289-02,12650,01/26/1950,100.0,01/01/2004
780504,005439320-02,121372,12/13/1972,100.0,01/01/2004
780505,005440076-02,51471,05/14/1971,100.0,01/01/2004
780507,005442143-02,12474,01/24/1974,100.0,01/01/2004
780510,005461141-02,21542,02/15/1942,100.0,01/01/2004
780511,005464892-02,82463,08/24/1963,100.0,01/01/2004
780514,005480278-02,41557,04/15/1957,100.0,01/01/2004
780515,005484528-02,51169,05/11/1969,100.0,01/01/2004


In [183]:
# Show the voter_map entry for one arbitrary member of pre_1900_set (sets are unordered).
voter_map[list(pre_1900_set)[0]]

{'DOB': '4/3/1955',
 'addr_arr': [{'address': '300 WEXFORD BAYNE RD WEXFORD PA 15090',
   'census_block': u'420034090004029',
   'date': datetime.datetime(2004, 1, 1, 0, 0),
   'latlon': <shapely.geometry.point.Point at 0x7f1f0801a690>}],
 'addresses': {'2017-11': '300 WEXFORD BAYNE RD WEXFORD PA 15090',
  '2018-03': '300 WEXFORD BAYNE RD WEXFORD PA 15090',
  '2018-08': '300 WEXFORD BAYNE RD WEXFORD PA 15090'},
 'reg_date': datetime.datetime(2004, 1, 1, 0, 0),
 'reg_info': {'2017-11': {'party': 'R', 'status': 'A'},
  '2018-03': {'party': 'R', 'status': 'A'},
  '2018-08': {'party': 'R', 'status': 'A'}}}

In [182]:
# Patch up pre-1900 registration dates in voter_map to be 01/01/2004.  Both
# the top-level reg_date and the date of the first address record are replaced;
# each patched entry is printed with its old value first.
default_reg_date = dateparser.parse("01/01/2004")

for vid in pre_1900_set:
    if not (voter_map[vid]['reg_date'] < date_1900):
        continue
    print("%s: DOB=%s, reg_date=%r" % (vid, voter_map[vid]['DOB'], voter_map[vid]['reg_date']))
    voter_map[vid].update(reg_date=default_reg_date)
    voter_map[vid]['addr_arr'][0].update(date=default_reg_date)

In [173]:
# Membership check: is this voter in bad_reg_date_set?
'002157833-02' in bad_reg_date_set

False

In [139]:
# Given the value of 'ID_Number', original value of 'Date_Registered', and 'c42' (voting history), return a new value for 'Date_Registered'
def vid_to_reg_date(vid, reg_date_init, vh_str):
    """Pick the registration date string to store for one voter.

    Sources are tried in this order:
      1. reg_date_init, when it is present and vid is not in
         use_voter_map_reg_date_set, pre_1900_set or bad_reg_date_set;
      2. voter_map[vid]['reg_date'] formatted as MM/DD/YYYY, when vid is in
         voter_map and not in pre_1900_set or bad_reg_date_set;
      3. an estimate from the voting history: January 1st of the year of the
         first '1' or '2' in vh_str.

    Parameters:
      vid           -- voter ID, as stored in 'ID_Number'
      reg_date_init -- current 'Date_Registered' value; None, '' and NaN all
                       count as missing
      vh_str        -- raw voting history string ('c42', filler included)

    Returns a date string.
    """
    # NaN is truthy, so a plain truth test would hand a missing value straight
    # back to the caller; NaN is the only value that is not equal to itself.
    has_init = bool(reg_date_init) and reg_date_init == reg_date_init

    if(has_init and vid not in use_voter_map_reg_date_set and
       vid not in pre_1900_set and vid not in bad_reg_date_set):
        return reg_date_init

    # No usable value in the file: fall back to voter_map unless its date is
    # known to be bad
    if(vid in voter_map and vid not in pre_1900_set and vid not in bad_reg_date_set):
        return voter_map[vid]['reg_date'].strftime('%m/%d/%Y')

    # Estimate the registration date from the voting history: positions of the
    # first '1' and first '2', ignoring whichever of them does not occur
    first_votes = [pos for pos in (vh_str.find('1'), vh_str.find('2')) if pos != -1]
    if(not first_votes):
        # Never voted.
        # NOTE(review): this default is later than the 2005 extract being
        # processed (the exploratory loop in this notebook uses 2004 for the
        # same case) -- confirm the intended value.
        return("07/01/2009")

    # indices 0-3 are filler and are all zeros
    # index 4-5 are 1983, 6-7 are 1984, etc
    fv_year = (min(first_votes) - 4) // 2 + 1983
    return("01/01/{:04d}".format(fv_year))

In [110]:
# Clean up
# Snapshot Date_Registered before it is rewritten.  The snapshot is taken only
# once: re-running this cell after Date_Registered has been replaced would
# otherwise overwrite the backup with the already-modified values.
if 'Date_Registered_orig' not in df.columns:
    df['Date_Registered_orig'] = df['Date_Registered']

In [175]:
# Recompute Date_Registered row by row from ID_Number, the current Date_Registered
# and the c42 voting history
df['Date_Registered'] = [
    vid_to_reg_date(row_vid, row_reg_date, row_history)
    for row_vid, row_reg_date, row_history
    in zip(df['ID_Number'], df['Date_Registered'], df['c42'])
]

In [146]:
# voter_map's reg_date as an MM/DD/YYYY string, or None when the ID is not in voter_map
df['vm_reg_date'] = [
    voter_map[row_vid]['reg_date'].strftime('%m/%d/%Y') if row_vid in voter_map else None
    for row_vid in df['ID_Number']
]

In [170]:
# Flag rows whose ID_Number is a member of dob_dr_mismatch_set
df['dob_dr_mismatch'] = df['ID_Number'].isin(dob_dr_mismatch_set)

In [157]:
# Spot check: is this voter ID a member of bad_reg_date_set?
'001908780-02' in bad_reg_date_set

True

In [148]:
# Look up the registration date voter_map currently holds for one voter.
# Note: `vid` stays bound at notebook level and is read again by the next cell.
vid = '001908780-02'
voter_map[vid]['reg_date']

datetime.datetime(1922, 8, 1, 0, 0)

In [149]:
# For the voter selected above (`vid`), compare the gap between date of birth
# and registration date against td_17yr.
dob = dateparser.parse(voter_map[vid]['DOB'])
reg_td=voter_map[vid]['reg_date']-dob  # elapsed time from DOB to registration date
reg_td>td_17yr  # td_17yr is defined outside this cell; presumably a 17-year timedelta -- TODO confirm

False

In [177]:
# Figure out which voter_map entries should have the reg_date replaced with the 
# one we've calculated here
# Rows selected here are flagged DOB/registration mismatches that have a
# voter_map reg_date which differs from the Date_Registered computed above once
# the '/DD/' day component is stripped from both.
vm_reg_month_year = df['vm_reg_date'].str.replace('/\d\d/','')
new_reg_month_year = df['Date_Registered'].str.replace('/\d\d/','')
replace_vm_mask = (
    df['dob_dr_mismatch']
    & pd.notna(df['vm_reg_date'])
    & (vm_reg_month_year != new_reg_month_year)
)
replace_vm_cols = ['ID_Number','c19','Date_Of_Birth','c20',
                   'Date_Registered','Date_Registered_orig','vm_reg_date']
df_replace_vm_dr = df.loc[replace_vm_mask, replace_vm_cols]
df_replace_vm_dr

Unnamed: 0,ID_Number,c19,Date_Of_Birth,c20,Date_Registered,Date_Registered_orig,vm_reg_date
3185,001908780-02,101639,10/16/1939,0822,01/01/1983,08/01/1922,08/01/1922
10702,001920986-02,061725,06/17/1925,0425,01/01/1983,04/01/1925,04/01/1925
12919,001924580-02,062243,06/22/1943,0634,01/01/1983,06/01/1934,06/01/1934
13734,001925869-02,052628,05/26/1928,0822,01/01/1983,08/01/1922,08/01/1922
15340,001928263-02,112044,11/20/1944,1037,01/01/1983,10/01/1937,10/01/1937
15593,001928625-02,031227,03/12/1927,0920,01/01/1983,09/01/1920,09/01/1920
17627,001931915-02,061373,06/13/1973,0758,01/01/1983,07/01/1958,07/01/1958
35677,001957693-02,111284,11/12/1984,1084,01/01/1983,10/01/1984,10/01/1984
79666,002024372-02,101578,10/15/1978,0844,01/01/1983,08/01/1944,08/01/1944
84722,002032315-02,031549,03/15/1949,0739,01/01/1983,07/01/1939,07/01/1939


In [184]:
# Write the recalculated registration date back into voter_map for every row of
# df_replace_vm_dr: both the top-level reg_date and the date of the first
# addr_arr entry are replaced.
for vid, reg_date_str in zip(df_replace_vm_dr['ID_Number'],
                             df_replace_vm_dr['Date_Registered']):
    new_reg_date = dateparser.parse(reg_date_str)
    vm_entry = voter_map[vid]
    vm_entry['reg_date'] = new_reg_date
    vm_entry['addr_arr'][0]['date'] = new_reg_date


In [185]:
# Sorted list of every voter ID (key) in voter_map
all_vids = sorted(voter_map)

In [187]:
# Report every voter_map key whose characters 10-11 (the part after the dash) are not "02"
for vid in voter_map:
    id_prefix, id_suffix = vid[0:9], vid[10:12]
    if id_suffix == "02":
        continue
    print("%s: (%s)-(%s)\n\t%r" % (vid, id_prefix, id_suffix, voter_map[vid]))

In [192]:
# Save voter_map to a pickle file.  Versions written so far:
#   voter_map_17_18_18.pickle has 'reg_date' as a string
#   voter_map_17_18_18_b.pickle has 'reg_date' converted to datetime
#   voter_map_09_17_18_18_c.pickle has data from 2009-07
#   voter_map_09_17_18_18_d.pickle has census_block info (full for 15213, partial for all, need to do TODO addresses)
#   voter_map_09_17_18_18_e.pickle has census_block info for all
#   voter_map_09_17_18_18_f.pickle has census_block latlon info for all
#   voter_map_09_17_18_18_g.pickle has fixups from processing of 2005-02 (delete pre-1900 DOB, deal with Date_Registered < DOB+18yrs)

import pickle
# Output path for this run: the "_g" version described in the list above
voter_map_file_path = 'voters/voter_map_09_17_18_18_g.pickle'


In [193]:
# Serialize voter_map to voter_map_file_path with the highest pickle protocol available
with open(voter_map_file_path, 'wb') as pickle_out:
    pickle.dump(voter_map, pickle_out, pickle.HIGHEST_PROTOCOL)

# Debug

In [207]:
# Show every column of the first row matching one voter ID
is_debug_voter = df['ID_Number'] == '001934894-02'
df[is_debug_voter].iloc[0]

c0                                                                    nan
c1                                                                 020796
c2                                                                    010
c3                                                              001934894
c4                                                                NOWACKI
c5                                                                BARBARA
c6                                                                      A
c7                                                                    nan
c8                                                                    nan
c9                                                                  00261
c10                                                                   nan
c11                                                                111111
c12                                                                   173
c13                                   

In [202]:
%%time
df['DR'] = pd.to_datetime(df['Date_Registered'])

CPU times: user 1min 43s, sys: 0 ns, total: 1min 43s
Wall time: 1min 43s


In [None]:
%%time
df['DOB'] = df['Date_Of_Birth'].map(dateparser.parse)