# Cleaning

In [3]:
# Importing necessary modules
import seaborn as sns
import pyspark.sql.functions as F
import pandas as pd
import matplotlib.pyplot as plt
from operator import add
from functools import reduce
import numpy as np
import re
import os
from pyspark.sql.types import StructField, StructType, StringType, LongType, FloatType
from pyspark.sql.functions import *
import random
from pyspark.ml.feature import StandardScaler, VectorAssembler, Imputer, StringIndexer
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RFormula
import time

# Setting up visualization
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [4]:
cols_to_keep = [
    "Voters_Gender", # cat
#     "Voters_Age", # num
    "Voters_BirthDate", # ignore
    "Residence_Families_HHCount", # num
    "Residence_HHGender_Description", # cat
    "Mailing_Families_HHCount", # num
    "Mailing_HHGender_Description", # cat

#   !! voter party affiliation
    "Parties_Description", 
    
    # cat
    "CommercialData_PropertyType",
    "AddressDistricts_Change_Changed_CD",
    "AddressDistricts_Change_Changed_SD",
    "AddressDistricts_Change_Changed_HD",
    "AddressDistricts_Change_Changed_County",
    
    "Residence_Addresses_Density", # num
    
    # cat
    "CommercialData_EstimatedHHIncome",
    "CommercialData_ISPSA",
    # num
    "CommercialData_AreaMedianEducationYears",
    "CommercialData_AreaMedianHousingValue",
#    "CommercialData_MosaicZ4Global",
    # cat
     "CommercialData_AreaPcntHHMarriedCoupleNoChild",  
     "CommercialData_AreaPcntHHMarriedCoupleWithChild",
     "CommercialData_AreaPcntHHSpanishSpeaking",
     "CommercialData_AreaPcntHHWithChildren",
     "CommercialData_StateIncomeDecile",
#    "Ethnic_Description",
    "EthnicGroups_EthnicGroup1Desc",
    "CommercialData_DwellingType",
    "CommercialData_PresenceOfChildrenCode",
#    "CommercialData_PresenceOfPremCredCrdInHome", ## too many missing
    "CommercialData_DonatesToCharityInHome",
    "CommercialData_DwellingUnitSize",
    "CommercialData_ComputerOwnerInHome",
    "CommercialData_DonatesEnvironmentCauseInHome",
    "CommercialData_Education",
    
#   Don't include because of lookahead bias  
#     "Voters_VotingPerformanceEvenYearGeneral",
#     "Voters_VotingPerformanceEvenYearPrimary",
#     "Voters_VotingPerformanceEvenYearGeneralAndPrimary",
#     "Voters_VotingPerformanceMinorElection",
    
#   Other control variables that expect to be highly associated with outcome:
#     "ElectionReturns_P08CountyTurnoutAllRegisteredVoters",
#     "ElectionReturns_P08CountyTurnoutDemocrats",
#     "ElectionReturns_P08CountyTurnoutRepublicans",
    "General_2000",
    "General_2004",
    "PresidentialPrimary_2000",
    "PresidentialPrimary_2004",
        
#   Outcome variable (indiana law happens in 2005, approved by SCOTUS before presidential election in 2008)
    "General_2008"
]

In [5]:
# These are the states that do not have strict voter ID laws:
#  'VM2Uniform--CA--2021-05-02',	VM2Uniform--CA--2021-05-02	CA	x	California
#  'VM2Uniform--IL--2021-03-05',	VM2Uniform--IL--2021-03-05	IL	x	Illinois
#  'VM2Uniform--MA--2021-01-19',	VM2Uniform--MA--2021-01-19	MA	x	Massachusetts
#  'VM2Uniform--MD--2021-02-15',	VM2Uniform--MD--2021-02-15	MD	x	Maryland
#  'VM2Uniform--ME--2021-05-28',	VM2Uniform--ME--2021-05-28	ME	x	Maine
#  'VM2Uniform--MN--2021-02-14',	VM2Uniform--MN--2021-02-14	MN	x	Minnesota
#  'VM2Uniform--NC--2021-05-18',	VM2Uniform--NC--2021-05-18	NC	x	North Carolina
#  'VM2Uniform--NE--2021-01-20',	VM2Uniform--NE--2021-01-20	NE	x	Nebraska
#  'VM2Uniform--NJ--2021-03-11',	VM2Uniform--NJ--2021-03-11	NJ	x	New Jersey
#  'VM2Uniform--NM--2021-02-25',	VM2Uniform--NM--2021-02-25	NM	x	New Mexico
#  'VM2Uniform--NV--2021-06-13',	VM2Uniform--NV--2021-06-13	NV	x	Nevada
#  'VM2Uniform--NY--2021-03-15',	VM2Uniform--NY--2021-03-15	NY	x	New York
#  'VM2Uniform--OR--2021-02-05',	VM2Uniform--OR--2021-02-05	OR	x	Oregon
#  'VM2Uniform--PA--2021-05-20',	VM2Uniform--PA--2021-05-20	PA	x	Pennsylvania
#  'VM2Uniform--VT--2021-05-28',	VM2Uniform--VT--2021-05-28	VT	x	Vermont

# For each of these states, I want to pull enough samples to get a total sample of 1/2 M; can increase later

# grab files
states =  [
# For now, just exclude New York and Califonria, because the parquet files take too long to read
'VM2Uniform--VT--2021-05-28', 
'VM2Uniform--IL--2021-03-05',
'VM2Uniform--MA--2021-01-19',
'VM2Uniform--MD--2021-02-15',
'VM2Uniform--ME--2021-05-28',
'VM2Uniform--MN--2021-02-14',
'VM2Uniform--NC--2021-05-18',
'VM2Uniform--NE--2021-01-20',
'VM2Uniform--NJ--2021-03-11',
'VM2Uniform--NM--2021-02-25',
'VM2Uniform--NV--2021-06-13',
'VM2Uniform--OR--2021-02-05',
'VM2Uniform--PA--2021-05-20',
'VM2Uniform--CA--2021-05-02',
'VM2Uniform--NY--2021-03-15',
]

# bucket file path for all state parquet files
gcs_path = 'gs://pstat135-voter-file/VM2Uniform'

# create list of state abbreviations
pattern = re.compile(r"(?<=--)[A-Z]{2}")
state_abvs = re.findall(pattern, ''.join(states))

# do first iteration
print('VM2Uniform--VT--2021-05-28')

# num_per_state = 500

df_ref = spark.read.parquet("/".join([gcs_path, 'VM2Uniform--VT--2021-05-28']))
df_ref = df_ref.select(cols_to_keep)

numrows = {'VM2Uniform--VT--2021-05-28': df_ref.count()}

print("%d" % (numrows['VM2Uniform--VT--2021-05-28']))
    
# percentage_sample = num_per_state / numrows['VM2Uniform--VT--2021-05-28']
    
# df_ref = df_ref.sample(True, percentage_sample, seed = 19480384)
df_ref = df_ref.withColumn('STATE', F.lit(state_abvs[0]))
 
next_states = states[1:]

# do the rest of the iterations
for i, one_state in enumerate(next_states):

    print("%s: " % (one_state), end="")
    
    # read dataframe for one_state
    tmp_ref = spark.read.parquet("/".join([gcs_path, one_state]))
    tmp_ref = tmp_ref.select(cols_to_keep)
    numrows[one_state] = tmp_ref.count()
    print("%d" % (numrows[one_state]))
    
#     percentage_sample = num_per_state / numrows[one_state]
    
#     tmp_ref = tmp_ref.sample(True, percentage_sample, seed = 19480384)
    tmp_ref = tmp_ref.withColumn('STATE', F.lit(state_abvs[i+1]))
    
    df_ref = df_ref.union(tmp_ref)      

df_ref.printSchema()
df_ref.count()

VM2Uniform--VT--2021-05-28


                                                                                

463261
VM2Uniform--IL--2021-03-05: 

                                                                                

8336875
VM2Uniform--MA--2021-01-19: 

                                                                                

4572639
VM2Uniform--MD--2021-02-15: 

                                                                                

4110570
VM2Uniform--ME--2021-05-28: 

                                                                                

1040452
VM2Uniform--MN--2021-02-14: 

                                                                                

3563623
VM2Uniform--NC--2021-05-18: 

                                                                                

6616158
VM2Uniform--NE--2021-01-20: 

                                                                                

1201308
VM2Uniform--NJ--2021-03-11: 

                                                                                

6109844
VM2Uniform--NM--2021-02-25: 

                                                                                

1205712
VM2Uniform--NV--2021-06-13: 

                                                                                

1866442
VM2Uniform--OR--2021-02-05: 

                                                                                

3166785
VM2Uniform--PA--2021-05-20: 

                                                                                

8359764
VM2Uniform--CA--2021-05-02: 

                                                                                

21779518
VM2Uniform--NY--2021-03-15: 

                                                                                

12570650
root
 |-- Voters_Gender: string (nullable = true)
 |-- Voters_BirthDate: string (nullable = true)
 |-- Residence_Families_HHCount: string (nullable = true)
 |-- Residence_HHGender_Description: string (nullable = true)
 |-- Mailing_Families_HHCount: string (nullable = true)
 |-- Mailing_HHGender_Description: string (nullable = true)
 |-- Parties_Description: string (nullable = true)
 |-- CommercialData_PropertyType: string (nullable = true)
 |-- AddressDistricts_Change_Changed_CD: string (nullable = true)
 |-- AddressDistricts_Change_Changed_SD: string (nullable = true)
 |-- AddressDistricts_Change_Changed_HD: string (nullable = true)
 |-- AddressDistricts_Change_Changed_County: string (nullable = true)
 |-- Residence_Addresses_Density: string (nullable = true)
 |-- CommercialData_EstimatedHHIncome: string (nullable = true)
 |-- CommercialData_ISPSA: string (nullable = true)
 |-- CommercialData_AreaMedianEducationYears: string (nullable = true)
 |-- CommercialData_AreaMedianHou

                                                                                

84963601

### FUNCTIONS TO CLEAN DATASET

In [6]:
def clean_numeric_categorical(input_df: DataFrame) -> DataFrame:
    
    # remove special symbols ($, %) from relevant columns
    input_df = input_df.withColumn(
        "CommercialData_AreaMedianHousingValue",
        F.expr("substring(CommercialData_AreaMedianHousingValue, 2, length(CommercialData_AreaMedianHousingValue))"))

    pct = ["CommercialData_AreaPcntHHMarriedCoupleNoChild",  
           "CommercialData_AreaPcntHHMarriedCoupleWithChild",
           "CommercialData_AreaPcntHHSpanishSpeaking",
           "CommercialData_AreaPcntHHWithChildren"]

    for c in pct:
        input_df = input_df.withColumn(
            c,
            F.expr(f"substring({c}, 1, length({c})-1)")
        )
    input_df.select(["CommercialData_AreaMedianHousingValue"]+pct).show()

    numeric_cols = [
        'Residence_Families_HHCount',
        'Mailing_Families_HHCount',
        'Residence_Addresses_Density',
        "CommercialData_AreaMedianEducationYears",
        "CommercialData_AreaMedianHousingValue"
    ] + pct

    trinary_cols = [
        'CommercialData_DonatesToCharityInHome',
        'CommercialData_ComputerOwnerInHome',
        'CommercialData_DonatesEnvironmentCauseInHome'
    ]

    binary_cols = []

    dont_touch_cols = [
        "General_2008", 
        "Voters_BirthDate", 
        "General_2000",
        "General_2004",
        "PresidentialPrimary_2000",
        "PresidentialPrimary_2004"
    ]
    
    other_cols = [c for c in input_df.columns if c not in dont_touch_cols]
    other_cols = [c for c in other_cols if c not in (numeric_cols + trinary_cols + binary_cols)]

    categorical_cols = other_cols + binary_cols + trinary_cols

    for c in numeric_cols:
        input_df = input_df.withColumn(c, F.col(c).cast("float").alias(c))
    input_df = input_df.fillna("U", subset= trinary_cols)
    input_df = input_df.fillna("Missing", subset = other_cols)
#   input_df = input_df.fillna("N", subset = binary_cols)
    
    return input_df

In [7]:
df_ref = clean_numeric_categorical(df_ref)
df_ref.printSchema()

                                                                                

+-------------------------------------+---------------------------------------------+-----------------------------------------------+----------------------------------------+-------------------------------------+
|CommercialData_AreaMedianHousingValue|CommercialData_AreaPcntHHMarriedCoupleNoChild|CommercialData_AreaPcntHHMarriedCoupleWithChild|CommercialData_AreaPcntHHSpanishSpeaking|CommercialData_AreaPcntHHWithChildren|
+-------------------------------------+---------------------------------------------+-----------------------------------------------+----------------------------------------+-------------------------------------+
|                               223238|                                           39|                                             20|                                       1|                                   29|
|                               223238|                                           39|                                             20|               

In [8]:
def impute_values_function(input_df: DataFrame) -> DataFrame:

    # Create copy of working df
    input_df = input_df.alias('input_df')
    
    pct = ["CommercialData_AreaPcntHHMarriedCoupleNoChild",  
           "CommercialData_AreaPcntHHMarriedCoupleWithChild",
           "CommercialData_AreaPcntHHSpanishSpeaking",
           "CommercialData_AreaPcntHHWithChildren"]
    
    numeric_cols = [
        'Residence_Families_HHCount',
        'Mailing_Families_HHCount',
        'Residence_Addresses_Density',
        "CommercialData_AreaMedianEducationYears",
        "CommercialData_AreaMedianHousingValue"
    ] + pct

    trinary_cols = [
        'CommercialData_DonatesToCharityInHome',
        'CommercialData_ComputerOwnerInHome',
        'CommercialData_DonatesEnvironmentCauseInHome'
    ]

    binary_cols = []

    dont_touch_cols = [
        "General_2008", 
        "Voters_BirthDate", 
        "General_2000",
        "General_2004",
        "PresidentialPrimary_2000",
        "PresidentialPrimary_2004"
    ]
    
    other_cols = [c for c in input_df.columns if c not in dont_touch_cols]
    other_cols = [c for c in other_cols if c not in (numeric_cols + trinary_cols + binary_cols)]

    categorical_cols = other_cols + binary_cols + trinary_cols

    # Impute the missing values in the numerical columns with the mean -- minimize change to z-scores of given data
    imputer = Imputer(
        inputCols=numeric_cols, 
        outputCols=["{}_imp".format(c) for c in numeric_cols]
    )

    input_df = imputer.fit(input_df).transform(input_df)

    # Impute categorical columns -- maybe it's better to drop these records
    # input_df = input_df.fillna("missing", subset = categorical_cols)

    indexed_cols = [f"{c}_ind" for c in categorical_cols]

    # Ecode categorical variables
    indexer = StringIndexer(inputCols = categorical_cols, outputCols = indexed_cols)
    input_df = indexer.fit(input_df).transform(input_df)
    
    return input_df

In [9]:
df_ref = impute_values_function(df_ref)
df_ref.printSchema()

23/03/18 21:52:06 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

root
 |-- Voters_Gender: string (nullable = false)
 |-- Voters_BirthDate: string (nullable = true)
 |-- Residence_Families_HHCount: float (nullable = true)
 |-- Residence_HHGender_Description: string (nullable = false)
 |-- Mailing_Families_HHCount: float (nullable = true)
 |-- Mailing_HHGender_Description: string (nullable = false)
 |-- Parties_Description: string (nullable = false)
 |-- CommercialData_PropertyType: string (nullable = false)
 |-- AddressDistricts_Change_Changed_CD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_SD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_HD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_County: string (nullable = false)
 |-- Residence_Addresses_Density: float (nullable = true)
 |-- CommercialData_EstimatedHHIncome: string (nullable = false)
 |-- CommercialData_ISPSA: string (nullable = false)
 |-- CommercialData_AreaMedianEducationYears: float (nullable = true)
 |-- CommercialData_AreaMedianHousi

In [10]:
df_ref.select("Voters_BirthDate").show(10)

+----------------+
|Voters_BirthDate|
+----------------+
|      12/06/1957|
|      12/13/1953|
|      03/24/1977|
|      08/30/1976|
|      01/01/1986|
|      01/01/1963|
|      01/01/1998|
|      01/01/1997|
|      12/28/1963|
|      11/03/1936|
+----------------+
only showing top 10 rows



In [11]:
def clean_voter_participation(input_df: DataFrame) -> DataFrame:

    yrs_add = 18
    months_add = 18*12

    # date of national 
    target_month_day_presidential = "11-03"

    # date of presidential primary (ideally we should do this state by state, but this is the date for Indiana's)
    target_month_day_primary = "05-03" 

    input_df = input_df.withColumn("DATE_18", add_months(to_date(col("Voters_BirthDate"),"MM/dd/yyyy"), months_add))
    input_df.select(["Voters_BirthDate", "DATE_18"]).show(10)
    input_df = input_df.dropna(subset = "Voters_BirthDate")
    input_df = input_df.withColumn("YEAR_18", year("DATE_18"))
    input_df = input_df.withColumn("comparator_date_presidential", to_date(concat(col("YEAR_18"), lit("-"), lit(target_month_day_presidential))))
    input_df = input_df.withColumn("comparator_date_primary", to_date(concat(col("YEAR_18"), lit("-"), lit(target_month_day_primary))))

    for election in ["PRESIDENTIAL", "PRIMARY"]:
        input_df = input_df.withColumn(f"YEAR_ELIGIBLE_TO_VOTE_{election}", \
                                    when(col("DATE_18")<=col(f"comparator_date_{election.lower()}"), col("YEAR_18")) \
                                   .otherwise(col("YEAR_18") + 1) \
                                  )

    # check no missing vals:
    input_df.where(col("YEAR_18").isNull()).select("YEAR_18").show(10)

    # get rid of rows where the voter was not old enough to vote in the 2008 general election
    input_df = input_df.filter(col("YEAR_ELIGIBLE_TO_VOTE_PRESIDENTIAL")<=2008).fillna("N", subset = ["General_2008"])

    # for the 2000 and 2004 general elections, replace with "N" IF the person was old enough to vote at the time

    for election in ["2000", "2004"]:
        input_df = input_df.withColumn(f"General_{election}", \
                                   when((col("YEAR_ELIGIBLE_TO_VOTE_PRESIDENTIAL")<= int(election)) & \
                                        (col(f"General_{election}").isNull()), "N") \
                                   .otherwise(col(f"General_{election}")) \
                                  )

        input_df = input_df.withColumn(f"PresidentialPrimary_{election}", \
                                   when((col("YEAR_ELIGIBLE_TO_VOTE_PRIMARY")<= int(election)) & \
                                        (col(f"PresidentialPrimary_{election}").isNull()), "N") \
                                   .otherwise(col(f"PresidentialPrimary_{election}")) \
                                  )

    # make the general voting for 2008 a numeric variable; since we've deleted
    # everyone who was not eligible to vote, this can be directly calculated with a 1-0.
    input_df = input_df.withColumn("Voted_General_2008", when(input_df.General_2008 == "Y",1).otherwise(0))
    input_df = input_df.drop("General_2008")
    
    return input_df

In [12]:
df_ref = clean_voter_participation(df_ref)
df_ref.printSchema()

+----------------+----------+
|Voters_BirthDate|   DATE_18|
+----------------+----------+
|      12/06/1957|1975-12-06|
|      12/13/1953|1971-12-13|
|      03/24/1977|1995-03-24|
|      08/30/1976|1994-08-30|
|      01/01/1986|2004-01-01|
|      01/01/1963|1981-01-01|
|      01/01/1998|2016-01-01|
|      01/01/1997|2015-01-01|
|      12/28/1963|1981-12-28|
|      11/03/1936|1954-11-03|
+----------------+----------+
only showing top 10 rows



                                                                                

+-------+
|YEAR_18|
+-------+
+-------+

root
 |-- Voters_Gender: string (nullable = false)
 |-- Voters_BirthDate: string (nullable = true)
 |-- Residence_Families_HHCount: float (nullable = true)
 |-- Residence_HHGender_Description: string (nullable = false)
 |-- Mailing_Families_HHCount: float (nullable = true)
 |-- Mailing_HHGender_Description: string (nullable = false)
 |-- Parties_Description: string (nullable = false)
 |-- CommercialData_PropertyType: string (nullable = false)
 |-- AddressDistricts_Change_Changed_CD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_SD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_HD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_County: string (nullable = false)
 |-- Residence_Addresses_Density: float (nullable = true)
 |-- CommercialData_EstimatedHHIncome: string (nullable = false)
 |-- CommercialData_ISPSA: string (nullable = false)
 |-- CommercialData_AreaMedianEducationYears: float (nullable = 

In [13]:
indi.select("Voters_BirthDate").show(10)

NameError: name 'indi' is not defined

In [22]:
# do the same process for Indiana:
indi = spark.read.parquet("gs://voter-project-235-25/VM2Uniform--IN--2021-01-15_parq")
indi = indi.sample(True, 0.1, seed = 19480384)
indi = indi.select(cols_to_keep)
indi = indi.withColumn("STATE", lit("IN"))

indi = clean_numeric_categorical(indi)
indi = impute_values_function(indi)
indi = clean_voter_participation(indi)
indi.printSchema()

+-------------------------------------+---------------------------------------------+-----------------------------------------------+----------------------------------------+-------------------------------------+
|CommercialData_AreaMedianHousingValue|CommercialData_AreaPcntHHMarriedCoupleNoChild|CommercialData_AreaPcntHHMarriedCoupleWithChild|CommercialData_AreaPcntHHSpanishSpeaking|CommercialData_AreaPcntHHWithChildren|
+-------------------------------------+---------------------------------------------+-----------------------------------------------+----------------------------------------+-------------------------------------+
|                               139950|                                           40|                                             25|                                       1|                                   31|
|                               139950|                                           40|                                             25|               

                                                                                

+----------------+----------+
|Voters_BirthDate|   DATE_18|
+----------------+----------+
|      05/03/1996|2014-05-03|
|      05/03/1996|2014-05-03|
|      12/04/1959|1977-12-04|
|      11/11/1964|1982-11-11|
|      05/21/1925|1943-05-21|
|      07/22/1998|2016-07-22|
|      03/31/1966|1984-03-31|
|      12/10/1983|2001-12-10|
|      01/05/1977|1995-01-05|
|      05/18/1993|2011-05-18|
+----------------+----------+
only showing top 10 rows





+-------+
|YEAR_18|
+-------+
+-------+

root
 |-- Voters_Gender: string (nullable = false)
 |-- Voters_BirthDate: string (nullable = true)
 |-- Residence_Families_HHCount: float (nullable = true)
 |-- Residence_HHGender_Description: string (nullable = false)
 |-- Mailing_Families_HHCount: float (nullable = true)
 |-- Mailing_HHGender_Description: string (nullable = false)
 |-- Parties_Description: string (nullable = false)
 |-- CommercialData_PropertyType: string (nullable = false)
 |-- AddressDistricts_Change_Changed_CD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_SD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_HD: string (nullable = false)
 |-- AddressDistricts_Change_Changed_County: string (nullable = false)
 |-- Residence_Addresses_Density: float (nullable = true)
 |-- CommercialData_EstimatedHHIncome: string (nullable = false)
 |-- CommercialData_ISPSA: integer (nullable = true)
 |-- CommercialData_AreaMedianEducationYears: float (nullable = 

                                                                                

In [23]:
indi.columns

['Voters_Gender',
 'Voters_BirthDate',
 'Residence_Families_HHCount',
 'Residence_HHGender_Description',
 'Mailing_Families_HHCount',
 'Mailing_HHGender_Description',
 'Parties_Description',
 'CommercialData_PropertyType',
 'AddressDistricts_Change_Changed_CD',
 'AddressDistricts_Change_Changed_SD',
 'AddressDistricts_Change_Changed_HD',
 'AddressDistricts_Change_Changed_County',
 'Residence_Addresses_Density',
 'CommercialData_EstimatedHHIncome',
 'CommercialData_ISPSA',
 'CommercialData_AreaMedianEducationYears',
 'CommercialData_AreaMedianHousingValue',
 'CommercialData_AreaPcntHHMarriedCoupleNoChild',
 'CommercialData_AreaPcntHHMarriedCoupleWithChild',
 'CommercialData_AreaPcntHHSpanishSpeaking',
 'CommercialData_AreaPcntHHWithChildren',
 'CommercialData_StateIncomeDecile',
 'EthnicGroups_EthnicGroup1Desc',
 'CommercialData_DwellingType',
 'CommercialData_PresenceOfChildrenCode',
 'CommercialData_DonatesToCharityInHome',
 'CommercialData_DwellingUnitSize',
 'CommercialData_Computer

# Estimator with logistic regression index model

In [33]:
pct = ["CommercialData_AreaPcntHHMarriedCoupleNoChild",  
           "CommercialData_AreaPcntHHMarriedCoupleWithChild",
           "CommercialData_AreaPcntHHSpanishSpeaking",
           "CommercialData_AreaPcntHHWithChildren"]

numeric_cols = [
    'Residence_Families_HHCount',
    'Mailing_Families_HHCount',
    'Residence_Addresses_Density',
    "CommercialData_AreaMedianEducationYears",
    "CommercialData_AreaMedianHousingValue"
] + pct

trinary_cols = [
    'CommercialData_DonatesToCharityInHome',
    'CommercialData_ComputerOwnerInHome',
    'CommercialData_DonatesEnvironmentCauseInHome'
]

binary_cols = []

dont_touch_cols = [
    "General_2008", 
    "Voters_BirthDate", 
    "General_2000",
    "General_2004",
    "PresidentialPrimary_2000",
    "PresidentialPrimary_2004"
]

cols_excluded_from_regression = [
    'Voters_BirthDate', # removed this, but KEPT the YEAR that the voter turned 18.
    'STATE',
    'STATE_ind',
    'DATE_18',
    'comparator_date_presidential',
    'comparator_date_primary',
    'YEAR_ELIGIBLE_TO_VOTE_PRESIDENTIAL',
    'YEAR_ELIGIBLE_TO_VOTE_PRIMARY',
#    'Voted_General_2008'
]

other_cols = [c for c in df_ref.columns if c not in (dont_touch_cols+cols_excluded_from_regression)]
other_cols = [c for c in other_cols if c not in (numeric_cols + trinary_cols + binary_cols)]

categorical_cols = other_cols + binary_cols + trinary_cols

new_df = df_ref.drop(*cols_excluded_from_regression)

In [None]:
new_df.count()

                                                                                

67924252

In [34]:
[f"{c}_ind" for c in categorical_cols]

['Voters_Gender_ind',
 'Residence_HHGender_Description_ind',
 'Mailing_HHGender_Description_ind',
 'Parties_Description_ind',
 'CommercialData_PropertyType_ind',
 'AddressDistricts_Change_Changed_CD_ind',
 'AddressDistricts_Change_Changed_SD_ind',
 'AddressDistricts_Change_Changed_HD_ind',
 'AddressDistricts_Change_Changed_County_ind',
 'CommercialData_EstimatedHHIncome_ind',
 'CommercialData_ISPSA_ind',
 'CommercialData_StateIncomeDecile_ind',
 'EthnicGroups_EthnicGroup1Desc_ind',
 'CommercialData_DwellingType_ind',
 'CommercialData_PresenceOfChildrenCode_ind',
 'CommercialData_DwellingUnitSize_ind',
 'CommercialData_Education_ind',
 'Residence_Families_HHCount_imp_ind',
 'Mailing_Families_HHCount_imp_ind',
 'Residence_Addresses_Density_imp_ind',
 'CommercialData_AreaMedianEducationYears_imp_ind',
 'CommercialData_AreaMedianHousingValue_imp_ind',
 'CommercialData_AreaPcntHHMarriedCoupleNoChild_imp_ind',
 'CommercialData_AreaPcntHHMarriedCoupleWithChild_imp_ind',
 'CommercialData_AreaP

In [31]:
# new_df.groupBy('General_2004').count().show()

# indi_full = spark.read.parquet("gs://voter-project-235-25/VM2Uniform--IN--2021-01-15_parq")
# indi = indi_full.select('General_2004')

# indi.groupBy("General_2004").count().show()

# new_df = new_df.fillna("N", subset = "General_2008")
indexed_cols = [f"{c}_ind" for c in categorical_cols]

indexer = StringIndexer(inputCol = "Voted_General_2008", outputCol = "label")
new_df = indexer.fit(new_df).transform(new_df)

new_df.select(
    ["{}_imp".format(c) for c in numeric_cols] + indexed_cols + ["label"]    
).printSchema()

new_df = new_df.select(
    ["{}_imp".format(c) for c in numeric_cols] + indexed_cols + ["label"]    
)

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="label ~ .")

fittedRF = supervised.fit(new_df)

preparedDF = fittedRF.transform(new_df)

preparedDF.select("features").show(n=10, truncate=False)

train, test = new_df.randomSplit([0.7, 0.3], seed = 42069)

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()

lrModel = lr.fit(train)

featureCols = pd.DataFrame(preparedDF.schema["features"].metadata["ml_attr"]["attrs"]["nominal"]+
  preparedDF.schema["features"].metadata["ml_attr"]["attrs"]["numeric"]).sort_values("idx")

featureCols = featureCols.set_index('idx')
featureCols.head()

plt.rcParams["figure.figsize"] = (8,6)

beta = np.sort(lrModel.coefficients)
plt.plot(beta)
plt.ylabel('Beta Coefficients')

                                                                                

AnalysisException: cannot resolve '`STATE_ind`' given input columns: [input_df.AddressDistricts_Change_Changed_CD, AddressDistricts_Change_Changed_CD_ind, input_df.AddressDistricts_Change_Changed_County, AddressDistricts_Change_Changed_County_ind, input_df.AddressDistricts_Change_Changed_HD, AddressDistricts_Change_Changed_HD_ind, input_df.AddressDistricts_Change_Changed_SD, AddressDistricts_Change_Changed_SD_ind, input_df.CommercialData_AreaMedianEducationYears, CommercialData_AreaMedianEducationYears_imp, input_df.CommercialData_AreaMedianHousingValue, CommercialData_AreaMedianHousingValue_imp, input_df.CommercialData_AreaPcntHHMarriedCoupleNoChild, CommercialData_AreaPcntHHMarriedCoupleNoChild_imp, input_df.CommercialData_AreaPcntHHMarriedCoupleWithChild, CommercialData_AreaPcntHHMarriedCoupleWithChild_imp, input_df.CommercialData_AreaPcntHHSpanishSpeaking, CommercialData_AreaPcntHHSpanishSpeaking_imp, input_df.CommercialData_AreaPcntHHWithChildren, CommercialData_AreaPcntHHWithChildren_imp, input_df.CommercialData_ComputerOwnerInHome, CommercialData_ComputerOwnerInHome_ind, input_df.CommercialData_DonatesEnvironmentCauseInHome, CommercialData_DonatesEnvironmentCauseInHome_ind, input_df.CommercialData_DonatesToCharityInHome, CommercialData_DonatesToCharityInHome_ind, input_df.CommercialData_DwellingType, CommercialData_DwellingType_ind, input_df.CommercialData_DwellingUnitSize, CommercialData_DwellingUnitSize_ind, input_df.CommercialData_Education, CommercialData_Education_ind, input_df.CommercialData_EstimatedHHIncome, CommercialData_EstimatedHHIncome_ind, input_df.CommercialData_ISPSA, CommercialData_ISPSA_ind, input_df.CommercialData_PresenceOfChildrenCode, CommercialData_PresenceOfChildrenCode_ind, input_df.CommercialData_PropertyType, CommercialData_PropertyType_ind, input_df.CommercialData_StateIncomeDecile, CommercialData_StateIncomeDecile_ind, input_df.EthnicGroups_EthnicGroup1Desc, EthnicGroups_EthnicGroup1Desc_ind, General_2000, General_2004, input_df.Mailing_Families_HHCount, Mailing_Families_HHCount_imp, input_df.Mailing_HHGender_Description, Mailing_HHGender_Description_ind, input_df.Parties_Description, Parties_Description_ind, PresidentialPrimary_2000, PresidentialPrimary_2004, input_df.Residence_Addresses_Density, Residence_Addresses_Density_imp, input_df.Residence_Families_HHCount, Residence_Families_HHCount_imp, input_df.Residence_HHGender_Description, Residence_HHGender_Description_ind, Voted_General_2008, input_df.Voters_Gender, Voters_Gender_ind, YEAR_18, label];
'Project [Residence_Families_HHCount_imp#25088, Mailing_Families_HHCount_imp#25089, Residence_Addresses_Density_imp#25090, CommercialData_AreaMedianEducationYears_imp#25091, CommercialData_AreaMedianHousingValue_imp#25092, CommercialData_AreaPcntHHMarriedCoupleNoChild_imp#25093, CommercialData_AreaPcntHHMarriedCoupleWithChild_imp#25094, CommercialData_AreaPcntHHSpanishSpeaking_imp#25095, CommercialData_AreaPcntHHWithChildren_imp#25096, Voters_Gender_ind#25637, Residence_HHGender_Description_ind#25638, Mailing_HHGender_Description_ind#25639, Parties_Description_ind#25640, CommercialData_PropertyType_ind#25641, AddressDistricts_Change_Changed_CD_ind#25642, AddressDistricts_Change_Changed_SD_ind#25643, AddressDistricts_Change_Changed_HD_ind#25644, AddressDistricts_Change_Changed_County_ind#25645, CommercialData_EstimatedHHIncome_ind#25646, CommercialData_ISPSA_ind#25647, CommercialData_StateIncomeDecile_ind#25648, EthnicGroups_EthnicGroup1Desc_ind#25649, CommercialData_DwellingType_ind#25650, CommercialData_PresenceOfChildrenCode_ind#25651, ... 44 more fields]
+- Project [Voters_Gender#24739, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, CommercialData_PresenceOfChildrenCode#24753, ... 41 more fields]
   +- Project [Voters_Gender#24739, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, CommercialData_PresenceOfChildrenCode#24753, ... 40 more fields]
      +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
         +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 49 more fields]
            +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
               +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
                  +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
                     +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
                        +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
                           +- Filter (YEAR_ELIGIBLE_TO_VOTE_PRESIDENTIAL#26191 <= 2008)
                              +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 48 more fields]
                                 +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 47 more fields]
                                    +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 46 more fields]
                                       +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 45 more fields]
                                          +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 44 more fields]
                                             +- Filter AtLeastNNulls(n, Voters_BirthDate#67)
                                                +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 43 more fields]
                                                   +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 42 more fields]
                                                      +- Project [Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#24741, Parties_Description#24742, CommercialData_PropertyType#24743, AddressDistricts_Change_Changed_CD#24744, AddressDistricts_Change_Changed_SD#24745, AddressDistricts_Change_Changed_HD#24746, AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#24748, CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#24750, EthnicGroups_EthnicGroup1Desc#24751, CommercialData_DwellingType#24752, ... 21 more fields]
                                                         +- SubqueryAlias input_df
                                                            +- Project [coalesce(Voters_Gender#65, cast(Missing as string)) AS Voters_Gender#24739, Voters_BirthDate#67, Residence_Families_HHCount#24287, coalesce(Residence_HHGender_Description#45, cast(Missing as string)) AS Residence_HHGender_Description#24740, Mailing_Families_HHCount#24325, coalesce(Mailing_HHGender_Description#63, cast(Missing as string)) AS Mailing_HHGender_Description#24741, coalesce(Parties_Description#69, cast(Missing as string)) AS Parties_Description#24742, coalesce(CommercialData_PropertyType#374, cast(Missing as string)) AS CommercialData_PropertyType#24743, coalesce(AddressDistricts_Change_Changed_CD#84, cast(Missing as string)) AS AddressDistricts_Change_Changed_CD#24744, coalesce(AddressDistricts_Change_Changed_SD#86, cast(Missing as string)) AS AddressDistricts_Change_Changed_SD#24745, coalesce(AddressDistricts_Change_Changed_HD#88, cast(Missing as string)) AS AddressDistricts_Change_Changed_HD#24746, coalesce(AddressDistricts_Change_Changed_County#97, cast(Missing as string)) AS AddressDistricts_Change_Changed_County#24747, Residence_Addresses_Density#24363, coalesce(CommercialData_EstimatedHHIncome#346, cast(Missing as string)) AS CommercialData_EstimatedHHIncome#24748, coalesce(CommercialData_ISPSA#357, cast(Missing as string)) AS CommercialData_ISPSA#24749, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, coalesce(CommercialData_StateIncomeDecile#377, cast(Missing as string)) AS CommercialData_StateIncomeDecile#24750, coalesce(EthnicGroups_EthnicGroup1Desc#72, cast(Missing as string)) AS EthnicGroups_EthnicGroup1Desc#24751, coalesce(CommercialData_DwellingType#342, cast(Missing as string)) AS CommercialData_DwellingType#24752, ... 12 more fields]
                                                               +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                  +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24553, cast(CommercialData_AreaPcntHHWithChildren#24153 as float) AS CommercialData_AreaPcntHHWithChildren#24591, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                     +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, cast(CommercialData_AreaPcntHHSpanishSpeaking#24116 as float) AS CommercialData_AreaPcntHHSpanishSpeaking#24553, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                        +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, cast(CommercialData_AreaPcntHHMarriedCoupleWithChild#24079 as float) AS CommercialData_AreaPcntHHMarriedCoupleWithChild#24515, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                           +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24439, cast(CommercialData_AreaPcntHHMarriedCoupleNoChild#24042 as float) AS CommercialData_AreaPcntHHMarriedCoupleNoChild#24477, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                              +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#24401, cast(CommercialData_AreaMedianHousingValue#24005 as float) AS CommercialData_AreaMedianHousingValue#24439, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                 +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, cast(CommercialData_AreaMedianEducationYears#360 as float) AS CommercialData_AreaMedianEducationYears#24401, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                    +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, cast(Residence_Addresses_Density#42 as float) AS Residence_Addresses_Density#24363, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                       +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#24287, Residence_HHGender_Description#45, cast(Mailing_Families_HHCount#62 as float) AS Mailing_Families_HHCount#24325, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                          +- Project [Voters_Gender#65, Voters_BirthDate#67, cast(Residence_Families_HHCount#44 as float) AS Residence_Families_HHCount#24287, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                             +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#24116, substring(CommercialData_AreaPcntHHWithChildren#370, 1, (length(CommercialData_AreaPcntHHWithChildren#370) - 1)) AS CommercialData_AreaPcntHHWithChildren#24153, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                                +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, substring(CommercialData_AreaPcntHHSpanishSpeaking#369, 1, (length(CommercialData_AreaPcntHHSpanishSpeaking#369) - 1)) AS CommercialData_AreaPcntHHSpanishSpeaking#24116, CommercialData_AreaPcntHHWithChildren#370, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                                   +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, substring(CommercialData_AreaPcntHHMarriedCoupleWithChild#368, 1, (length(CommercialData_AreaPcntHHMarriedCoupleWithChild#368) - 1)) AS CommercialData_AreaPcntHHMarriedCoupleWithChild#24079, CommercialData_AreaPcntHHSpanishSpeaking#369, CommercialData_AreaPcntHHWithChildren#370, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                                      +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#24005, substring(CommercialData_AreaPcntHHMarriedCoupleNoChild#367, 1, (length(CommercialData_AreaPcntHHMarriedCoupleNoChild#367) - 1)) AS CommercialData_AreaPcntHHMarriedCoupleNoChild#24042, CommercialData_AreaPcntHHMarriedCoupleWithChild#368, CommercialData_AreaPcntHHSpanishSpeaking#369, CommercialData_AreaPcntHHWithChildren#370, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                                         +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, substring(CommercialData_AreaMedianHousingValue#361, 2, length(CommercialData_AreaMedianHousingValue#361)) AS CommercialData_AreaMedianHousingValue#24005, CommercialData_AreaPcntHHMarriedCoupleNoChild#367, CommercialData_AreaPcntHHMarriedCoupleWithChild#368, CommercialData_AreaPcntHHSpanishSpeaking#369, CommercialData_AreaPcntHHWithChildren#370, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                                            +- Union false, false
                                                                                                               :- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#361, CommercialData_AreaPcntHHMarriedCoupleNoChild#367, CommercialData_AreaPcntHHMarriedCoupleWithChild#368, CommercialData_AreaPcntHHSpanishSpeaking#369, CommercialData_AreaPcntHHWithChildren#370, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#65, Voters_BirthDate#67, Residence_Families_HHCount#44, Residence_HHGender_Description#45, Mailing_Families_HHCount#62, Mailing_HHGender_Description#63, Parties_Description#69, CommercialData_PropertyType#374, AddressDistricts_Change_Changed_CD#84, AddressDistricts_Change_Changed_SD#86, AddressDistricts_Change_Changed_HD#88, AddressDistricts_Change_Changed_County#97, Residence_Addresses_Density#42, CommercialData_EstimatedHHIncome#346, CommercialData_ISPSA#357, CommercialData_AreaMedianEducationYears#360, CommercialData_AreaMedianHousingValue#361, CommercialData_AreaPcntHHMarriedCoupleNoChild#367, CommercialData_AreaPcntHHMarriedCoupleWithChild#368, CommercialData_AreaPcntHHSpanishSpeaking#369, CommercialData_AreaPcntHHWithChildren#370, CommercialData_StateIncomeDecile#377, EthnicGroups_EthnicGroup1Desc#72, CommercialData_DwellingType#342, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#0,LALVOTERID#1,Voters_Active#2,Voters_StateVoterID#3,Voters_CountyVoterID#4,VoterTelephones_LandlineAreaCode#5,VoterTelephones_Landline7Digit#6,VoterTelephones_LandlineFormatted#7,VoterTelephones_LandlineUnformatted#8,VoterTelephones_LandlineConfidenceCode#9,VoterTelephones_CellPhoneOnly#10,VoterTelephones_CellPhoneFormatted#11,VoterTelephones_CellPhoneUnformatted#12,VoterTelephones_CellConfidenceCode#13,Voters_FirstName#14,Voters_MiddleName#15,Voters_LastName#16,Voters_NameSuffix#17,Residence_Addresses_AddressLine#18,Residence_Addresses_ExtraAddressLine#19,Residence_Addresses_City#20,Residence_Addresses_State#21,Residence_Addresses_Zip#22,Residence_Addresses_ZipPlus4#23,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#1629, Voters_BirthDate#1631, Residence_Families_HHCount#1608, Residence_HHGender_Description#1609, Mailing_Families_HHCount#1626, Mailing_HHGender_Description#1627, Parties_Description#1633, CommercialData_PropertyType#1938, AddressDistricts_Change_Changed_CD#1648, AddressDistricts_Change_Changed_SD#1650, AddressDistricts_Change_Changed_HD#1652, AddressDistricts_Change_Changed_County#1661, Residence_Addresses_Density#1606, CommercialData_EstimatedHHIncome#1910, CommercialData_ISPSA#1921, CommercialData_AreaMedianEducationYears#1924, CommercialData_AreaMedianHousingValue#1925, CommercialData_AreaPcntHHMarriedCoupleNoChild#1931, CommercialData_AreaPcntHHMarriedCoupleWithChild#1932, CommercialData_AreaPcntHHSpanishSpeaking#1933, CommercialData_AreaPcntHHWithChildren#1934, CommercialData_StateIncomeDecile#1941, EthnicGroups_EthnicGroup1Desc#1636, CommercialData_DwellingType#1906, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#1629, Voters_BirthDate#1631, Residence_Families_HHCount#1608, Residence_HHGender_Description#1609, Mailing_Families_HHCount#1626, Mailing_HHGender_Description#1627, Parties_Description#1633, CommercialData_PropertyType#1938, AddressDistricts_Change_Changed_CD#1648, AddressDistricts_Change_Changed_SD#1650, AddressDistricts_Change_Changed_HD#1652, AddressDistricts_Change_Changed_County#1661, Residence_Addresses_Density#1606, CommercialData_EstimatedHHIncome#1910, CommercialData_ISPSA#1921, CommercialData_AreaMedianEducationYears#1924, CommercialData_AreaMedianHousingValue#1925, CommercialData_AreaPcntHHMarriedCoupleNoChild#1931, CommercialData_AreaPcntHHMarriedCoupleWithChild#1932, CommercialData_AreaPcntHHSpanishSpeaking#1933, CommercialData_AreaPcntHHWithChildren#1934, CommercialData_StateIncomeDecile#1941, EthnicGroups_EthnicGroup1Desc#1636, CommercialData_DwellingType#1906, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#1564,LALVOTERID#1565,Voters_Active#1566,Voters_StateVoterID#1567,Voters_CountyVoterID#1568,VoterTelephones_LandlineAreaCode#1569,VoterTelephones_Landline7Digit#1570,VoterTelephones_LandlineFormatted#1571,VoterTelephones_LandlineUnformatted#1572,VoterTelephones_LandlineConfidenceCode#1573,VoterTelephones_CellPhoneOnly#1574,VoterTelephones_CellPhoneFormatted#1575,VoterTelephones_CellPhoneUnformatted#1576,VoterTelephones_CellConfidenceCode#1577,Voters_FirstName#1578,Voters_MiddleName#1579,Voters_LastName#1580,Voters_NameSuffix#1581,Residence_Addresses_AddressLine#1582,Residence_Addresses_ExtraAddressLine#1583,Residence_Addresses_City#1584,Residence_Addresses_State#1585,Residence_Addresses_Zip#1586,Residence_Addresses_ZipPlus4#1587,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#3229, Voters_BirthDate#3231, Residence_Families_HHCount#3208, Residence_HHGender_Description#3209, Mailing_Families_HHCount#3226, Mailing_HHGender_Description#3227, Parties_Description#3233, CommercialData_PropertyType#3538, AddressDistricts_Change_Changed_CD#3248, AddressDistricts_Change_Changed_SD#3250, AddressDistricts_Change_Changed_HD#3252, AddressDistricts_Change_Changed_County#3261, Residence_Addresses_Density#3206, CommercialData_EstimatedHHIncome#3510, CommercialData_ISPSA#3521, CommercialData_AreaMedianEducationYears#3524, CommercialData_AreaMedianHousingValue#3525, CommercialData_AreaPcntHHMarriedCoupleNoChild#3531, CommercialData_AreaPcntHHMarriedCoupleWithChild#3532, CommercialData_AreaPcntHHSpanishSpeaking#3533, CommercialData_AreaPcntHHWithChildren#3534, CommercialData_StateIncomeDecile#3541, EthnicGroups_EthnicGroup1Desc#3236, CommercialData_DwellingType#3506, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#3229, Voters_BirthDate#3231, Residence_Families_HHCount#3208, Residence_HHGender_Description#3209, Mailing_Families_HHCount#3226, Mailing_HHGender_Description#3227, Parties_Description#3233, CommercialData_PropertyType#3538, AddressDistricts_Change_Changed_CD#3248, AddressDistricts_Change_Changed_SD#3250, AddressDistricts_Change_Changed_HD#3252, AddressDistricts_Change_Changed_County#3261, Residence_Addresses_Density#3206, CommercialData_EstimatedHHIncome#3510, CommercialData_ISPSA#3521, CommercialData_AreaMedianEducationYears#3524, CommercialData_AreaMedianHousingValue#3525, CommercialData_AreaPcntHHMarriedCoupleNoChild#3531, CommercialData_AreaPcntHHMarriedCoupleWithChild#3532, CommercialData_AreaPcntHHSpanishSpeaking#3533, CommercialData_AreaPcntHHWithChildren#3534, CommercialData_StateIncomeDecile#3541, EthnicGroups_EthnicGroup1Desc#3236, CommercialData_DwellingType#3506, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#3164,LALVOTERID#3165,Voters_Active#3166,Voters_StateVoterID#3167,Voters_CountyVoterID#3168,VoterTelephones_LandlineAreaCode#3169,VoterTelephones_Landline7Digit#3170,VoterTelephones_LandlineFormatted#3171,VoterTelephones_LandlineUnformatted#3172,VoterTelephones_LandlineConfidenceCode#3173,VoterTelephones_CellPhoneOnly#3174,VoterTelephones_CellPhoneFormatted#3175,VoterTelephones_CellPhoneUnformatted#3176,VoterTelephones_CellConfidenceCode#3177,Voters_FirstName#3178,Voters_MiddleName#3179,Voters_LastName#3180,Voters_NameSuffix#3181,Residence_Addresses_AddressLine#3182,Residence_Addresses_ExtraAddressLine#3183,Residence_Addresses_City#3184,Residence_Addresses_State#3185,Residence_Addresses_Zip#3186,Residence_Addresses_ZipPlus4#3187,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#4829, Voters_BirthDate#4831, Residence_Families_HHCount#4808, Residence_HHGender_Description#4809, Mailing_Families_HHCount#4826, Mailing_HHGender_Description#4827, Parties_Description#4833, CommercialData_PropertyType#5138, AddressDistricts_Change_Changed_CD#4848, AddressDistricts_Change_Changed_SD#4850, AddressDistricts_Change_Changed_HD#4852, AddressDistricts_Change_Changed_County#4861, Residence_Addresses_Density#4806, CommercialData_EstimatedHHIncome#5110, CommercialData_ISPSA#5121, CommercialData_AreaMedianEducationYears#5124, CommercialData_AreaMedianHousingValue#5125, CommercialData_AreaPcntHHMarriedCoupleNoChild#5131, CommercialData_AreaPcntHHMarriedCoupleWithChild#5132, CommercialData_AreaPcntHHSpanishSpeaking#5133, CommercialData_AreaPcntHHWithChildren#5134, CommercialData_StateIncomeDecile#5141, EthnicGroups_EthnicGroup1Desc#4836, CommercialData_DwellingType#5106, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#4829, Voters_BirthDate#4831, Residence_Families_HHCount#4808, Residence_HHGender_Description#4809, Mailing_Families_HHCount#4826, Mailing_HHGender_Description#4827, Parties_Description#4833, CommercialData_PropertyType#5138, AddressDistricts_Change_Changed_CD#4848, AddressDistricts_Change_Changed_SD#4850, AddressDistricts_Change_Changed_HD#4852, AddressDistricts_Change_Changed_County#4861, Residence_Addresses_Density#4806, CommercialData_EstimatedHHIncome#5110, CommercialData_ISPSA#5121, CommercialData_AreaMedianEducationYears#5124, CommercialData_AreaMedianHousingValue#5125, CommercialData_AreaPcntHHMarriedCoupleNoChild#5131, CommercialData_AreaPcntHHMarriedCoupleWithChild#5132, CommercialData_AreaPcntHHSpanishSpeaking#5133, CommercialData_AreaPcntHHWithChildren#5134, CommercialData_StateIncomeDecile#5141, EthnicGroups_EthnicGroup1Desc#4836, CommercialData_DwellingType#5106, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#4764,LALVOTERID#4765,Voters_Active#4766,Voters_StateVoterID#4767,Voters_CountyVoterID#4768,VoterTelephones_LandlineAreaCode#4769,VoterTelephones_Landline7Digit#4770,VoterTelephones_LandlineFormatted#4771,VoterTelephones_LandlineUnformatted#4772,VoterTelephones_LandlineConfidenceCode#4773,VoterTelephones_CellPhoneOnly#4774,VoterTelephones_CellPhoneFormatted#4775,VoterTelephones_CellPhoneUnformatted#4776,VoterTelephones_CellConfidenceCode#4777,Voters_FirstName#4778,Voters_MiddleName#4779,Voters_LastName#4780,Voters_NameSuffix#4781,Residence_Addresses_AddressLine#4782,Residence_Addresses_ExtraAddressLine#4783,Residence_Addresses_City#4784,Residence_Addresses_State#4785,Residence_Addresses_Zip#4786,Residence_Addresses_ZipPlus4#4787,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#6429, Voters_BirthDate#6431, Residence_Families_HHCount#6408, Residence_HHGender_Description#6409, Mailing_Families_HHCount#6426, Mailing_HHGender_Description#6427, Parties_Description#6433, CommercialData_PropertyType#6738, AddressDistricts_Change_Changed_CD#6448, AddressDistricts_Change_Changed_SD#6450, AddressDistricts_Change_Changed_HD#6452, AddressDistricts_Change_Changed_County#6461, Residence_Addresses_Density#6406, CommercialData_EstimatedHHIncome#6710, CommercialData_ISPSA#6721, CommercialData_AreaMedianEducationYears#6724, CommercialData_AreaMedianHousingValue#6725, CommercialData_AreaPcntHHMarriedCoupleNoChild#6731, CommercialData_AreaPcntHHMarriedCoupleWithChild#6732, CommercialData_AreaPcntHHSpanishSpeaking#6733, CommercialData_AreaPcntHHWithChildren#6734, CommercialData_StateIncomeDecile#6741, EthnicGroups_EthnicGroup1Desc#6436, CommercialData_DwellingType#6706, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#6429, Voters_BirthDate#6431, Residence_Families_HHCount#6408, Residence_HHGender_Description#6409, Mailing_Families_HHCount#6426, Mailing_HHGender_Description#6427, Parties_Description#6433, CommercialData_PropertyType#6738, AddressDistricts_Change_Changed_CD#6448, AddressDistricts_Change_Changed_SD#6450, AddressDistricts_Change_Changed_HD#6452, AddressDistricts_Change_Changed_County#6461, Residence_Addresses_Density#6406, CommercialData_EstimatedHHIncome#6710, CommercialData_ISPSA#6721, CommercialData_AreaMedianEducationYears#6724, CommercialData_AreaMedianHousingValue#6725, CommercialData_AreaPcntHHMarriedCoupleNoChild#6731, CommercialData_AreaPcntHHMarriedCoupleWithChild#6732, CommercialData_AreaPcntHHSpanishSpeaking#6733, CommercialData_AreaPcntHHWithChildren#6734, CommercialData_StateIncomeDecile#6741, EthnicGroups_EthnicGroup1Desc#6436, CommercialData_DwellingType#6706, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#6364,LALVOTERID#6365,Voters_Active#6366,Voters_StateVoterID#6367,Voters_CountyVoterID#6368,VoterTelephones_LandlineAreaCode#6369,VoterTelephones_Landline7Digit#6370,VoterTelephones_LandlineFormatted#6371,VoterTelephones_LandlineUnformatted#6372,VoterTelephones_LandlineConfidenceCode#6373,VoterTelephones_CellPhoneOnly#6374,VoterTelephones_CellPhoneFormatted#6375,VoterTelephones_CellPhoneUnformatted#6376,VoterTelephones_CellConfidenceCode#6377,Voters_FirstName#6378,Voters_MiddleName#6379,Voters_LastName#6380,Voters_NameSuffix#6381,Residence_Addresses_AddressLine#6382,Residence_Addresses_ExtraAddressLine#6383,Residence_Addresses_City#6384,Residence_Addresses_State#6385,Residence_Addresses_Zip#6386,Residence_Addresses_ZipPlus4#6387,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#8029, Voters_BirthDate#8031, Residence_Families_HHCount#8008, Residence_HHGender_Description#8009, Mailing_Families_HHCount#8026, Mailing_HHGender_Description#8027, Parties_Description#8033, CommercialData_PropertyType#8338, AddressDistricts_Change_Changed_CD#8048, AddressDistricts_Change_Changed_SD#8050, AddressDistricts_Change_Changed_HD#8052, AddressDistricts_Change_Changed_County#8061, Residence_Addresses_Density#8006, CommercialData_EstimatedHHIncome#8310, CommercialData_ISPSA#8321, CommercialData_AreaMedianEducationYears#8324, CommercialData_AreaMedianHousingValue#8325, CommercialData_AreaPcntHHMarriedCoupleNoChild#8331, CommercialData_AreaPcntHHMarriedCoupleWithChild#8332, CommercialData_AreaPcntHHSpanishSpeaking#8333, CommercialData_AreaPcntHHWithChildren#8334, CommercialData_StateIncomeDecile#8341, EthnicGroups_EthnicGroup1Desc#8036, CommercialData_DwellingType#8306, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#8029, Voters_BirthDate#8031, Residence_Families_HHCount#8008, Residence_HHGender_Description#8009, Mailing_Families_HHCount#8026, Mailing_HHGender_Description#8027, Parties_Description#8033, CommercialData_PropertyType#8338, AddressDistricts_Change_Changed_CD#8048, AddressDistricts_Change_Changed_SD#8050, AddressDistricts_Change_Changed_HD#8052, AddressDistricts_Change_Changed_County#8061, Residence_Addresses_Density#8006, CommercialData_EstimatedHHIncome#8310, CommercialData_ISPSA#8321, CommercialData_AreaMedianEducationYears#8324, CommercialData_AreaMedianHousingValue#8325, CommercialData_AreaPcntHHMarriedCoupleNoChild#8331, CommercialData_AreaPcntHHMarriedCoupleWithChild#8332, CommercialData_AreaPcntHHSpanishSpeaking#8333, CommercialData_AreaPcntHHWithChildren#8334, CommercialData_StateIncomeDecile#8341, EthnicGroups_EthnicGroup1Desc#8036, CommercialData_DwellingType#8306, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#7964,LALVOTERID#7965,Voters_Active#7966,Voters_StateVoterID#7967,Voters_CountyVoterID#7968,VoterTelephones_LandlineAreaCode#7969,VoterTelephones_Landline7Digit#7970,VoterTelephones_LandlineFormatted#7971,VoterTelephones_LandlineUnformatted#7972,VoterTelephones_LandlineConfidenceCode#7973,VoterTelephones_CellPhoneOnly#7974,VoterTelephones_CellPhoneFormatted#7975,VoterTelephones_CellPhoneUnformatted#7976,VoterTelephones_CellConfidenceCode#7977,Voters_FirstName#7978,Voters_MiddleName#7979,Voters_LastName#7980,Voters_NameSuffix#7981,Residence_Addresses_AddressLine#7982,Residence_Addresses_ExtraAddressLine#7983,Residence_Addresses_City#7984,Residence_Addresses_State#7985,Residence_Addresses_Zip#7986,Residence_Addresses_ZipPlus4#7987,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#9629, Voters_BirthDate#9631, Residence_Families_HHCount#9608, Residence_HHGender_Description#9609, Mailing_Families_HHCount#9626, Mailing_HHGender_Description#9627, Parties_Description#9633, CommercialData_PropertyType#9938, AddressDistricts_Change_Changed_CD#9648, AddressDistricts_Change_Changed_SD#9650, AddressDistricts_Change_Changed_HD#9652, AddressDistricts_Change_Changed_County#9661, Residence_Addresses_Density#9606, CommercialData_EstimatedHHIncome#9910, CommercialData_ISPSA#9921, CommercialData_AreaMedianEducationYears#9924, CommercialData_AreaMedianHousingValue#9925, CommercialData_AreaPcntHHMarriedCoupleNoChild#9931, CommercialData_AreaPcntHHMarriedCoupleWithChild#9932, CommercialData_AreaPcntHHSpanishSpeaking#9933, CommercialData_AreaPcntHHWithChildren#9934, CommercialData_StateIncomeDecile#9941, EthnicGroups_EthnicGroup1Desc#9636, CommercialData_DwellingType#9906, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#9629, Voters_BirthDate#9631, Residence_Families_HHCount#9608, Residence_HHGender_Description#9609, Mailing_Families_HHCount#9626, Mailing_HHGender_Description#9627, Parties_Description#9633, CommercialData_PropertyType#9938, AddressDistricts_Change_Changed_CD#9648, AddressDistricts_Change_Changed_SD#9650, AddressDistricts_Change_Changed_HD#9652, AddressDistricts_Change_Changed_County#9661, Residence_Addresses_Density#9606, CommercialData_EstimatedHHIncome#9910, CommercialData_ISPSA#9921, CommercialData_AreaMedianEducationYears#9924, CommercialData_AreaMedianHousingValue#9925, CommercialData_AreaPcntHHMarriedCoupleNoChild#9931, CommercialData_AreaPcntHHMarriedCoupleWithChild#9932, CommercialData_AreaPcntHHSpanishSpeaking#9933, CommercialData_AreaPcntHHWithChildren#9934, CommercialData_StateIncomeDecile#9941, EthnicGroups_EthnicGroup1Desc#9636, CommercialData_DwellingType#9906, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#9564,LALVOTERID#9565,Voters_Active#9566,Voters_StateVoterID#9567,Voters_CountyVoterID#9568,VoterTelephones_LandlineAreaCode#9569,VoterTelephones_Landline7Digit#9570,VoterTelephones_LandlineFormatted#9571,VoterTelephones_LandlineUnformatted#9572,VoterTelephones_LandlineConfidenceCode#9573,VoterTelephones_CellPhoneOnly#9574,VoterTelephones_CellPhoneFormatted#9575,VoterTelephones_CellPhoneUnformatted#9576,VoterTelephones_CellConfidenceCode#9577,Voters_FirstName#9578,Voters_MiddleName#9579,Voters_LastName#9580,Voters_NameSuffix#9581,Residence_Addresses_AddressLine#9582,Residence_Addresses_ExtraAddressLine#9583,Residence_Addresses_City#9584,Residence_Addresses_State#9585,Residence_Addresses_Zip#9586,Residence_Addresses_ZipPlus4#9587,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#11229, Voters_BirthDate#11231, Residence_Families_HHCount#11208, Residence_HHGender_Description#11209, Mailing_Families_HHCount#11226, Mailing_HHGender_Description#11227, Parties_Description#11233, CommercialData_PropertyType#11538, AddressDistricts_Change_Changed_CD#11248, AddressDistricts_Change_Changed_SD#11250, AddressDistricts_Change_Changed_HD#11252, AddressDistricts_Change_Changed_County#11261, Residence_Addresses_Density#11206, CommercialData_EstimatedHHIncome#11510, CommercialData_ISPSA#11521, CommercialData_AreaMedianEducationYears#11524, CommercialData_AreaMedianHousingValue#11525, CommercialData_AreaPcntHHMarriedCoupleNoChild#11531, CommercialData_AreaPcntHHMarriedCoupleWithChild#11532, CommercialData_AreaPcntHHSpanishSpeaking#11533, CommercialData_AreaPcntHHWithChildren#11534, CommercialData_StateIncomeDecile#11541, EthnicGroups_EthnicGroup1Desc#11236, CommercialData_DwellingType#11506, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#11229, Voters_BirthDate#11231, Residence_Families_HHCount#11208, Residence_HHGender_Description#11209, Mailing_Families_HHCount#11226, Mailing_HHGender_Description#11227, Parties_Description#11233, CommercialData_PropertyType#11538, AddressDistricts_Change_Changed_CD#11248, AddressDistricts_Change_Changed_SD#11250, AddressDistricts_Change_Changed_HD#11252, AddressDistricts_Change_Changed_County#11261, Residence_Addresses_Density#11206, CommercialData_EstimatedHHIncome#11510, CommercialData_ISPSA#11521, CommercialData_AreaMedianEducationYears#11524, CommercialData_AreaMedianHousingValue#11525, CommercialData_AreaPcntHHMarriedCoupleNoChild#11531, CommercialData_AreaPcntHHMarriedCoupleWithChild#11532, CommercialData_AreaPcntHHSpanishSpeaking#11533, CommercialData_AreaPcntHHWithChildren#11534, CommercialData_StateIncomeDecile#11541, EthnicGroups_EthnicGroup1Desc#11236, CommercialData_DwellingType#11506, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#11164,LALVOTERID#11165,Voters_Active#11166,Voters_StateVoterID#11167,Voters_CountyVoterID#11168,VoterTelephones_LandlineAreaCode#11169,VoterTelephones_Landline7Digit#11170,VoterTelephones_LandlineFormatted#11171,VoterTelephones_LandlineUnformatted#11172,VoterTelephones_LandlineConfidenceCode#11173,VoterTelephones_CellPhoneOnly#11174,VoterTelephones_CellPhoneFormatted#11175,VoterTelephones_CellPhoneUnformatted#11176,VoterTelephones_CellConfidenceCode#11177,Voters_FirstName#11178,Voters_MiddleName#11179,Voters_LastName#11180,Voters_NameSuffix#11181,Residence_Addresses_AddressLine#11182,Residence_Addresses_ExtraAddressLine#11183,Residence_Addresses_City#11184,Residence_Addresses_State#11185,Residence_Addresses_Zip#11186,Residence_Addresses_ZipPlus4#11187,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#12829, Voters_BirthDate#12831, Residence_Families_HHCount#12808, Residence_HHGender_Description#12809, Mailing_Families_HHCount#12826, Mailing_HHGender_Description#12827, Parties_Description#12833, CommercialData_PropertyType#13138, AddressDistricts_Change_Changed_CD#12848, AddressDistricts_Change_Changed_SD#12850, AddressDistricts_Change_Changed_HD#12852, AddressDistricts_Change_Changed_County#12861, Residence_Addresses_Density#12806, CommercialData_EstimatedHHIncome#13110, CommercialData_ISPSA#13121, CommercialData_AreaMedianEducationYears#13124, CommercialData_AreaMedianHousingValue#13125, CommercialData_AreaPcntHHMarriedCoupleNoChild#13131, CommercialData_AreaPcntHHMarriedCoupleWithChild#13132, CommercialData_AreaPcntHHSpanishSpeaking#13133, CommercialData_AreaPcntHHWithChildren#13134, CommercialData_StateIncomeDecile#13141, EthnicGroups_EthnicGroup1Desc#12836, CommercialData_DwellingType#13106, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#12829, Voters_BirthDate#12831, Residence_Families_HHCount#12808, Residence_HHGender_Description#12809, Mailing_Families_HHCount#12826, Mailing_HHGender_Description#12827, Parties_Description#12833, CommercialData_PropertyType#13138, AddressDistricts_Change_Changed_CD#12848, AddressDistricts_Change_Changed_SD#12850, AddressDistricts_Change_Changed_HD#12852, AddressDistricts_Change_Changed_County#12861, Residence_Addresses_Density#12806, CommercialData_EstimatedHHIncome#13110, CommercialData_ISPSA#13121, CommercialData_AreaMedianEducationYears#13124, CommercialData_AreaMedianHousingValue#13125, CommercialData_AreaPcntHHMarriedCoupleNoChild#13131, CommercialData_AreaPcntHHMarriedCoupleWithChild#13132, CommercialData_AreaPcntHHSpanishSpeaking#13133, CommercialData_AreaPcntHHWithChildren#13134, CommercialData_StateIncomeDecile#13141, EthnicGroups_EthnicGroup1Desc#12836, CommercialData_DwellingType#13106, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#12764,LALVOTERID#12765,Voters_Active#12766,Voters_StateVoterID#12767,Voters_CountyVoterID#12768,VoterTelephones_LandlineAreaCode#12769,VoterTelephones_Landline7Digit#12770,VoterTelephones_LandlineFormatted#12771,VoterTelephones_LandlineUnformatted#12772,VoterTelephones_LandlineConfidenceCode#12773,VoterTelephones_CellPhoneOnly#12774,VoterTelephones_CellPhoneFormatted#12775,VoterTelephones_CellPhoneUnformatted#12776,VoterTelephones_CellConfidenceCode#12777,Voters_FirstName#12778,Voters_MiddleName#12779,Voters_LastName#12780,Voters_NameSuffix#12781,Residence_Addresses_AddressLine#12782,Residence_Addresses_ExtraAddressLine#12783,Residence_Addresses_City#12784,Residence_Addresses_State#12785,Residence_Addresses_Zip#12786,Residence_Addresses_ZipPlus4#12787,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#14429, Voters_BirthDate#14431, Residence_Families_HHCount#14408, Residence_HHGender_Description#14409, Mailing_Families_HHCount#14426, Mailing_HHGender_Description#14427, Parties_Description#14433, CommercialData_PropertyType#14738, AddressDistricts_Change_Changed_CD#14448, AddressDistricts_Change_Changed_SD#14450, AddressDistricts_Change_Changed_HD#14452, AddressDistricts_Change_Changed_County#14461, Residence_Addresses_Density#14406, CommercialData_EstimatedHHIncome#14710, CommercialData_ISPSA#14721, CommercialData_AreaMedianEducationYears#14724, CommercialData_AreaMedianHousingValue#14725, CommercialData_AreaPcntHHMarriedCoupleNoChild#14731, CommercialData_AreaPcntHHMarriedCoupleWithChild#14732, CommercialData_AreaPcntHHSpanishSpeaking#14733, CommercialData_AreaPcntHHWithChildren#14734, CommercialData_StateIncomeDecile#14741, EthnicGroups_EthnicGroup1Desc#14436, CommercialData_DwellingType#14706, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#14429, Voters_BirthDate#14431, Residence_Families_HHCount#14408, Residence_HHGender_Description#14409, Mailing_Families_HHCount#14426, Mailing_HHGender_Description#14427, Parties_Description#14433, CommercialData_PropertyType#14738, AddressDistricts_Change_Changed_CD#14448, AddressDistricts_Change_Changed_SD#14450, AddressDistricts_Change_Changed_HD#14452, AddressDistricts_Change_Changed_County#14461, Residence_Addresses_Density#14406, CommercialData_EstimatedHHIncome#14710, CommercialData_ISPSA#14721, CommercialData_AreaMedianEducationYears#14724, CommercialData_AreaMedianHousingValue#14725, CommercialData_AreaPcntHHMarriedCoupleNoChild#14731, CommercialData_AreaPcntHHMarriedCoupleWithChild#14732, CommercialData_AreaPcntHHSpanishSpeaking#14733, CommercialData_AreaPcntHHWithChildren#14734, CommercialData_StateIncomeDecile#14741, EthnicGroups_EthnicGroup1Desc#14436, CommercialData_DwellingType#14706, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#14364,LALVOTERID#14365,Voters_Active#14366,Voters_StateVoterID#14367,Voters_CountyVoterID#14368,VoterTelephones_LandlineAreaCode#14369,VoterTelephones_Landline7Digit#14370,VoterTelephones_LandlineFormatted#14371,VoterTelephones_LandlineUnformatted#14372,VoterTelephones_LandlineConfidenceCode#14373,VoterTelephones_CellPhoneOnly#14374,VoterTelephones_CellPhoneFormatted#14375,VoterTelephones_CellPhoneUnformatted#14376,VoterTelephones_CellConfidenceCode#14377,Voters_FirstName#14378,Voters_MiddleName#14379,Voters_LastName#14380,Voters_NameSuffix#14381,Residence_Addresses_AddressLine#14382,Residence_Addresses_ExtraAddressLine#14383,Residence_Addresses_City#14384,Residence_Addresses_State#14385,Residence_Addresses_Zip#14386,Residence_Addresses_ZipPlus4#14387,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#16029, Voters_BirthDate#16031, Residence_Families_HHCount#16008, Residence_HHGender_Description#16009, Mailing_Families_HHCount#16026, Mailing_HHGender_Description#16027, Parties_Description#16033, CommercialData_PropertyType#16338, AddressDistricts_Change_Changed_CD#16048, AddressDistricts_Change_Changed_SD#16050, AddressDistricts_Change_Changed_HD#16052, AddressDistricts_Change_Changed_County#16061, Residence_Addresses_Density#16006, CommercialData_EstimatedHHIncome#16310, CommercialData_ISPSA#16321, CommercialData_AreaMedianEducationYears#16324, CommercialData_AreaMedianHousingValue#16325, CommercialData_AreaPcntHHMarriedCoupleNoChild#16331, CommercialData_AreaPcntHHMarriedCoupleWithChild#16332, CommercialData_AreaPcntHHSpanishSpeaking#16333, CommercialData_AreaPcntHHWithChildren#16334, CommercialData_StateIncomeDecile#16341, EthnicGroups_EthnicGroup1Desc#16036, CommercialData_DwellingType#16306, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#16029, Voters_BirthDate#16031, Residence_Families_HHCount#16008, Residence_HHGender_Description#16009, Mailing_Families_HHCount#16026, Mailing_HHGender_Description#16027, Parties_Description#16033, CommercialData_PropertyType#16338, AddressDistricts_Change_Changed_CD#16048, AddressDistricts_Change_Changed_SD#16050, AddressDistricts_Change_Changed_HD#16052, AddressDistricts_Change_Changed_County#16061, Residence_Addresses_Density#16006, CommercialData_EstimatedHHIncome#16310, CommercialData_ISPSA#16321, CommercialData_AreaMedianEducationYears#16324, CommercialData_AreaMedianHousingValue#16325, CommercialData_AreaPcntHHMarriedCoupleNoChild#16331, CommercialData_AreaPcntHHMarriedCoupleWithChild#16332, CommercialData_AreaPcntHHSpanishSpeaking#16333, CommercialData_AreaPcntHHWithChildren#16334, CommercialData_StateIncomeDecile#16341, EthnicGroups_EthnicGroup1Desc#16036, CommercialData_DwellingType#16306, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#15964,LALVOTERID#15965,Voters_Active#15966,Voters_StateVoterID#15967,Voters_CountyVoterID#15968,VoterTelephones_LandlineAreaCode#15969,VoterTelephones_Landline7Digit#15970,VoterTelephones_LandlineFormatted#15971,VoterTelephones_LandlineUnformatted#15972,VoterTelephones_LandlineConfidenceCode#15973,VoterTelephones_CellPhoneOnly#15974,VoterTelephones_CellPhoneFormatted#15975,VoterTelephones_CellPhoneUnformatted#15976,VoterTelephones_CellConfidenceCode#15977,Voters_FirstName#15978,Voters_MiddleName#15979,Voters_LastName#15980,Voters_NameSuffix#15981,Residence_Addresses_AddressLine#15982,Residence_Addresses_ExtraAddressLine#15983,Residence_Addresses_City#15984,Residence_Addresses_State#15985,Residence_Addresses_Zip#15986,Residence_Addresses_ZipPlus4#15987,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#17629, Voters_BirthDate#17631, Residence_Families_HHCount#17608, Residence_HHGender_Description#17609, Mailing_Families_HHCount#17626, Mailing_HHGender_Description#17627, Parties_Description#17633, CommercialData_PropertyType#17938, AddressDistricts_Change_Changed_CD#17648, AddressDistricts_Change_Changed_SD#17650, AddressDistricts_Change_Changed_HD#17652, AddressDistricts_Change_Changed_County#17661, Residence_Addresses_Density#17606, CommercialData_EstimatedHHIncome#17910, CommercialData_ISPSA#17921, CommercialData_AreaMedianEducationYears#17924, CommercialData_AreaMedianHousingValue#17925, CommercialData_AreaPcntHHMarriedCoupleNoChild#17931, CommercialData_AreaPcntHHMarriedCoupleWithChild#17932, CommercialData_AreaPcntHHSpanishSpeaking#17933, CommercialData_AreaPcntHHWithChildren#17934, CommercialData_StateIncomeDecile#17941, EthnicGroups_EthnicGroup1Desc#17636, CommercialData_DwellingType#17906, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#17629, Voters_BirthDate#17631, Residence_Families_HHCount#17608, Residence_HHGender_Description#17609, Mailing_Families_HHCount#17626, Mailing_HHGender_Description#17627, Parties_Description#17633, CommercialData_PropertyType#17938, AddressDistricts_Change_Changed_CD#17648, AddressDistricts_Change_Changed_SD#17650, AddressDistricts_Change_Changed_HD#17652, AddressDistricts_Change_Changed_County#17661, Residence_Addresses_Density#17606, CommercialData_EstimatedHHIncome#17910, CommercialData_ISPSA#17921, CommercialData_AreaMedianEducationYears#17924, CommercialData_AreaMedianHousingValue#17925, CommercialData_AreaPcntHHMarriedCoupleNoChild#17931, CommercialData_AreaPcntHHMarriedCoupleWithChild#17932, CommercialData_AreaPcntHHSpanishSpeaking#17933, CommercialData_AreaPcntHHWithChildren#17934, CommercialData_StateIncomeDecile#17941, EthnicGroups_EthnicGroup1Desc#17636, CommercialData_DwellingType#17906, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#17564,LALVOTERID#17565,Voters_Active#17566,Voters_StateVoterID#17567,Voters_CountyVoterID#17568,VoterTelephones_LandlineAreaCode#17569,VoterTelephones_Landline7Digit#17570,VoterTelephones_LandlineFormatted#17571,VoterTelephones_LandlineUnformatted#17572,VoterTelephones_LandlineConfidenceCode#17573,VoterTelephones_CellPhoneOnly#17574,VoterTelephones_CellPhoneFormatted#17575,VoterTelephones_CellPhoneUnformatted#17576,VoterTelephones_CellConfidenceCode#17577,Voters_FirstName#17578,Voters_MiddleName#17579,Voters_LastName#17580,Voters_NameSuffix#17581,Residence_Addresses_AddressLine#17582,Residence_Addresses_ExtraAddressLine#17583,Residence_Addresses_City#17584,Residence_Addresses_State#17585,Residence_Addresses_Zip#17586,Residence_Addresses_ZipPlus4#17587,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#19229, Voters_BirthDate#19231, Residence_Families_HHCount#19208, Residence_HHGender_Description#19209, Mailing_Families_HHCount#19226, Mailing_HHGender_Description#19227, Parties_Description#19233, CommercialData_PropertyType#19538, AddressDistricts_Change_Changed_CD#19248, AddressDistricts_Change_Changed_SD#19250, AddressDistricts_Change_Changed_HD#19252, AddressDistricts_Change_Changed_County#19261, Residence_Addresses_Density#19206, CommercialData_EstimatedHHIncome#19510, CommercialData_ISPSA#19521, CommercialData_AreaMedianEducationYears#19524, CommercialData_AreaMedianHousingValue#19525, CommercialData_AreaPcntHHMarriedCoupleNoChild#19531, CommercialData_AreaPcntHHMarriedCoupleWithChild#19532, CommercialData_AreaPcntHHSpanishSpeaking#19533, CommercialData_AreaPcntHHWithChildren#19534, CommercialData_StateIncomeDecile#19541, EthnicGroups_EthnicGroup1Desc#19236, CommercialData_DwellingType#19506, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#19229, Voters_BirthDate#19231, Residence_Families_HHCount#19208, Residence_HHGender_Description#19209, Mailing_Families_HHCount#19226, Mailing_HHGender_Description#19227, Parties_Description#19233, CommercialData_PropertyType#19538, AddressDistricts_Change_Changed_CD#19248, AddressDistricts_Change_Changed_SD#19250, AddressDistricts_Change_Changed_HD#19252, AddressDistricts_Change_Changed_County#19261, Residence_Addresses_Density#19206, CommercialData_EstimatedHHIncome#19510, CommercialData_ISPSA#19521, CommercialData_AreaMedianEducationYears#19524, CommercialData_AreaMedianHousingValue#19525, CommercialData_AreaPcntHHMarriedCoupleNoChild#19531, CommercialData_AreaPcntHHMarriedCoupleWithChild#19532, CommercialData_AreaPcntHHSpanishSpeaking#19533, CommercialData_AreaPcntHHWithChildren#19534, CommercialData_StateIncomeDecile#19541, EthnicGroups_EthnicGroup1Desc#19236, CommercialData_DwellingType#19506, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#19164,LALVOTERID#19165,Voters_Active#19166,Voters_StateVoterID#19167,Voters_CountyVoterID#19168,VoterTelephones_LandlineAreaCode#19169,VoterTelephones_Landline7Digit#19170,VoterTelephones_LandlineFormatted#19171,VoterTelephones_LandlineUnformatted#19172,VoterTelephones_LandlineConfidenceCode#19173,VoterTelephones_CellPhoneOnly#19174,VoterTelephones_CellPhoneFormatted#19175,VoterTelephones_CellPhoneUnformatted#19176,VoterTelephones_CellConfidenceCode#19177,Voters_FirstName#19178,Voters_MiddleName#19179,Voters_LastName#19180,Voters_NameSuffix#19181,Residence_Addresses_AddressLine#19182,Residence_Addresses_ExtraAddressLine#19183,Residence_Addresses_City#19184,Residence_Addresses_State#19185,Residence_Addresses_Zip#19186,Residence_Addresses_ZipPlus4#19187,... 702 more fields] parquet
                                                                                                               :- Project [Voters_Gender#20829, Voters_BirthDate#20831, Residence_Families_HHCount#20808, Residence_HHGender_Description#20809, Mailing_Families_HHCount#20826, Mailing_HHGender_Description#20827, Parties_Description#20833, CommercialData_PropertyType#21138, AddressDistricts_Change_Changed_CD#20848, AddressDistricts_Change_Changed_SD#20850, AddressDistricts_Change_Changed_HD#20852, AddressDistricts_Change_Changed_County#20861, Residence_Addresses_Density#20806, CommercialData_EstimatedHHIncome#21110, CommercialData_ISPSA#21121, CommercialData_AreaMedianEducationYears#21124, CommercialData_AreaMedianHousingValue#21125, CommercialData_AreaPcntHHMarriedCoupleNoChild#21131, CommercialData_AreaPcntHHMarriedCoupleWithChild#21132, CommercialData_AreaPcntHHSpanishSpeaking#21133, CommercialData_AreaPcntHHWithChildren#21134, CommercialData_StateIncomeDecile#21141, EthnicGroups_EthnicGroup1Desc#20836, CommercialData_DwellingType#21106, ... 12 more fields]
                                                                                                               :  +- Project [Voters_Gender#20829, Voters_BirthDate#20831, Residence_Families_HHCount#20808, Residence_HHGender_Description#20809, Mailing_Families_HHCount#20826, Mailing_HHGender_Description#20827, Parties_Description#20833, CommercialData_PropertyType#21138, AddressDistricts_Change_Changed_CD#20848, AddressDistricts_Change_Changed_SD#20850, AddressDistricts_Change_Changed_HD#20852, AddressDistricts_Change_Changed_County#20861, Residence_Addresses_Density#20806, CommercialData_EstimatedHHIncome#21110, CommercialData_ISPSA#21121, CommercialData_AreaMedianEducationYears#21124, CommercialData_AreaMedianHousingValue#21125, CommercialData_AreaPcntHHMarriedCoupleNoChild#21131, CommercialData_AreaPcntHHMarriedCoupleWithChild#21132, CommercialData_AreaPcntHHSpanishSpeaking#21133, CommercialData_AreaPcntHHWithChildren#21134, CommercialData_StateIncomeDecile#21141, EthnicGroups_EthnicGroup1Desc#20836, CommercialData_DwellingType#21106, ... 11 more fields]
                                                                                                               :     +- Relation[SEQUENCE#20764,LALVOTERID#20765,Voters_Active#20766,Voters_StateVoterID#20767,Voters_CountyVoterID#20768,VoterTelephones_LandlineAreaCode#20769,VoterTelephones_Landline7Digit#20770,VoterTelephones_LandlineFormatted#20771,VoterTelephones_LandlineUnformatted#20772,VoterTelephones_LandlineConfidenceCode#20773,VoterTelephones_CellPhoneOnly#20774,VoterTelephones_CellPhoneFormatted#20775,VoterTelephones_CellPhoneUnformatted#20776,VoterTelephones_CellConfidenceCode#20777,Voters_FirstName#20778,Voters_MiddleName#20779,Voters_LastName#20780,Voters_NameSuffix#20781,Residence_Addresses_AddressLine#20782,Residence_Addresses_ExtraAddressLine#20783,Residence_Addresses_City#20784,Residence_Addresses_State#20785,Residence_Addresses_Zip#20786,Residence_Addresses_ZipPlus4#20787,... 702 more fields] parquet
                                                                                                               +- Project [Voters_Gender#22429, Voters_BirthDate#22431, Residence_Families_HHCount#22408, Residence_HHGender_Description#22409, Mailing_Families_HHCount#22426, Mailing_HHGender_Description#22427, Parties_Description#22433, CommercialData_PropertyType#22738, AddressDistricts_Change_Changed_CD#22448, AddressDistricts_Change_Changed_SD#22450, AddressDistricts_Change_Changed_HD#22452, AddressDistricts_Change_Changed_County#22461, Residence_Addresses_Density#22406, CommercialData_EstimatedHHIncome#22710, CommercialData_ISPSA#22721, CommercialData_AreaMedianEducationYears#22724, CommercialData_AreaMedianHousingValue#22725, CommercialData_AreaPcntHHMarriedCoupleNoChild#22731, CommercialData_AreaPcntHHMarriedCoupleWithChild#22732, CommercialData_AreaPcntHHSpanishSpeaking#22733, CommercialData_AreaPcntHHWithChildren#22734, CommercialData_StateIncomeDecile#22741, EthnicGroups_EthnicGroup1Desc#22436, CommercialData_DwellingType#22706, ... 12 more fields]
                                                                                                                  +- Project [Voters_Gender#22429, Voters_BirthDate#22431, Residence_Families_HHCount#22408, Residence_HHGender_Description#22409, Mailing_Families_HHCount#22426, Mailing_HHGender_Description#22427, Parties_Description#22433, CommercialData_PropertyType#22738, AddressDistricts_Change_Changed_CD#22448, AddressDistricts_Change_Changed_SD#22450, AddressDistricts_Change_Changed_HD#22452, AddressDistricts_Change_Changed_County#22461, Residence_Addresses_Density#22406, CommercialData_EstimatedHHIncome#22710, CommercialData_ISPSA#22721, CommercialData_AreaMedianEducationYears#22724, CommercialData_AreaMedianHousingValue#22725, CommercialData_AreaPcntHHMarriedCoupleNoChild#22731, CommercialData_AreaPcntHHMarriedCoupleWithChild#22732, CommercialData_AreaPcntHHSpanishSpeaking#22733, CommercialData_AreaPcntHHWithChildren#22734, CommercialData_StateIncomeDecile#22741, EthnicGroups_EthnicGroup1Desc#22436, CommercialData_DwellingType#22706, ... 11 more fields]
                                                                                                                     +- Relation[SEQUENCE#22364,LALVOTERID#22365,Voters_Active#22366,Voters_StateVoterID#22367,Voters_CountyVoterID#22368,VoterTelephones_LandlineAreaCode#22369,VoterTelephones_Landline7Digit#22370,VoterTelephones_LandlineFormatted#22371,VoterTelephones_LandlineUnformatted#22372,VoterTelephones_LandlineConfidenceCode#22373,VoterTelephones_CellPhoneOnly#22374,VoterTelephones_CellPhoneFormatted#22375,VoterTelephones_CellPhoneUnformatted#22376,VoterTelephones_CellConfidenceCode#22377,Voters_FirstName#22378,Voters_MiddleName#22379,Voters_LastName#22380,Voters_NameSuffix#22381,Residence_Addresses_AddressLine#22382,Residence_Addresses_ExtraAddressLine#22383,Residence_Addresses_City#22384,Residence_Addresses_State#22385,Residence_Addresses_Zip#22386,Residence_Addresses_ZipPlus4#22387,... 702 more fields] parquet


In [None]:
coefsArray = np.array(lrModel.coefficients)  # convert to np.array
coefsDF = pd.DataFrame(coefsArray, columns=['coefs'])  # to pandas

coefsDF = coefsDF.merge(featureCols, left_index=True, right_index=True)  # join it with featureCols we created above
coefsDF.sort_values('coefs', inplace=True)  # Sort them
coefsDF.head()

plt.rcParams["figure.figsize"] = (20,3)

plt.xticks(rotation=90)
plt.bar(coefsDF.name, coefsDF.coefs)
plt.title('Ranked coefficients from the logistic regression model')
plt.show()

df_ref.write.format("parquet").save("total_reference_sample")
df_ref = spark.read.parquet("total_reference_sample")

# Propensity score estimator

In [None]:
num_voters_indiana = indi.count()
num_voters_not_indiana = df_ref.count()
pct_sample = num_voters_indiana / num_voters_not_indiana

# empty list to store the estimated average treatment effects:
ATEs = []

# empty dictionary to store output
stored_DF = {}

# start_time = time.time()

for i in range(1):
    i = i + 1 # from 1 - 100 rather than 0 to 100
    
    print(f"iteration {i}")

    # take random sample of the total parquet file (equivalent to the size of indiana)
    df_ref_sampled = df_ref.sample(True, pct_sample, seed = i)

    # create dummy DATA:
#     indi = indi.withColumn('Voters_Age', rand())
#     df_ref = df_ref.withColumn('Voters_Age', rand())
    
#     indi = indi.select(["Voters_Age"])
#     df_ref = df_ref.select(["Voters_Age"])

#     indi = indi.withColumn('Voters_Age', col('Voters_Age').cast('double'))
#     df_ref = df_ref.withColumn('Voters_Age', col('Voters_Age').cast('double'))

#     indi = indi.withColumn('General_2008_RANDOM', when(rand() > 0.5, 1).otherwise(0))
#     df_ref = df_ref.withColumn('General_2008_RANDOM', when(rand() > 0.5, 1).otherwise(0))

    # create a column with "LAW == 0" for non-Indiana states
    df_ref_sampled = df_ref_sampled.withColumn("LAW", lit(0))

    # create a column with "LAW == 1" for Indiana
    indi = indi.withColumn("LAW", lit(1))

    # union the two together
    df = df_ref_sampled.union(indi)

    cols_excluded_from_regression = [
        'Voters_BirthDate', # removed this, but KEPT the YEAR that the voter turned 18.
        'STATE',
        'STATE_ind',
        'DATE_18',
        'comparator_date_presidential',
        'comparator_date_primary',
        'YEAR_ELIGIBLE_TO_VOTE_PRESIDENTIAL',
        'YEAR_ELIGIBLE_TO_VOTE_PRIMARY',
        'Voted_General_2008'
    ]
    
    df_input_logistic = df.drop(*cols_excluded_from_regression)

    # fit logistic model on the intervention (variable Law)
    nrow_df_input_logistic_start_check = df_input_logistic.count()
#     df_input_logistic = df_input_logistic.fillna(0)
    df_input_logistic.columns
    supervised = RFormula(formula="LAW ~ .")
    fittedRF = supervised.fit(df_input_logistic) # inspect column types
    prepareddf_input_logistic = fittedRF.transform(df_input_logistic) # create feature and label columns
    prepareddf_input_logistic.show(5, truncate = False)
    lr = LogisticRegression(labelCol="label",featuresCol="features")
    print(lr.explainParams())
    lrModel = lr.fit(prepareddf_input_logistic) # train model
    lrModel.transform(prepareddf_input_logistic).select("label", "prediction") # fitted values

    # get a propensity score from the probability as a new column:
    fitted = lrModel.transform(prepareddf_input_logistic)
    fitted = fitted.withColumn('probability', vector_to_array('probability'))
    array_mean = udf(lambda x: float(np.mean(x)), FloatType())
    fitted = fitted.withColumn("propensity_score", array_mean("probability"))
    try:
        assert fitted.count() == nrow_df_input_logistic_start_check
    except:
        Exception("before and after don't have same number of columns")

    # new column that called weight that is T - PS / (PS * 1 - PS)
    fitted = fitted.withColumn("weight", (col("label") - col("propensity_score")) / (col("propensity_score") * (1-col("propensity_score"))))

    # merge back in the 2008 general election OUTCOME data
    fitted = fitted.withColumn("row_id", monotonically_increasing_id())
    df = df.withColumn("row_id", monotonically_increasing_id())
    num_row_df_prior = df.count()
    df = df.join(fitted, ["row_id", "Voters_Age", "LAW"]).drop("row_id")
    
    try:
        assert num_row_df_prior == df.count()
    except:
        Exception("before and after don't have same number of columns")

    # calculate the weighted average
    df = df.withColumn("weighted_outcome", col("General_2008_RANDOM") * col("weight"))

    # store weighted average into list
    ATE_this_round = df.agg(avg(col("weighted_outcome"))).collect()[0][0]
    ATEs.append(ATE_this_round)

    # store the DataFrame into a dictionary
    stored_DF[f"{i}"] = df
    
# end_time = time.time()
# print("Execution time: {:.2f} seconds".format(end_time - start_time))



KeyboardInterrupt: 

In [None]:
df.show(10)

In [None]:
ATEs

In [15]:
range(1)

range(0, 1)

In [None]:
start_time = time.time()
end_time = time.time()
print("Execution time: {:.2f} seconds".format(end_time - start_time))