# The following data exploration compares Kingmaker's outputed political sentiment classification datasets against the prior Washington State 3/2024 voting census data. The full dataset for Spokane County is interrogating the Kingmaker sets to get a baseline for accuracy.

In [1]:
#-- Importing the nessesary libraries --#
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Importing the Spokane county reported sentiment from the 3/2024 Washington State election survey. 


In [2]:
#-- Importing the dataset of identified Democrats --#

county = pd.read_csv('/Users/michaelsegaline/Desktop/2024-3 Presidential Matchback/Spokane.csv')

In [3]:
#-- Inspecting the shape of the county dataset --#
county.shape

(134667, 21)

In [4]:
#-- Inspecting the columns and data types --#
county.dtypes

Ballot ID             int64
Voter ID              int64
County               object
First Name           object
Last Name            object
Gender               object
Election             object
Ballot Status        object
Challenge Reason     object
Sent Date            object
Received Date        object
Address              object
City                 object
State                object
Zip                  object
Country              object
Split               float64
Precinct              int64
Return Method        object
Return Location      object
Party                object
dtype: object

In [5]:
county.columns

Index(['Ballot ID', 'Voter ID', 'County', 'First Name', 'Last Name', 'Gender',
       'Election', 'Ballot Status', 'Challenge Reason', 'Sent Date',
       'Received Date', 'Address', 'City', 'State', 'Zip', 'Country', 'Split',
       'Precinct', 'Return Method', 'Return Location', 'Party'],
      dtype='object')

# Take note of the baseline data sparcity % of the county data set.

In [6]:
#-- Now getting a data sparsity percentage per column --# 
#-- Calculate the percentage of missing values per column --#
missing_percentage = county.isnull().mean() * 100

#-- Print the results --#
print(missing_percentage)

Ballot ID            0.000000
Voter ID             0.000000
County               0.000000
First Name           0.000743
Last Name            0.000743
Gender               1.563115
Election             0.000000
Ballot Status        0.000000
Challenge Reason    96.965107
Sent Date            0.000000
Received Date        0.000000
Address              0.000000
City                 0.002970
State                0.166336
Zip                  0.167821
Country              0.000000
Split                0.000000
Precinct             0.000000
Return Method        0.000000
Return Location     63.486229
Party                0.000000
dtype: float64


# Now splitting the dataset into two seperate sets of identified Republicans and Democrats

In [7]:
#-- Step 1: Drop all rows that are not labeled "REP" or "DEM" in the "Party" column --#
lab_filtered = county[county['Party'].isin(['REP', 'DEM'])]

#-- Step 2: Create new datasets containing only rows labeled "REP" --#
dem = lab_filtered[lab_filtered['Party'] == 'DEM']
rep = lab_filtered[lab_filtered['Party']=='REP']

In [8]:
#-- Inpestcing the shape of the democrat dataset --#
dem.shape

(53777, 21)

In [9]:
#-- Inspecting the shape of republican dataset --#
rep.shape

(78294, 21)

# 53,777
Instances of self-identified democrats in Spokane County.

# 78294

Instances of self-identified republicans.

# Exploring Kingmaker sentiment classifications starting with 'Likely Democrat'

In [10]:
#-- Importing the first dataset--#
lkdem_raw = pd.read_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/Kingmakerdata Dems Likely.csv')
lkdem_raw.shape

(57244, 28)

In [11]:
lkdem_raw.columns

Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue'],
      dtype='object')

In [12]:
lkdem_raw.dtypes

lal_voter_id             object
vu_id                    object
user_id                  object
first_name               object
middle_name              object
last_name                object
address                  object
city                     object
city_mailing             object
zipcode                  object
state                    object
phone1                   object
phone2                   object
precinct                 object
turnout_prediction       object
contact_visit            object
contact_call             object
contact_text             object
contact_digital          object
contact_mail             object
support_status           object
tags                     object
contact_visit_count      object
contact_call_count        int64
contact_text_count        int64
contact_digital_count     int64
contact_mail_count        int64
top_issue                object
dtype: object

# Now seperating the Wasington State Voter ID

In [13]:

# Extract the trailing 8 digits and assign to a new column
lkdem_raw['Voter ID'] = lkdem_raw['vu_id'].str[3:]

lkdem_raw.columns


Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue', 'Voter ID'],
      dtype='object')

In [14]:
lkdem_raw['Voter ID']

0        08626082
1        01791479
2        10803120
3        11368047
4        00375322
           ...   
57239    11432717
57240    10492221
57241    09676606
57242    01643415
57243    00732505
Name: Voter ID, Length: 57244, dtype: object

In [15]:
#-- Now getting a data sparsity percentage per column --# 
#-- Calculate the percentage of missing values per column --#
missing_percentage = lkdem_raw.isnull().mean() * 100

#-- Print the results --#
print(missing_percentage)

lal_voter_id              0.000000
vu_id                     0.000000
user_id                   0.000000
first_name                0.000000
middle_name              11.927888
last_name                 0.006988
address                   0.000000
city                      0.031444
city_mailing              4.119209
zipcode                   0.246314
state                     0.008735
phone1                   70.966389
phone2                   63.753406
precinct                  0.006988
turnout_prediction        0.000000
contact_visit            99.262805
contact_call              0.632381
contact_text             97.599748
contact_digital          97.784921
contact_mail              0.632381
support_status           98.960590
tags                     11.547062
contact_visit_count       0.003494
contact_call_count        0.000000
contact_text_count        0.000000
contact_digital_count     0.000000
contact_mail_count        0.000000
top_issue                99.236601
Voter ID            

Now engineering the label column for the democrat likely dataset.

In [16]:
#-- Exploring the columns --#
lkdem = lkdem_raw
lkdem.columns

Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue', 'Voter ID'],
      dtype='object')

In [17]:
lkdem.head()

Unnamed: 0,lal_voter_id,vu_id,user_id,first_name,middle_name,last_name,address,city,city_mailing,zipcode,...,contact_mail,support_status,tags,contact_visit_count,contact_call_count,contact_text_count,contact_digital_count,contact_mail_count,top_issue,Voter ID
0,5307182420,WA008626082,61e9a5a1-5730-4710-85c3-f17248870d0c,Sef,,Magrath,1419 E 7th Ave,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,8626082
1,5306449960,WA001791479,61e9a5a1-5730-4710-85c3-f17248870d0c,Christine,M,Imes,1212 E Celesta Ave,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,1791479
2,5309709496,WA010803120,61e9a5a1-5730-4710-85c3-f17248870d0c,Holly,Anna,Robertson,743 S Scott St Apt 105,SPOKANE,,99202,...,No,,City of Spokane,0,0,0,0,0,,10803120
3,5310056329,WA011368047,61e9a5a1-5730-4710-85c3-f17248870d0c,Joseph,J,Pierce,1204 E Nina Ave,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,11368047
4,5306836456,WA000375322,61e9a5a1-5730-4710-85c3-f17248870d0c,Richard,Ross,Peterson,1122 E 5th Ave,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General;Levy Text 1,0,0,0,0,0,,375322


In [18]:
#-- Exploring the shape of the dataset --#
lkdem.columns

Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue', 'Voter ID'],
      dtype='object')

# Now  comparing the county identified democrats against the Kingmaker "Likely Democrat" classifications.

In [19]:

dem['Voter ID'] = dem['Voter ID'].astype(int)
lkdem['Voter ID'] = lkdem['Voter ID'].astype(int)

# Perform the merge
merged = pd.merge(dem, lkdem, left_on=['Voter ID'], right_on=['Voter ID'])

# Merge datasets on the corresponding columns using the following variables --#
merged = pd.merge(dem,lkdem, left_on=['Voter ID'], right_on=['Voter ID'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem['Voter ID'] = dem['Voter ID'].astype(int)


In [20]:
#-- Inspecting the shape of the merged dataset --#
merged.shape

(27744, 49)

There are 400 "Likely democrats" that successfully matched with the confirmed county democrats.

In [21]:
#-- Now dividing the Kingmaker 'Likely Democrat' prediction against the confirmed county democrats --# 
27744/ 53777

0.5159082879297842

# 0.51 or 51%
The accuracy score of Kingmaker's 'Likely Democrat' classification.

# Exploring Kingmaker's  "Strong Republican" classification accuracy 

In [22]:
#-- Importing the raw Kingmaker Data of Republicans --#

srep = pd.read_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/Kingmakerdata Strong Reps.csv')
srep

  srep = pd.read_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/Kingmakerdata Strong Reps.csv')


Unnamed: 0,lal_voter_id,vu_id,user_id,first_name,middle_name,last_name,address,city,city_mailing,zipcode,...,contact_mail,support_status,tags,contact_visit_count,contact_call_count,contact_text_count,contact_digital_count,contact_mail_count,top_issue,Unnamed: 28
0,5302213820,WA000677706,61e9a5a1-5730-4710-85c3-f17248870d0c,Donald,J,Moody,1224 E Celesta Ave Apt A,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,
1,5303751404,WA000199781,61e9a5a1-5730-4710-85c3-f17248870d0c,Heidi,N,Kolarsky,1212 E Nina Ave,SPOKANE,Spokane,99202,...,No,,City of Spokane;Slavic;Voted 2023 General,0,0,0,0,0,,
2,5310592398,WA011883261,61e9a5a1-5730-4710-85c3-f17248870d0c,Alexander,Jeremy,Brooks,716 S Arthur St Apt 24,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,
3,5302146818,WA001792940,61e9a5a1-5730-4710-85c3-f17248870d0c,John,K,Adolfson,1118 E 8th Ave,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,
4,5307290197,WA009669629,61e9a5a1-5730-4710-85c3-f17248870d0c,Daniel,V,Kovalyov,532 S Garfield St Unit 9,SPOKANE,Spokane,99202,...,No,,City of Spokane;Voted 2023 General,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82653,5309369089,WA010426777,61e9a5a1-5730-4710-85c3-f17248870d0c,Abigail,Jeanine,Barsness,530 W 3rd St,CHENEY,Cheney,99004,...,No,,Voted 2023 General,0.0,0,0,0,0,,
82654,5302267646,WA001268891,61e9a5a1-5730-4710-85c3-f17248870d0c,Regina,D,Ross,206 W 3rd St,CHENEY,Cheney,99004,...,No,,Voted 2023 General,0.0,0,0,0,0,,
82655,5305931535,WA004626024,61e9a5a1-5730-4710-85c3-f17248870d0c,Michael,,Christenson,165 S Alki Ln,CHENEY,Cheney,99004,...,No,,,0.0,0,0,0,0,,
82656,5302086680,WA000696028,61e9a5a1-5730-4710-85c3-f17248870d0c,Brenda,L,Vanmatre,20015 S Cheney Plaza Rd,CHENEY,Cheney,99004,...,No,,Voted 2023 General,0.0,0,0,0,0,,


In [23]:
#-- Now getting a data sparsity percentage per column --# 
#-- Calculate the percentage of missing values per column --#
missing_percentage = srep.isnull().mean() * 100

#-- Print the results --#
print(missing_percentage)

lal_voter_id              0.000000
vu_id                     0.000000
user_id                   0.000000
first_name                0.000000
middle_name              11.848823
last_name                 0.004839
address                   0.000000
city                      0.056861
city_mailing              5.745360
zipcode                   0.114931
state                     0.022986
phone1                   69.649641
phone2                   61.350384
precinct                  0.085896
turnout_prediction        0.001210
contact_visit            96.387524
contact_call              0.597643
contact_text             64.847928
contact_digital          74.375136
contact_mail              0.597643
support_status           95.011977
tags                     17.341334
contact_visit_count       0.015727
contact_call_count        0.001210
contact_text_count        0.000000
contact_digital_count     0.000000
contact_mail_count        0.000000
top_issue                98.175615
Unnamed: 28         

In [47]:
srep.shape

(82658, 30)

In [25]:
# Extract the trailing 8 digits and assign to a new column
srep['Voter ID'] = srep['vu_id'].str[3:]

srep.columns


Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue', 'Unnamed: 28', 'Voter ID'],
      dtype='object')

In [26]:
rep['Voter ID'] = rep['Voter ID'].astype(int)
srep['Voter ID'] = srep['Voter ID'].astype(int)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rep['Voter ID'] = rep['Voter ID'].astype(int)


# Now concatinating and comparing the confirmed county republicans  against Kingmaker's "Strong Republican" classifications.

In [27]:
#-- Merge datasets on the corresponding columns --#
merged2 = pd.merge(rep,srep, left_on=['Voter ID'], right_on=['Voter ID'])

In [28]:
#-- Inpspecting the shape--#
merged2.shape

(40779, 50)

In [29]:
40779 / 78294

0.5208445091577899

# 0.52 or 52% accurate at predicting Strong Republicans

In [30]:
#Emerged.to_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/cleaned kingmaker data/repYes.csv')

# Exploring Kingmaker's "Strong Democrat" classification accuracy

In [31]:
#-- Importing the raw Kingmaker Data of Democrats --#

sdem = pd.read_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/Kingmaker Strong Dem.csv')
sdem

  sdem = pd.read_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/Kingmaker Strong Dem.csv')


Unnamed: 0,lal_voter_id,vu_id,user_id,first_name,middle_name,last_name,address,city,city_mailing,zipcode,...,contact_digital,contact_mail,support_status,tags,contact_visit_count,contact_call_count,contact_text_count,contact_digital_count,contact_mail_count,top_issue
0,5304702442,WA000694308,61e9a5a1-5730-4710-85c3-f17248870d0c,Frances,A,Awbery,11115 E Dean Ave,SPOKANE VALLEY,Spokane Valley,99206,...,,No,,Central Valley SD,0.0,0,0,0,0,
1,5310893682,WA012047102,61e9a5a1-5730-4710-85c3-f17248870d0c,Julia,Camille,Bjerkestrand,432 W Glass Ave,SPOKANE,Spokane,99205,...,,No,,City of Spokane;Voted 2023 General,0.0,0,0,0,0,
2,5311258498,WA011956959,61e9a5a1-5730-4710-85c3-f17248870d0c,Christopher,Ray,Levi,17606 E Mission Ave,SPOKANE VALLEY,Spokane Valley,99016,...,,No,,Central Valley SD,0.0,0,0,0,0,
3,5304463241,WA003117606,61e9a5a1-5730-4710-85c3-f17248870d0c,Sarah,A,Murphy,3410 E 13th Ave,SPOKANE,Spokane,99202,...,,No,,City of Spokane;Voted 2023 General,0.0,0,0,0,0,
4,5308161074,WA008390791,61e9a5a1-5730-4710-85c3-f17248870d0c,Donna,Ann,Fields,21655 E Rockrose Ln,LIBERTY LAKE,,99019,...,,No,,Central Valley SD,0.0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36694,5309633960,WA010699838,61e9a5a1-5730-4710-85c3-f17248870d0c,Cynthia,Louise,Masri,7506 N Panorama Dr,SPOKANE,Spokane,99208.0,...,,No,,City of Spokane;Voted 2023 General,0.0,0,0,0,0,
36695,5302084421,WA005134633,61e9a5a1-5730-4710-85c3-f17248870d0c,Debra,L,Tobler,8617 N Hill N Dale St,SPOKANE,Spokane,99208.0,...,,No,,City of Spokane;Voted 2023 General;Levy Text 1,0.0,0,0,0,0,
36696,5309734672,WA010799067,61e9a5a1-5730-4710-85c3-f17248870d0c,Garrett,P,Stenehjem,1927 E 15th Ave,SPOKANE,Spokane,99203.0,...,,No,,City of Spokane;Voted 2023 General,0.0,0,0,0,0,
36697,5304700983,WA000250504,61e9a5a1-5730-4710-85c3-f17248870d0c,Roberta,R,Mackin,7007 N Wiscomb St Apt 210,SPOKANE,Spokane,99208.0,...,,No,,City of Spokane,0.0,0,0,0,0,


# Again, notice how the sparisty % of the Kingmaker's classified democrats, is the approximent ratio to the county's variable sparsity percentage 

In [32]:
#-- Now getting a data sparsity percentage per column --# 
#-- Calculate the percentage of missing values per column --#
missing_percentage = sdem.isnull().mean() * 100

#-- Print the results --#
print(missing_percentage)

lal_voter_id              0.000000
vu_id                     0.000000
user_id                   0.000000
first_name                0.000000
middle_name              11.468977
last_name                 0.008175
address                   0.000000
city                      0.049048
city_mailing              6.425243
zipcode                   0.253413
state                     0.002725
phone1                   69.132674
phone2                   68.898335
precinct                  0.000000
turnout_prediction        0.000000
contact_visit            99.457751
contact_call              0.673043
contact_text             99.700264
contact_digital          99.307883
contact_mail              0.673043
support_status           99.215237
tags                     11.842285
contact_visit_count       0.002725
contact_call_count        0.000000
contact_text_count        0.000000
contact_digital_count     0.000000
contact_mail_count        0.000000
top_issue                99.411428
dtype: float64


In [46]:
#-- Inspecting the shape of the dataset --#
sdem.shape

(36699, 29)

In [33]:
# Extract the trailing 8 digits and assign to a new column
sdem['Voter ID'] = sdem['vu_id'].str[3:]

sdem.columns


Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue', 'Voter ID'],
      dtype='object')

In [34]:
 dem['Voter ID'] = dem['Voter ID'].astype(int)
sdem['Voter ID'] = sdem['Voter ID'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem['Voter ID'] = dem['Voter ID'].astype(int)


# Now concatinating and comparing the county identified democrats against the Kingmaker "Strong Democrat" classifications.

In [35]:
# Merge datasets on the corresponding columns
merged3 = pd.merge(dem,sdem, left_on=['Voter ID'], right_on=['Voter ID'])

There are 534 "Likely democrats" that successfully matched with the confirmed county democrats.

In [36]:
merged3.shape

(17070, 49)

In [37]:
17070 / 53777

0.3174219461851721

# 0.31 or 31%
Kingmaker's classification accuracy to predict "Strong Democrat".

# Now exploring Kingmaker's "Likely  Republican" classification.

In [38]:
#-- Importing the raw Kingmaker Data of Republicans --#

lkrep = pd.read_csv('/Users/michaelsegaline/Desktop/Washington GOP/King Maker Sentiment Data/Kingmakerdata Possible Reps.csv')
lkrep

Unnamed: 0,lal_voter_id,vu_id,user_id,first_name,middle_name,last_name,address,city,city_mailing,zipcode,...,contact_digital,contact_mail,support_status,tags,contact_visit_count,contact_call_count,contact_text_count,contact_digital_count,contact_mail_count,top_issue
0,5307260047,WA008801013,61e9a5a1-5730-4710-85c3-f17248870d0c,Lev,Nikolay,Starovoytov,1111 E 9th Ave,SPOKANE,Spokane,99202,...,,No,,City of Spokane,0.0,0,0,0,0,
1,5310943454,WA012083785,61e9a5a1-5730-4710-85c3-f17248870d0c,Joshua,Lloyd,Miller,816 E 8th Ave,SPOKANE,,99202,...,,No,,City of Spokane,0.0,0,0,0,0,
2,5308140474,WA009604273,61e9a5a1-5730-4710-85c3-f17248870d0c,Tony,,Swisher,926 E 8th Ave Apt 207,SPOKANE,Spokane,99202,...,,No,,City of Spokane;Voted 2023 General,0.0,0,0,0,0,
3,5307342773,WA008615032,61e9a5a1-5730-4710-85c3-f17248870d0c,James,Lawrence,Munroe,1203 E 14th Ave,SPOKANE,Spokane,99202,...,,No,,City of Spokane,0.0,0,0,0,0,
4,5305912398,WA004677131,61e9a5a1-5730-4710-85c3-f17248870d0c,Jeremiah,Joseph,Supon,1018 E 13th Ave,SPOKANE,Spokane,99202,...,,No,,City of Spokane;Voted 2023 General;Levy Text 1,0.0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10772,5304018422,WA000021161,61e9a5a1-5730-4710-85c3-f17248870d0c,Justin,Alan,Ulbright,354 Nolan Brown Pl,CHENEY,Cheney,99004,...,,No,,Voted 2023 General,0.0,0,0,0,0,
10773,5304670392,WA000387240,61e9a5a1-5730-4710-85c3-f17248870d0c,Joshua,Cole,Alent,449 N 4th St,CHENEY,Cheney,99004,...,Yes,No,,Voted 2023 General,0.0,0,0,0,0,
10774,5303735507,WA000210557,61e9a5a1-5730-4710-85c3-f17248870d0c,Christopher,Scott,Hoppe,403 Simpson Pkwy,CHENEY,Cheney,99004,...,,No,,Voted 2023 General,0.0,0,0,0,0,
10775,5302084123,WA001512720,61e9a5a1-5730-4710-85c3-f17248870d0c,Gavin,,Peterson,718 Lincoln St,CHENEY,Cheney,99004,...,,No,,Voted 2023 General,0.0,0,0,0,0,


In [39]:
lkrep.shape

(10777, 28)

# Again, notice how the column sparisty % ratios match the county.

In [40]:
#-- Now getting a data sparsity percentage per column --# 
#-- Calculate the percentage of missing values per column --#
missing_percentage = lkrep.isnull().mean() * 100

#-- Print the results --#
print(missing_percentage)

lal_voter_id              0.000000
vu_id                     0.000000
user_id                   0.000000
first_name                0.000000
middle_name               9.186230
last_name                 0.018558
address                   0.000000
city                      0.083511
city_mailing              9.937831
zipcode                   0.176301
state                     0.018558
phone1                   59.831122
phone2                   77.795305
precinct                  0.027837
turnout_prediction        0.000000
contact_visit            96.872970
contact_call              0.612415
contact_text             74.705391
contact_digital          84.225666
contact_mail              0.612415
support_status           95.768767
tags                     26.797810
contact_visit_count       0.018558
contact_call_count        0.000000
contact_text_count        0.000000
contact_digital_count     0.000000
contact_mail_count        0.000000
top_issue                98.626705
dtype: float64


In [41]:
#-- Extracting the trailing 8 digits and assign to a new column --#
lkrep['Voter ID'] = lkrep['vu_id'].str[3:]

lkrep.columns

Index(['lal_voter_id', 'vu_id', 'user_id', 'first_name', 'middle_name',
       'last_name', 'address', 'city', 'city_mailing', 'zipcode', 'state',
       'phone1', 'phone2', 'precinct', 'turnout_prediction', 'contact_visit',
       'contact_call', 'contact_text', 'contact_digital', 'contact_mail',
       'support_status', 'tags', 'contact_visit_count', 'contact_call_count',
       'contact_text_count', 'contact_digital_count', 'contact_mail_count',
       'top_issue', 'Voter ID'],
      dtype='object')

In [42]:
 #dem['Voter ID'] = dem['Voter ID'].astype(int)
lkrep['Voter ID'] = lkrep['Voter ID'].astype(int)

# Now merging and comparing the comfirmed county repubilicans against the Kingmaker "Likely Republican" classifications.

In [43]:
#-- Merge datasets on the corresponding columns --#
merged4 = pd.merge(rep,lkrep, left_on=['Voter ID'], right_on=['Voter ID'])

There are 4125 "Likely Republicans" that successfully matched with the confirmed county democrats.

In [44]:
merged3.shape

(17070, 49)

In [45]:
17070 / 78294

0.21802436968350064

# 0.21 or 21% 

Kingmaker accuracy at classifying "Likely Republican".