In [7]:
# county_adjacencies.csv cleaning

df2 = pd.read_csv("data/county_adjacencies.csv")

print(df2.head())
print(df2.isnull().sum())

           County  Population2022   FIPS  District                   N1  \
0        Accomack           33191  51001         2       Northumberland   
1       Albemarle          114534  51003         5  CharlottesvilleCity   
2  AlexandriaCity          155525  51510         8              Fairfax   
3       Alleghany           14835  51005         6                Craig   
4          Amelia           13455  51007         5         PrinceEdward   

           N2         N3             N4           N5        N6      N7  \
0   Lancaster  Middlesex        Mathews  Northampton       NaN     NaN   
1      Nelson    Augusta     Rockingham       Greene    Orange  Louisa   
2   Arlington        NaN            NaN          NaN       NaN     NaN   
3        Bath  Botetourt  CovingtonCity   Rockbridge       NaN     NaN   
4  Cumberland   Powhatan   Chesterfield    Dinwiddie  Nottoway     NaN   

         N8          N9     N10  N11  N12  
0       NaN         NaN     NaN  NaN  NaN  
1  Fluvanna  Buc

In [8]:
# Lots of NaNs for the adjacent neighborhoods

# Fill missing neighbor columns with "None"
neighbor_columns = ['N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10', 'N11', 'N12']
df2[neighbor_columns] = df2[neighbor_columns].fillna("None")

print(df2.isnull().sum())
# save this dataset as a csv
# df2.to_csv("data/clean_county_adjacencies.csv", index=False)

County            0
Population2022    0
FIPS              0
District          0
N1                0
N2                0
N3                0
N4                0
N5                0
N6                0
N7                0
N8                0
N9                0
N10               0
N11               0
N12               0
dtype: int64


In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("data/voting_VA.csv")

print(df.head())
print(df.isnull().sum())

   Unnamed: 0  year     state state_po county_name  county_fips        office  \
0       11161  2000  VIRGINIA       VA    ACCOMACK        51001  US PRESIDENT   
1       11162  2000  VIRGINIA       VA    ACCOMACK        51001  US PRESIDENT   
2       11163  2000  VIRGINIA       VA    ACCOMACK        51001  US PRESIDENT   
3       11164  2000  VIRGINIA       VA    ACCOMACK        51001  US PRESIDENT   
4       11165  2000  VIRGINIA       VA   ALBEMARLE        51003  US PRESIDENT   

        candidate       party  candidatevotes  totalvotes   version   mode  
0         AL GORE    DEMOCRAT            5092       11925  20220315  TOTAL  
1  GEORGE W. BUSH  REPUBLICAN            6352       11925  20220315  TOTAL  
2     RALPH NADER       GREEN             220       11925  20220315  TOTAL  
3           OTHER       OTHER             261       11925  20220315  TOTAL  
4         AL GORE    DEMOCRAT           16255       36846  20220315  TOTAL  
Unnamed: 0        0
year              0
state      

In [2]:
# Remove the 'Unnamed: 0' column
df.drop('Unnamed: 0', axis=1, inplace=True)

# Check unique values for certain columns to ensure consistency
print(df['state'].unique())  # Should only contain "VIRGINIA"
print(df['state_po'].unique())  # Should only contain "VA"
print(df['office'].unique())  # Should only contain "US PRESIDENT"
print(df['mode'].unique())  # Check if any action is needed

['VIRGINIA']
['VA']
['US PRESIDENT']
['TOTAL' 'ABSENTEE' 'ELECTION DAY' 'PROVISIONAL']


In [3]:
df['mode'].head()

0    TOTAL
1    TOTAL
2    TOTAL
3    TOTAL
4    TOTAL
Name: mode, dtype: object

In [4]:
# If we're interested in overall voting results without differentiating between the modes:
# df_total = df[df['mode'] == 'TOTAL']
# df_total.to_csv("data/clean_total_voting_VA.csv", index=False) 

# Otherwise, we can keep all of the modes of voting and use them as features
# in our models
df['mode'] = df['mode'].astype('category')
df['mode'].head()

0    TOTAL
1    TOTAL
2    TOTAL
3    TOTAL
4    TOTAL
Name: mode, dtype: category
Categories (4, object): ['ABSENTEE', 'ELECTION DAY', 'PROVISIONAL', 'TOTAL']

In [5]:
# Provides a dataframe where the modes aggregates all individual records across different voting modes 
# into a single record per candidate per county per election year. 

df_aggregated = df.groupby(['year', 'county_name', 'county_fips', 'candidate', 'party']).agg({
    'candidatevotes': 'sum',
    'totalvotes': 'max'  # Assuming totalvotes is the same across all modes, otherwise sum might be needed
}).reset_index()

df_aggregated.head()

Unnamed: 0,year,county_name,county_fips,candidate,party,candidatevotes,totalvotes
0,2000,ACCOMACK,51001,AL GORE,DEMOCRAT,5092,11925
1,2000,ACCOMACK,51001,GEORGE W. BUSH,REPUBLICAN,6352,11925
2,2000,ACCOMACK,51001,OTHER,OTHER,261,11925
3,2000,ACCOMACK,51001,RALPH NADER,GREEN,220,11925
4,2000,ALBEMARLE,51003,AL GORE,DEMOCRAT,16255,36846


In [6]:
import pandas as pd
#removing candidates column (unimportant)
df_real = df_aggregated.drop('candidate', axis=1)
#removing third parties (also unimportant)
df_real = df_real[~df_real['party'].isin(['OTHER', 'GREEN'])]
#finding net total votes
df_real['net_total'] = df_real.groupby(['year', 'county_name', 'county_fips', 'totalvotes'])['candidatevotes'].transform(lambda x: x.iloc[1] - x.iloc[0])

# merging columns
df_real = df_real.groupby(['year', 'county_name', 'county_fips', 'totalvotes', 'net_total']).agg({'party': ', '.join}).reset_index()
df_real['winner'] = df_real['net_total'].apply(lambda x: 'REPUBLICAN' if x > 0 else 'DEMOCRAT')
#dropping some more columns
df_real = df_real.drop(['party', 'totalvotes'], axis=1)
df_final = df_real[['year', 'county_name', 'county_fips','net_total', 'winner']]
df_final.head()

Unnamed: 0,year,county_name,county_fips,net_total,winner
0,2000,ACCOMACK,51001,1260,REPUBLICAN
1,2000,ALBEMARLE,51003,2036,REPUBLICAN
2,2000,ALEXANDRIA,51510,-14590,DEMOCRAT
3,2000,ALLEGHANY,51005,594,REPUBLICAN
4,2000,AMELIA,51007,1193,REPUBLICAN


In [41]:
df_test = df_final[df_final['year'] == 2008]
df_temp = pd.read_csv("data/county_data/0002_ds191_20125_county_E.csv", low_memory=False)
df_temp.columns = df_temp.iloc[0]
df_temp = df_temp[1:]
df_temp = df_temp.reset_index(drop=True)
df_temp['FIPS'] = df_temp['Public Use Microdata Area Code'].astype(str).apply(lambda x: x.split("US")[-1])
df_test['county_fips'] = df_test['county_fips'].astype(str)
df_merged = pd.merge(df_test, df_temp, left_on='county_fips', right_on='FIPS', how='left')
df_cleaned = df_merged.dropna(axis=1, how='all')

Index([                          1,       'GIS Join Match Code',
                  'Data File Year', 'State Postal Abbreviation',
                     'Region Code',             'Division Code',
                      'State Name',                'State Code',
                     'County Name',               'County Code',
       ...
              'Not in labor force',                     'Total',
                           'Total',                  'Occupied',
                          'Vacant',                     'Total',
                  'Owner occupied',           'Renter occupied',
               'Median gross rent',    'Median value (dollars)'],
      dtype='object', name=0, length=191)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['county_fips'] = df_test['county_fips'].astype(str)


In [53]:
# This is another attempt at getting info from csv files into the dataframe. 
csv_files = ["data/county_data/0002_ds176_20105_county_E.csv", "data/county_data/0002_ds191_20125_county_E.csv", "data/county_data/0002_ds191_20125_county_E.csv", "data/county_data/0002_ds191_20125_county_E.csv", "data/county_data/0002_ds239_20185_county_E.csv", "data/county_data/0002_ds249_20205_county_E.csv"]

# Split df_final by year
df_final_by_year = dict(tuple(df_final.groupby('year')))

# Initialize an empty list to store the updated yearly dataframes
df_updated_list = []

for year, df_year in df_final_by_year.items():
    # Find the appropriate save file
    csv_file = ""
    if year < 2008:
        csv_file = csv_files[0]
    elif year < 2010:
        csv_file = csv_files[1]
    elif year < 2012:
        csv_file = csv_files[2]
    elif year < 2014:
        csv_file = csv_files[3]
    elif year < 2016:
        csv_file = csv_files[4]
    elif year < 2021:
        csv_file = csv_files[5]
    else:
        print(f"No csv file for year {year}")
        continue
    
    if csv_file:
        df_temp = pd.read_csv(csv_file, low_memory=False)
        #rename all of the column names to tbeing the first row, then re-index it
        df_temp.columns = df_temp.iloc[0]
        #find the FIPS code based on the public use microdataarea code
        df_temp['FIPS'] = df_temp['Public Use Microdata Area Code'].astype(str).apply(lambda x: x.split("US")[-1])
        df_temp = df_temp.dropna(axis=1, how='all')
        df_year['county_fips'] = df_year['county_fips'].astype(str)
        df_merged = pd.merge(df_year, df_temp, left_on='county_fips', right_on='FIPS', how='left')
        df_merged = df_merged.reset_index(drop=True)
        df_updated_list.append(df_merged)
    else:
        print("Can't open file")

# Concatenate the updated dataframes
df_updated = pd.concat(df_updated_list, ignore_index=True)
df_updated = df_updated.reset_index(drop=True)
df_updated = df_updated.dropna(axis=1, how='all')
print(df_updated.head())

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [48]:
#decided to drop the 2000 & 2004 data because there is no county info that goes with it. 
df_updated = df_updated[~df_updated['year'].isin([2000, 2004])]
df_updated.head()

Unnamed: 0.1,year,county_name,county_fips,net_total,winner,Unnamed: 0,GISJOIN,YEAR,STUSAB,STATE,...,AMWSE004,AMWSE005,AMWSE006,AMWSE007,AMWSE008,AMWSE009,AMWSE010,AMWSE011,AMWSE012,AMWSE013
268,2008,ACCOMACK,51001,226,REPUBLICAN,2822.0,G5100010,2008-2012,VA,Virginia,...,,,,,,,,,,
269,2008,ALBEMARLE,51003,-9216,DEMOCRAT,2823.0,G5100030,2008-2012,VA,Virginia,...,,,,,,,,,,
270,2008,ALEXANDRIA,51510,-31292,DEMOCRAT,2917.0,G5105100,2008-2012,VA,Virginia,...,,,,,,,,,,
271,2008,ALLEGHANY,51005,162,REPUBLICAN,2824.0,G5100050,2008-2012,VA,Virginia,...,,,,,,,,,,
272,2008,AMELIA,51007,1482,REPUBLICAN,2825.0,G5100070,2008-2012,VA,Virginia,...,,,,,,,,,,


In [None]:
# Neural network time. 