## Cleaning
This file does three things, in this order:
- Cleans the csvs that were created in the data_collection/ folder.
- Cleans three csvs downloaded from the US Department of Health and Human Services website, on income distribution, education, and age in Virginia.
- Continuously merges these new DataFrames into one cohesive DataFrame compiling sociographic and demographic statistics.

In [None]:
import pandas as pd
from IPython.core.display import HTML

va_election_2020_original_df = pd.read_csv('../csv_collection/virginia_2020_election.csv')
va_pop_hist_original_df = pd.read_csv('../csv_collection/county_population_hist_info.csv')

display(va_election_2020_original_df.head(3))
display(va_pop_hist_original_df.head(3))

Unnamed: 0,County/City,Joe Biden Democratic,Joe Biden Democratic.1,Donald Trump Republican,Donald Trump Republican.1,Various candidates Other parties,Various candidates Other parties.1,Margin,Margin.1,Total
0,County/City,#,%,#,%,#,%,#,%,Total
1,Accomack,7578,44.68%,9172,54.07%,212,1.25%,-1594,-9.39%,16962
2,Albemarle,42466,65.68%,20804,32.18%,1387,2.14%,21662,33.50%,64657


Unnamed: 0,County,FIPS code[5],County seat[6][7],Est.[6],Origin,Etymology,Population[8],Area[6],Map
0,Accomack County,1,Accomac,1663,Accomac Shire was established in 1634 as one o...,"From the Native American word Accawmack, meani...",33411,"455 sq mi (1,178 km2)",
1,Albemarle County,3,Charlottesville,1744,"In 1744, the Virginia General Assembly created...","Willem Anne van Keppel, 2nd Earl of Albemarle,...",117313,"723 sq mi (1,873 km2)",
2,Alleghany County,5,Covington,1822,Formed from parts of Bath and Botetourt counti...,Alleghany Mountains,14632,"446 sq mi (1,155 km2)",


In [2]:
# Preserve the original DataFrame:
va_election_2020_df = va_election_2020_original_df

### Cleaning the Virginia History and Population CSV:

In [3]:
display(va_pop_hist_original_df.head(3))

Unnamed: 0,County,FIPS code[5],County seat[6][7],Est.[6],Origin,Etymology,Population[8],Area[6],Map
0,Accomack County,1,Accomac,1663,Accomac Shire was established in 1634 as one o...,"From the Native American word Accawmack, meani...",33411,"455 sq mi (1,178 km2)",
1,Albemarle County,3,Charlottesville,1744,"In 1744, the Virginia General Assembly created...","Willem Anne van Keppel, 2nd Earl of Albemarle,...",117313,"723 sq mi (1,873 km2)",
2,Alleghany County,5,Covington,1822,Formed from parts of Bath and Botetourt counti...,Alleghany Mountains,14632,"446 sq mi (1,155 km2)",


In [None]:
# Changing the Area[6] column to only include a number:

# Create function to clean the column:
def get_area(column_value):
    if isinstance(column_value, str):
        clean_value = column_value.replace("\xa0", " ")
        new_value = clean_value.split(" ")
        return int(new_value[0])
    return None

# Apply the function:
va_pop_hist_original_df['Area[6]'] = va_pop_hist_original_df['Area[6]'].apply(get_area)

display(va_pop_hist_original_df.head(3))

Unnamed: 0,County,FIPS code[5],County seat[6][7],Est.[6],Origin,Etymology,Population[8],Area[6],Map
0,Accomack County,1,Accomac,1663,Accomac Shire was established in 1634 as one o...,"From the Native American word Accawmack, meani...",33411,455,
1,Albemarle County,3,Charlottesville,1744,"In 1744, the Virginia General Assembly created...","Willem Anne van Keppel, 2nd Earl of Albemarle,...",117313,723,
2,Alleghany County,5,Covington,1822,Formed from parts of Bath and Botetourt counti...,Alleghany Mountains,14632,446,


In [5]:
# Renaming a column for accuracy:
va_pop_hist_original_df = va_pop_hist_original_df.rename(columns={
    'County': 'city/county'
})

display(va_pop_hist_original_df.head(3))

Unnamed: 0,city/county,FIPS code[5],County seat[6][7],Est.[6],Origin,Etymology,Population[8],Area[6],Map
0,Accomack County,1,Accomac,1663,Accomac Shire was established in 1634 as one o...,"From the Native American word Accawmack, meani...",33411,455,
1,Albemarle County,3,Charlottesville,1744,"In 1744, the Virginia General Assembly created...","Willem Anne van Keppel, 2nd Earl of Albemarle,...",117313,723,
2,Alleghany County,5,Covington,1822,Formed from parts of Bath and Botetourt counti...,Alleghany Mountains,14632,446,


In [6]:
# Get population density information :
va_pop_hist_original_df['pop_density_sqmi'] = va_pop_hist_original_df['Population[8]'] / va_pop_hist_original_df['Area[6]']

display(va_pop_hist_original_df.head(3))

Unnamed: 0,city/county,FIPS code[5],County seat[6][7],Est.[6],Origin,Etymology,Population[8],Area[6],Map,pop_density_sqmi
0,Accomack County,1,Accomac,1663,Accomac Shire was established in 1634 as one o...,"From the Native American word Accawmack, meani...",33411,455,,73.430769
1,Albemarle County,3,Charlottesville,1744,"In 1744, the Virginia General Assembly created...","Willem Anne van Keppel, 2nd Earl of Albemarle,...",117313,723,,162.258645
2,Alleghany County,5,Covington,1822,Formed from parts of Bath and Botetourt counti...,Alleghany Mountains,14632,446,,32.807175


### Cleaning the va_election_2020_df

In [7]:
# Renaming columns to snake_case:
va_election_2020_original_df.rename(columns={
    'County/City': 'city/county',
    'Joe Biden Democratic': 'biden_votes',
    'Joe Biden Democratic.1': 'biden_%',
    'Donald Trump Republican': 'trump_votes',
    'Donald Trump Republican.1': 'trump_%',
    'Various candidates Other parties': 'other_party_votes',
    'Various candidates Other parties.1': 'other_party_%',
    'Margin': 'margin_votes',
    'Margin.1': 'margin_%',
    'Total': 'total_votes'
}, inplace=True)

va_election_2020_df = va_election_2020_original_df

In [8]:
display(va_election_2020_df.head(2))

Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes
0,County/City,#,%,#,%,#,%,#,%,Total
1,Accomack,7578,44.68%,9172,54.07%,212,1.25%,-1594,-9.39%,16962


In [None]:
# This cell cleans the va_election_2020_df.

# Remove the needless first row:
va_election_2020_df = va_election_2020_df[1:]

# Create a function to remove the % sign form the rows and apply it:
def clean_percent(columns_value):
    cleaned_value = columns_value.split('%')
    my_value = cleaned_value[0]
    return my_value

va_election_2020_df['biden_%'] = va_election_2020_df['biden_%'].apply(clean_percent)
va_election_2020_df['trump_%'] = va_election_2020_df['trump_%'].apply(clean_percent)
va_election_2020_df['other_party_%'] = va_election_2020_df['other_party_%'].apply(clean_percent)
va_election_2020_df['margin_%'] = va_election_2020_df['margin_%'].apply(clean_percent)

# Turn all these columns to numbers:
va_election_2020_df['biden_%'] = pd.to_numeric(va_election_2020_df['biden_%'], errors='coerce')
va_election_2020_df['trump_%'] = pd.to_numeric(va_election_2020_df['trump_%'], errors='coerce')
va_election_2020_df['other_party_%'] = pd.to_numeric(va_election_2020_df['other_party_%'], errors='coerce')
va_election_2020_df['margin_%'] = pd.to_numeric(va_election_2020_df['margin_%'], errors='coerce')
va_election_2020_df['biden_votes'] = pd.to_numeric(va_election_2020_df['biden_votes'], errors='coerce')
va_election_2020_df['trump_votes'] = pd.to_numeric(va_election_2020_df['trump_votes'], errors='coerce')
va_election_2020_df['other_party_votes'] = pd.to_numeric(va_election_2020_df['other_party_votes'], errors='coerce')
va_election_2020_df['margin_votes'] = pd.to_numeric(va_election_2020_df['margin_votes'], errors='coerce')
va_election_2020_df['total_votes'] = pd.to_numeric(va_election_2020_df['total_votes'], errors='coerce')


In [10]:
display(va_election_2020_df.head(3))

Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes
1,Accomack,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962
2,Albemarle,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657
3,Alexandria,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508


There is an inconsitency with how counties and cities are named in different csvs. So I need to standard this, by adding the word County or City to the locality.

In [None]:
# This cell will map the independent cities to their row, if the string is in the value.

# List of independent cities in Virginia (taken from an image, and generated with ChatGPT):
independent_cities = {
    "Alexandria", "Bristol", "Buena Vista", "Charlottesville", "Chesapeake",
    "Colonial Heights", "Covington", "Danville", "Emporia", "Fairfax",
    "Falls Church", "Franklin", "Fredericksburg", "Galax", "Hampton",
    "Harrisonburg", "Hopewell", "Lexington", "Lynchburg", "Manassas",
    "Manassas Park", "Martinsville", "Newport News", "Norfolk", "Norton",
    "Petersburg", "Poquoson", "Portsmouth", "Radford", "Richmond",
    "Roanoke", "Salem", "Staunton", "Suffolk", "Virginia Beach",
    "Waynesboro", "Williamsburg", "Winchester"
}

# This function will add the correct ending to each locality:
def standardize_locality_name(locality):
    locality = locality.strip()

    if locality in independent_cities and not locality.endswith("City"):
        return locality + " City"
    elif locality not in independent_cities and not locality.endswith("County"):
        return locality + " County"

    return locality

# Apply function to the DataFrame
va_election_2020_df['city/county'] = va_election_2020_df['city/county'].apply(standardize_locality_name)

# Display results
display(va_election_2020_df.head(3))
print(va_election_2020_df.info())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  va_election_2020_df['city/county'] = va_election_2020_df['city/county'].apply(standardize_locality_name)


Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes
1,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962
2,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657
3,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 1 to 134
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   city/county        134 non-null    object 
 1   biden_votes        134 non-null    int64  
 2   biden_%            134 non-null    float64
 3   trump_votes        134 non-null    int64  
 4   trump_%            134 non-null    float64
 5   other_party_votes  134 non-null    int64  
 6   other_party_%      134 non-null    float64
 7   margin_votes       134 non-null    int64  
 8   margin_%           134 non-null    float64
 9   total_votes        134 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 10.6+ KB
None


### Merging dataframes to create one large one with the pertinent information

In [12]:
# Left merging on the va_elections_2020_df, as it has the most important info. And bringing in the population history DataFrame.
columns_to_keep = ['city/county', 'Population[8]', 'Area[6]', 'pop_density_sqmi']

merged_df = pd.merge(
    va_election_2020_df,
    va_pop_hist_original_df[columns_to_keep],
    how='left',
    on='city/county'
)

In [13]:
merged_df = merged_df.rename(columns={
    'Population[8]': 'population',
    'Area[6]': 'area'
})

In [14]:
# Creating a red/blue column based on 2020 voting outcomes:
def affiliation(row):
    if row['biden_votes'] > row['trump_votes']:
        return 'blue'
    else:
        return 'red'

merged_df['affiliation_2020'] = merged_df.apply(affiliation, axis=1)

display(merged_df.head(3))

Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes,population,area,pop_density_sqmi,affiliation_2020
0,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962,33411.0,455.0,73.430769,red
1,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657,117313.0,723.0,162.258645,blue
2,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508,,,,blue


### Cleaning the income distribution csv

In [None]:
income_df = pd.read_csv('../csv_collection/HDPulse_data_export.csv')
display(income_df.head(3))

Unnamed: 0,County,FIPS,Value (Dollars),Rank within US (of 3141 counties)
0,United States,0,78538,
1,Virginia,51000,90974,12 of 52
2,Norton City,51720,38497,3071


In [None]:
# income_df = pd.read_csv('../csv_collection/HDPulse_data_export.csv')

income_df = income_df[:-4]

def remove_comma(columns_value):
    new_value = columns_value.split(',')
    my_value = new_value[0] + new_value[1]
    return my_value

income_df['Value (Dollars)'] = income_df['Value (Dollars)'].apply(remove_comma)

income_df['Value (Dollars)'] = pd.to_numeric(income_df['Value (Dollars)'], errors='coerce')


display(income_df.head(3))
income_df.info()

Unnamed: 0,County,FIPS,Value (Dollars),Rank within US (of 3141 counties)
0,United States,0,78538,
1,Virginia,51000,90974,12 of 52
2,Norton City,51720,38497,3071


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   County                             135 non-null    object
 1   FIPS                               135 non-null    int64 
 2   Value (Dollars)                    135 non-null    int64 
 3   Rank within US (of 3141 counties)  134 non-null    object
dtypes: int64(2), object(2)
memory usage: 4.3+ KB


In [17]:
income_df = income_df.rename(columns={
    'County': 'city/county',
})

In [None]:
# remember that value dollars is median family income
merged_df_2 = pd.merge(
    merged_df,
    income_df[['city/county', 'Value (Dollars)']],
    on='city/county',
    how='left'
)
display(merged_df_2.head(3))

Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes,population,area,pop_density_sqmi,affiliation_2020,Value (Dollars)
0,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962,33411.0,455.0,73.430769,red,57500.0
1,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657,117313.0,723.0,162.258645,blue,102617.0
2,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508,,,,blue,113638.0


### Cleaning the education DataFrame

In [19]:
education_df = pd.read_csv('../csv_collection/HDPulse_data_education.csv')
display(education_df.head(3))

Unnamed: 0,County,Value (Percent),People (Education: At Least Bachelor's Degree)
0,United States,35.0,79954302
1,Virginia,41.5,2471630
2,Covington City,9.6,367


In [None]:
education_df = pd.read_csv('../csv_collection/HDPulse_data_education.csv')

education_df = education_df.rename(columns={
    'County': 'city/county'
})

display(education_df.head(3))
print(education_df['city/county'].value_counts().head(3))

Unnamed: 0,city/county,Value (Percent),People (Education: At Least Bachelor's Degree)
0,United States,35.0,79954302
1,Virginia,41.5,2471630
2,Covington City,9.6,367


city/county
United States      1
Salem City         1
Highland County    1
Name: count, dtype: int64


In [21]:
# remember value_percent is percent with a bacholers or higher
merged_df_3 = pd.merge(
    merged_df_2,
    education_df[['city/county', 'Value (Percent)']],
    on='city/county',
    how='left'
)

display(merged_df_3.head(3))

Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes,population,area,pop_density_sqmi,affiliation_2020,Value (Dollars),Value (Percent)
0,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962,33411.0,455.0,73.430769,red,57500.0,21.8
1,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657,117313.0,723.0,162.258645,blue,102617.0,60.6
2,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508,,,,blue,113638.0,65.8


In [22]:
print(merged_df_3['city/county'].value_counts().head(10))

city/county
Accomack County         1
Newport News City       1
Prince Edward County    1
Powhatan County         1
Portsmouth City         1
Poquoson City           1
Pittsylvania County     1
Petersburg City         1
Patrick County          1
Page County             1
Name: count, dtype: int64


### Cleaning the age DataFrame

In [23]:
age_df = pd.read_csv('../csv_collection/HDPulse_data_ages.csv')
display(age_df.head(3))

Unnamed: 0,County,Value (Percent)
0,Virginia,35.7
1,United States,35.9
2,Radford City,21.8


In [None]:
age_df = age_df.rename(columns={
    'County': 'city/county',
    'Value (Percent)': 'percent_over_50'
})
display(age_df.head(3))
print(age_df['city/county'].value_counts())

Unnamed: 0,city/county,percent_over_50
0,Virginia,35.7
1,United States,35.9
2,Radford City,21.8


city/county
Virginia                   1
Lee County                 1
Bedford City and County    1
Pulaski County             1
Russell County             1
                          ..
Martinsville City          1
Culpeper County            1
Roanoke City               1
Henrico County             1
Lancaster County           1
Name: count, Length: 135, dtype: int64


In [25]:
# Bring in the age info:
merged_df_4 = pd.merge(
    merged_df_3,
    age_df[['city/county', 'percent_over_50']],
    on='city/county',
    how='left'
)
display(merged_df_4.head(3))


Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes,population,area,pop_density_sqmi,affiliation_2020,Value (Dollars),Value (Percent),percent_over_50
0,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962,33411.0,455.0,73.430769,red,57500.0,21.8,47.3
1,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657,117313.0,723.0,162.258645,blue,102617.0,60.6,38.3
2,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508,,,,blue,113638.0,65.8,30.2


In [26]:
merged_df_4 = merged_df_4.rename(columns={
    'Value (Dollars)': 'median_household_income',
    'Value (Percent)': 'bachelors_or_over_%',
    'percent_over_50': 'age_over_50_%'
})
print(merged_df_4['city/county'].value_counts())

display(merged_df_4.head(3))

city/county
Accomack County         1
Newport News City       1
Prince Edward County    1
Powhatan County         1
Portsmouth City         1
                       ..
Falls Church City       1
Fairfax City County     1
Fairfax City            1
Essex County            1
Totals County           1
Name: count, Length: 134, dtype: int64


Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes,population,area,pop_density_sqmi,affiliation_2020,median_household_income,bachelors_or_over_%,age_over_50_%
0,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962,33411.0,455.0,73.430769,red,57500.0,21.8,47.3
1,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657,117313.0,723.0,162.258645,blue,102617.0,60.6,38.3
2,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508,,,,blue,113638.0,65.8,30.2


In [27]:
# Ensuring the final DataFrame is what I need it to be:
print(merged_df_4.info())
display(HTML(merged_df_4.to_html()))
print(merged_df_4['city/county'].value_counts().head(3))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city/county              134 non-null    object 
 1   biden_votes              134 non-null    int64  
 2   biden_%                  134 non-null    float64
 3   trump_votes              134 non-null    int64  
 4   trump_%                  134 non-null    float64
 5   other_party_votes        134 non-null    int64  
 6   other_party_%            134 non-null    float64
 7   margin_votes             134 non-null    int64  
 8   margin_%                 134 non-null    float64
 9   total_votes              134 non-null    int64  
 10  population               91 non-null     float64
 11  area                     91 non-null     float64
 12  pop_density_sqmi         91 non-null     float64
 13  affiliation_2020         134 non-null    object 
 14  median_household_income  1

Unnamed: 0,city/county,biden_votes,biden_%,trump_votes,trump_%,other_party_votes,other_party_%,margin_votes,margin_%,total_votes,population,area,pop_density_sqmi,affiliation_2020,median_household_income,bachelors_or_over_%,age_over_50_%
0,Accomack County,7578,44.68,9172,54.07,212,1.25,-1594,-9.39,16962,33411.0,455.0,73.430769,red,57500.0,21.8,47.3
1,Albemarle County,42466,65.68,20804,32.18,1387,2.14,21662,33.5,64657,117313.0,723.0,162.258645,blue,102617.0,60.6,38.3
2,Alexandria City,66240,80.28,14544,17.63,1724,2.09,51696,62.65,82508,,,,blue,113638.0,65.8,30.2
3,Alleghany County,2243,27.34,5859,71.43,101,1.23,-3616,-44.09,8203,14632.0,446.0,32.807175,red,,,
4,Amelia County,2411,30.55,5390,68.29,92,1.16,-2979,-37.74,7893,13716.0,357.0,38.420168,red,66339.0,16.8,43.4
5,Amherst County,5672,33.35,11041,64.93,292,1.72,-5369,-31.58,17005,31650.0,475.0,66.631579,red,67298.0,23.0,42.9
6,Appomattox County,2418,26.09,6702,72.31,148,1.6,-4284,-46.22,9268,16957.0,334.0,50.769461,red,62337.0,20.5,41.3
7,Arlington County,105344,80.6,22318,17.08,3037,2.32,83026,63.52,130699,239807.0,26.0,9223.346154,blue,140160.0,77.1,27.8
8,Augusta County,10840,25.64,30714,72.65,724,1.71,-19874,-47.01,42278,78622.0,971.0,80.970134,red,79972.0,23.3,43.9
9,Bath County,646,25.83,1834,73.33,21,0.84,-1188,-47.5,2501,4071.0,532.0,7.652256,red,61709.0,15.6,52.8


city/county
Accomack County         1
Newport News City       1
Prince Edward County    1
Name: count, dtype: int64


In [28]:
# Looks good, adding it as a csv:
merged_df_4.to_csv('../csv_collection/cleaned_counties.csv', index=False)