In [1]:
## All data retrieved on 04/11/24:
## https://redistrictingdatahub.org/dataset/south-carolina-block-pl-94171-2020-by-table/
## https://redistrictingdatahub.org/dataset/vest-2020-south-carolina-precinct-and-election-results/
## https://redistrictingdatahub.org/dataset/2022-south-carolina-congressional-districts-approved-plan/

In [2]:
import os
import time

import geopandas as gpd
import maup
import pandas as pd

# Turn on maup's progress bars so long-running maup operations report progress.
maup.progress.enabled = True

In [3]:
## Census block layer, table P2: total population with Hispanic /
## non-Hispanic breakdown. The load is timed because the file is large.
start_time = time.time()
population_df = gpd.read_file("./sc_pl2020_b/sc_pl2020_p2_b.shp")
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("The time to import sc_pl2020_p2_b.shp is:", elapsed_minutes, "mins")

The time to import sc_pl2020_p2_b.shp is: 2.4074586510658262 mins


In [4]:
## Census block layer, table P4: voting age population (VAP) with Hispanic /
## non-Hispanic breakdown. The load is timed because the file is large.
start_time = time.time()
vap_df = gpd.read_file("./sc_pl2020_b/sc_pl2020_p4_b.shp")
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("The time to import sc_pl2020_p4_b.shp is:", elapsed_minutes, "mins")

The time to import sc_pl2020_p4_b.shp is: 2.387296466032664 mins


In [5]:
## Precinct layer carrying the 2020 presidential election results.
start_time = time.time()
election_df = gpd.read_file("./sc_vest_20/sc_vest_20.shp")
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("The time to import sc_vest_20.shp is:", elapsed_minutes, "mins")

The time to import sc_vest_20.shp is: 0.01868724822998047 mins


In [6]:
## Congressional district boundaries (one row per district).
start_time = time.time()
cong_df = gpd.read_file("./sc_cong_adopted_2022/S865_Congress.shp")
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("The time to import Congress.shp is:", elapsed_minutes, "mins")

The time to import Congress.shp is: 0.006427168846130371 mins


In [7]:
# (rows, columns) of the district layer; the row count is the number of districts.
cong_dimensions = cong_df.shape
print("Dimensions of the congressional file:", cong_dimensions)

Dimensions of the congressional file: (7, 19)


In [8]:
# Show the column index of every loaded layer, each under its own header.
labelled_frames = [
    ("Columns of population df:\n", population_df),
    ("Columns of vap df:\n", vap_df),
    ("Columns of election df:\n", election_df),
    ("Columns of congressional df:\n", cong_df),
]
for header, frame in labelled_frames:
    print(header, frame.columns)

Columns of population df:
 Index(['GEOID20', 'SUMLEV', 'LOGRECNO', 'GEOID', 'COUNTY', 'P0020001',
       'P0020002', 'P0020003', 'P0020004', 'P0020005', 'P0020006', 'P0020007',
       'P0020008', 'P0020009', 'P0020010', 'P0020011', 'P0020012', 'P0020013',
       'P0020014', 'P0020015', 'P0020016', 'P0020017', 'P0020018', 'P0020019',
       'P0020020', 'P0020021', 'P0020022', 'P0020023', 'P0020024', 'P0020025',
       'P0020026', 'P0020027', 'P0020028', 'P0020029', 'P0020030', 'P0020031',
       'P0020032', 'P0020033', 'P0020034', 'P0020035', 'P0020036', 'P0020037',
       'P0020038', 'P0020039', 'P0020040', 'P0020041', 'P0020042', 'P0020043',
       'P0020044', 'P0020045', 'P0020046', 'P0020047', 'P0020048', 'P0020049',
       'P0020050', 'P0020051', 'P0020052', 'P0020053', 'P0020054', 'P0020055',
       'P0020056', 'P0020057', 'P0020058', 'P0020059', 'P0020060', 'P0020061',
       'P0020062', 'P0020063', 'P0020064', 'P0020065', 'P0020066', 'P0020067',
       'P0020068', 'P0020069', 'P

In [9]:
## Name of the column in cong_df that holds each district's label; it is
## looked up later when precincts are tagged with their district.
district_col_name = "DISTRICT"

In [10]:
# Check the CRS of the election (precinct) dataframe as loaded, before any
# reprojection, so it can be compared with the census block layers.
election_df.crs

<Projected CRS: EPSG:2273>
Name: NAD83 / South Carolina (ft)
Axis Info [cartesian]:
- X[east]: Easting (foot)
- Y[north]: Northing (foot)
Area of Use:
- name: United States (USA) - South Carolina - counties of Abbeville; Aiken; Allendale; Anderson; Bamberg; Barnwell; Beaufort; Berkeley; Calhoun; Charleston; Cherokee; Chester; Chesterfield; Clarendon; Colleton; Darlington; Dillon; Dorchester; Edgefield; Fairfield; Florence; Georgetown; Greenville; Greenwood; Hampton; Horry; Jasper; Kershaw; Lancaster; Laurens; Lee; Lexington; Marion; Marlboro; McCormick; Newberry; Oconee; Orangeburg; Pickens; Richland; Saluda; Spartanburg; Sumter; Union; Williamsburg; York.
- bounds: (-83.36, 32.05, -78.52, 35.21)
Coordinate Operation:
- name: SPCS83 South Carolina zone (International feet)
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [11]:
# Check the CRS of the population (census block) dataframe as loaded, before
# any reprojection, so it can be compared with the precinct layer.
population_df.crs

<Geographic 2D CRS: EPSG:4269>
Name: NAD83
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: North America - onshore and offshore: Canada - Alberta; British Columbia; Manitoba; New Brunswick; Newfoundland and Labrador; Northwest Territories; Nova Scotia; Nunavut; Ontario; Prince Edward Island; Quebec; Saskatchewan; Yukon. Puerto Rico. United States (USA) - Alabama; Alaska; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Hawaii; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming. US Virgin Islands. British Virgin Islands

In [12]:
# Reproject the precinct layer to the UTM zone geopandas estimates for its extent.
election_utm_crs = election_df.estimate_utm_crs()
election_df = election_df.to_crs(election_utm_crs)

In [13]:
# Reproject the population blocks to the precinct layer's CRS rather than to a
# separately estimated UTM zone: the blocks are later matched against
# election_df with maup.assign, so the two layers must share exactly one CRS.
population_df = population_df.to_crs(election_df.crs)

In [14]:
# Reproject the VAP blocks to the precinct layer's CRS rather than to a
# separately estimated UTM zone: the blocks are later matched against
# election_df with maup.assign, so the two layers must share exactly one CRS.
vap_df = vap_df.to_crs(election_df.crs)

In [15]:
# Confirm the CRS of the VAP block layer after reprojection.
vap_df.crs

<Projected CRS: EPSG:32617>
Name: WGS 84 / UTM zone 17N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 84°W and 78°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Ecuador - north of equator. Canada - Nunavut; Ontario; Quebec. Cayman Islands. Colombia. Costa Rica. Cuba. Jamaica. Nicaragua. Panama. United States (USA).
- bounds: (-84.0, 0.0, -78.0, 84.0)
Coordinate Operation:
- name: UTM zone 17N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [16]:
# Confirm the CRS of the population block layer after reprojection.
population_df.crs

<Projected CRS: EPSG:32617>
Name: WGS 84 / UTM zone 17N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 84°W and 78°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Ecuador - north of equator. Canada - Nunavut; Ontario; Quebec. Cayman Islands. Colombia. Costa Rica. Cuba. Jamaica. Nicaragua. Panama. United States (USA).
- bounds: (-84.0, 0.0, -78.0, 84.0)
Coordinate Operation:
- name: UTM zone 17N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [17]:
# Confirm the CRS of the precinct layer after reprojection.
election_df.crs

<Projected CRS: EPSG:32617>
Name: WGS 84 / UTM zone 17N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 84°W and 78°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Ecuador - north of equator. Canada - Nunavut; Ontario; Quebec. Cayman Islands. Colombia. Costa Rica. Cuba. Jamaica. Nicaragua. Panama. United States (USA).
- bounds: (-84.0, 0.0, -78.0, 84.0)
Coordinate Operation:
- name: UTM zone 17N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [18]:
## All three layers were reprojected above, so the block and precinct
## geometries can be compared directly.
## Assign blocks to precincts: each result is used later as the groupby key
## that rolls block-level counts up to precincts. The population and VAP block
## layers are assigned separately.
blocks_to_precincts_assignment = maup.assign(population_df.geometry, election_df.geometry)
vap_blocks_to_precincts_assignment = maup.assign(vap_df.geometry, election_df.geometry)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:04<00:00, 509.36it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:17<00:00, 132.42it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:04<00:00, 547.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:15<00:00, 144.50it/s]


In [19]:
## Census fields we keep from each table. Per the rename mapping used later in
## this notebook these are: total, Hispanic, and the non-Hispanic categories
## (white, black, AMIN, Asian, NHPI, other, two or more).
_kept_field_numbers = (1, 2, 5, 6, 7, 8, 9, 10, 11)

# Table P2 (population) fields are named "P002" + zero-padded field number.
pop_column_names = [f"P002{number:04d}" for number in _kept_field_numbers]
# Table P4 (voting age population) fields are named "P004" + the same numbers.
vap_column_names = [f"P004{number:04d}" for number in _kept_field_numbers]

In [20]:
# Roll block-level census counts up to precincts and store them as new columns
# of the election dataframe. The groupby key is the block -> precinct
# assignment, so the grouped totals are indexed by precinct label.
# A precinct that received no block is absent from the grouped result; reindex
# with fill_value=0 so it gets a count of 0 instead of NaN (which would also
# turn the whole column into floats).
for name in pop_column_names:
    precinct_totals = population_df[name].groupby(blocks_to_precincts_assignment).sum()
    election_df[name] = precinct_totals.reindex(election_df.index, fill_value=0)
for name in vap_column_names:
    precinct_totals = vap_df[name].groupby(vap_blocks_to_precincts_assignment).sum()
    election_df[name] = precinct_totals.reindex(election_df.index, fill_value=0)

In [21]:
# Statewide totals must be identical before and after aggregation to precincts.
# The four values are printed as before, and then asserted so that a silent
# loss of blocks (e.g. blocks that were not assigned to any precinct) stops the
# notebook instead of relying on someone eyeballing the numbers.
print("Sanity check the population data merged to election dataframe has not been changed:")
block_pop_total = population_df['P0020001'].sum()
precinct_pop_total = election_df['P0020001'].sum()
block_vap_total = vap_df['P0040001'].sum()
precinct_vap_total = election_df['P0040001'].sum()
print(block_pop_total)
print(precinct_pop_total)
print(block_vap_total)
print(precinct_vap_total)
assert precinct_pop_total == block_pop_total, "total population changed during aggregation"
assert precinct_vap_total == block_vap_total, "voting age population changed during aggregation"

Sanity check the population data merged to election dataframe has not been changed:
5118425
5118425
4014460
4014460


In [22]:
# Run maup's diagnostic on the unrepaired precinct layer. doctor() prints its
# own findings first; its boolean verdict is then printed under the header.
original_is_clean = maup.doctor(election_df)
print("Result after Applying Maup Doctor on the original election df:\n", original_is_clean)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:03<00:00, 712.15it/s]


There are 823 overlaps.
There are 3934 holes.
There are some invalid geometries.
Result after Applying Maup Doctor on the original election df:
 False


In [None]:
# Repair the precinct layer flagged by maup.doctor above. The result is bound
# to a new name, and every later cell works on repaired_election_df.
# NOTE(review): this cell emits many pandas chained-assignment warnings that
# originate inside maup itself, not in this notebook.
print("Now repair the map using Maup Smart Repair.")
repaired_election_df = maup.smart_repair(election_df)

Now repair the map using Maup Smart Repair.


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  geometries_df["geometry"][i] = shapely.wkb.loads(
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original Da

Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  pieces_df["polygon indices"][i] = set()


Identifying overlaps...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  pieces_df["polygon indices"][i] = pieces_df["polygon indices"][i].union({j})
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11494/11494 [00:04<00:00, 2589.44it/s]
You are setting values through c

Resolving overlaps...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  geometries_disconnected_df["geometry"][g_ind] = unary_union([
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the

Assigning order 2 pieces...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  geometries_df["geometry"][poly_to_add_to] = unary_union(


Assigning order 3 pieces...
Filling gaps...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  holes_df.geometry[h_ind] = orient(holes_df.geometry[h_ind])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the o

In [None]:
# Re-run maup's diagnostic, this time on the repaired layer, to confirm the
# repair worked. The header now names the repaired frame; it previously said
# "original", which mislabelled this result.
repaired_is_clean = maup.doctor(repaired_election_df)
print("Result after Applying Maup Doctor on the repaired election df:\n", repaired_is_clean)

In [None]:
# Assign every repaired precinct to a congressional district.
# The precinct layer was reprojected to UTM earlier but cong_df was never
# reprojected, and maup.assign needs both layers in the same CRS. Reproject a
# copy of the districts to the precinct CRS; cong_df itself is left untouched.
cong_df_matched = cong_df.to_crs(repaired_election_df.crs)
precincts_to_districts_assignment = maup.assign(repaired_election_df.geometry, cong_df_matched.geometry)
# At this point "CD" holds cong_df index labels, not district names.
repaired_election_df["CD"] = precincts_to_districts_assignment

In [None]:
# Replace the cong_df index labels produced by maup.assign with the actual
# district labels from cong_df[district_col_name].
# The mapping is taken from the assignment Series rather than from the current
# contents of "CD", so re-running this cell gives the same result, it does not
# depend on the frame having a 0..n-1 index, and a precinct with no assigned
# district stays missing instead of raising a KeyError.
print(set(precincts_to_districts_assignment))  # raw cong_df index labels
repaired_election_df["CD"] = precincts_to_districts_assignment.map(cong_df[district_col_name])
print(set(cong_df[district_col_name]))  # district labels available in the plan
print(set(repaired_election_df["CD"]))  # district labels actually assigned

In [None]:
# Census table P2 (population) fields -> short demographic column names.
_population_renames = {
    'P0020001': 'TOTPOP',
    'P0020002': 'HISP',
    'P0020005': 'NH_WHITE',
    'P0020006': 'NH_BLACK',
    'P0020007': 'NH_AMIN',
    'P0020008': 'NH_ASIAN',
    'P0020009': 'NH_NHPI',
    'P0020010': 'NH_OTHER',
    'P0020011': 'NH_2MORE',
}

# Census table P4 (voting age population) fields -> short VAP column names.
_vap_renames = {
    'P0040001': 'VAP',
    'P0040002': 'HVAP',
    'P0040005': 'WVAP',
    'P0040006': 'BVAP',
    'P0040007': 'AMINVAP',
    'P0040008': 'ASIANVAP',
    'P0040009': 'NHPIVAP',
    'P0040010': 'OTHERVAP',
    'P0040011': '2MOREVAP',
}

# Election result columns: drop the trailing three-letter suffix and keep the
# leading contest code plus party letter.
_election_renames = {
    'G20PREDBID': 'G20PRED',
    'G20PRERTRU': 'G20PRER',
    'G20USSDDUR': 'G20USSD',
    'G20USSRCUR': 'G20USSR',
}

# Single mapping used to rename the precinct dataframe's columns.
rename_dict = {**_population_renames, **_vap_renames, **_election_renames}

In [None]:
# Apply the short column names in place, then list the resulting columns.
repaired_election_df.rename(columns=rename_dict, inplace = True)
columns_after_rename = repaired_election_df.columns.tolist()
print("List of columns after renaming:\n", columns_after_rename)

In [None]:
# Drop the election columns that are not needed downstream.
# errors="ignore" keeps this cell re-runnable: after the first run the columns
# are already gone, and a plain drop would raise a KeyError on the second run.
unused_columns = ['G20PRELJOR', 'G20PREGHAW', 'G20PREAFUE', 'G20USSCBLE', 'G20USSOWRI']
repaired_election_df = repaired_election_df.drop(columns=unused_columns, errors="ignore")
print("List of columns after dropping unuseful columns:\n", list(repaired_election_df.columns))

In [None]:
# Quick visual check of the repaired precinct geometries.
repaired_election_df.plot()

In [None]:
# Total population per congressional district, computed from the precinct layer.
# Grouping by "CD" covers exactly the districts that are present, instead of
# probing a hard-coded range of 1..9, and does not depend on whether the
# district labels are stored as integers or strings.
print("Total population in the repaired election df:")
pop_by_district = repaired_election_df.groupby("CD")["TOTPOP"].sum()
# Same list-of-totals shape as before, ordered by sorted district label.
pop_vals = pop_by_district.tolist()
print(pop_vals)

In [None]:
## Save shapefiles
# Create the output directory first; writing fails on a fresh checkout if
# ./shapefiles does not exist yet.
os.makedirs("./shapefiles", exist_ok=True)
repaired_election_df.to_file("./shapefiles/SC.shp")
# The GeoJSON is written from the shapefile that was just saved, so both
# outputs carry the same data as stored on disk.
shp_file = gpd.read_file('./shapefiles/SC.shp')
shp_file.to_file('./shapefiles/SC.geojson', driver='GeoJSON')