In [3]:
import math

import branca.colormap as cm
import folium
import geopandas as gpd
import plotly.express as px
import plotly.graph_objects as go
from folium.plugins import Search
from IPython.display import display
from shapely.geometry import box

from classes import SubdivisionMeasureEDA

In [4]:
# ----------------------------
# Load Data
# ----------------------------
# Subdivision table; its CRS is the reference CRS the other layers are reprojected to.
ABT = gpd.read_file("../../../Data/Final_dataset/ABT/ABT.gpkg", layer="subdivisions")

# Building footprints, reprojected to the ABT CRS.
building_footprint_dataset = gpd.read_file(
    "../../../../Erfan Dissertation/Data/Original_dataset/Archive/Mecklenburg_2023_Buildings_sddncgov/Mecklenburg_2023_Buildings.gdb",
    layer='S_BUILDING_FP',
).to_crs(ABT.crs)

# County boundary, reprojected to the ABT CRS.
meck_bo = gpd.read_file(
    "../../../Data/Original_dataset/Archive/mecklenburgcounty_boundary/MecklenburgCounty_Boundary.shp"
).to_crs(ABT.crs)

# 43560 = square feet per acre.
# NOTE(review): this assumes the ABT CRS uses feet as its linear unit — confirm.
ABT["area_acre"] = ABT.geometry.area / 43560

# Keep only subdivisions whose year falls in 1990-2023 (both ends inclusive).
ABT = ABT[ABT["year"].between(1990, 2023)]

# __Analytical Base Table (ABT)__

In [5]:
# Preview the filtered ABT; the notebook renders a truncated view (first and last rows, middle columns elided).
ABT

Unnamed: 0,subd_id,name,type,unit_cnt,created_us,created_da,last_edite,last_edi_1,shape_star,shape_stle,...,Shape_Le_2,year,Shape_Le_3,year_1990,Shape_Length,Shape_Area,ci_intra,HAC_dist,geometry,area_acre
0,4294,UNIVERSITY HEIGHTS,TH,342.00,ISLEYKJ,2012-09-17,ISLEYKJ,2012-09-17,1852666.01,6386.82,...,6386.86,1998.00,6386.86,1.00,6386.86,1852661.34,1866.62,10285.53,"MULTIPOLYGON (((1489390.02 570677.77, 1489392....",42.53
2,2,COVENTRY II TOWNHOMES,TH,94.00,ISLEYKJ,2012-09-19,ISLEYKJ,2012-09-19,570947.59,4934.50,...,4934.50,2007.00,4934.50,1.00,4934.50,570947.59,6173.18,19831.48,"MULTIPOLYGON (((1496948.622 569511.625, 149695...",13.11
3,3,WINDING CREEK AT BACK CREEK,SFR,0.00,ISLEYKJ,2008-10-10,ISLEYKJ,2022-03-04,1197621.64,4635.78,...,4635.78,1994.00,4635.78,1.00,4635.78,1197621.64,11846.16,14585.06,"MULTIPOLYGON (((1492292.372 568419.125, 149229...",27.49
4,4,LAKEVIEW II AT FAIRES FARM,SFR,0.00,ISLEYKJ,2008-10-10,ISLEYKJ,2008-10-10,162942.32,1837.95,...,1837.95,2000.00,1837.95,1.00,1837.95,162942.32,29307.43,10375.91,"MULTIPOLYGON (((1486795.747 567973.375, 148682...",3.74
6,8,VILLAGE OF FAIRES FARM,SFR,0.00,ISLEYKJ,2008-10-10,ISLEYKJ,2008-10-10,477775.00,2906.94,...,2906.94,1997.00,2906.94,1.00,2906.94,477775.00,9396.62,9594.15,"MULTIPOLYGON (((1484953.247 566304.375, 148498...",10.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8498,10172,,,,,,,,,,...,,2011.00,,,1352.22,112496.51,1755044.12,3318.87,"MULTIPOLYGON (((1457290.434 540118.074, 145723...",2.58
8499,10172,,,,,,,,,,...,,2011.00,,,1352.22,112496.51,0.00,3318.87,"MULTIPOLYGON (((1457290.434 540118.074, 145723...",2.58
8500,10174,,,,,,,,,,...,,2010.00,,,1456.03,130159.22,207692.58,960.39,"MULTIPOLYGON (((1439870.243 621518.626, 143982...",2.99
8501,10175,,,,,,,,,,...,,2000.00,,,1577.79,94325.29,0.00,920.78,"MULTIPOLYGON (((1439870.243 621518.626, 144023...",2.17


# __General EDA__

In [None]:
# --- Number of subdivisions per year ---
# One row per year, ordered chronologically, plus the year-over-year % change.
year_counts = (
    ABT.groupby("year")
    .size()
    .reset_index(name="count")
    .sort_values("year")
)
year_counts["pct_change"] = year_counts["count"].pct_change().mul(100)

# Bar chart of yearly counts with a line overlay and a shaded 2008-2010 band.
fig = px.bar(
    year_counts,
    x="year",
    y="count",
    title="Frequency of subdivisions per year",
    labels={"year": "Year", "count": "Frequency"},
    text="count",
)
fig.add_trace(
    go.Scatter(
        x=year_counts["year"],
        y=year_counts["count"],
        mode="lines+markers",
        name="Trend",
    )
)
# Only the bar trace carries text labels, so restrict the update to it.
fig.update_traces(textposition="outside", selector={"type": "bar"})
fig.add_vrect(
    x0=2008,
    x1=2010,
    fillcolor="red",
    opacity=0.1,
    annotation_text="Recession",
    line_width=0,
)
fig.show()

# --- Year-over-year change ---
fig = px.bar(
    year_counts,
    x="year",
    y="pct_change",
    title="Year-over-Year Change (%)",
    labels={"pct_change": "% Change"},
    text_auto=".1f",
)
fig.update_traces(textposition="outside")
fig.show()

# --- Cumulative number of subdivisions ---
year_counts["cumulative"] = year_counts["count"].cumsum()
fig = px.line(
    year_counts,
    x="year",
    y="cumulative",
    markers=True,
    title="Cumulative Growth of Subdivisions ",
)
fig.update_traces(line={"width": 3})
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Cumulative Subdivisions",
    title_x=0.5,
)
fig.show()

# --- Total / mean / median subdivision area per year ---
area_stats = (
    ABT.groupby("year")["area_acre"]
    .agg(total_area="sum", avg_area="mean", median_area="median")
    .reset_index()
    .sort_values("year")
)
# Attach the area statistics to the yearly counts so both can be plotted from one frame.
year_counts = year_counts.merge(area_stats, on="year")
fig = px.bar(
    year_counts,
    x="year",
    y="total_area",
    title="Total Subdivided Area per Year",
    labels={"year": "Year", "total_area": "Total Area (acre)"},
    text_auto=".2s",
)
fig.update_traces(textposition="outside")
fig.show()

# --- Average area per subdivision (total area divided by the yearly count) ---
year_counts["avg_area_per_subdivision"] = year_counts["total_area"] / year_counts["count"]
fig = px.line(
    year_counts,
    x="year",
    y="avg_area_per_subdivision",
    markers=True,
    title="Average Subdivision Area per Year",
    labels={"avg_area_per_subdivision": "Avg Area (acre)", "year": "Year"},
)
fig.update_traces(line={"width": 3})
fig.update_layout(title_x=0.5)
fig.show()

# --- Cumulative subdivided area ---
year_counts["cumulative_area"] = year_counts["total_area"].cumsum()
fig = px.line(
    year_counts,
    x="year",
    y="cumulative_area",
    markers=True,
    title="Cumulative Growth of Subdivided Area",
    labels={"year": "Year", "cumulative_area": "Cumulative Area (acre)"},
)
fig.update_traces(line={"width": 3})
fig.update_layout(title_x=0.5)
fig.show()

# Folium visualization

# Web maps expect geographic coordinates, so work on a WGS84 (EPSG:4326) copy of the ABT.
ABT_vis = ABT.to_crs(epsg=4326)

# ----------------------------
# Create county grid (same as OverlapMap)
grid_size_miles = 1
county_gdf = meck_bo  # county boundary used as the grid extent

# 5280 feet per mile.
# NOTE(review): assumes the county boundary CRS uses feet — confirm.
grid_size = grid_size_miles * 5280
minx, miny, maxx, maxy = county_gdf.total_bounds

# Number of cells needed to cover the bounding box in each direction.
n_rows = int(math.ceil((maxy - miny) / grid_size))
n_cols = int(math.ceil((maxx - minx) / grid_size))

# Lower-left corner coordinates of every row and column of cells.
row_origins = [miny + r * grid_size for r in range(n_rows)]
col_origins = [minx + c * grid_size for c in range(n_cols)]

# Square cells in row-major order (all columns of the first row, then the next row, ...).
grid_polygons = [
    box(x_left, y_bottom, x_left + grid_size, y_bottom + grid_size)
    for y_bottom in row_origins
    for x_left in col_origins
]
# Ids start at 1 and follow the same row-major order as the polygons.
grid_ids = list(range(1, len(grid_polygons) + 1))

# Build the grid in the county CRS, clip it to the county outline, then reproject for the web map.
grid_gdf = (
    gpd.clip(
        gpd.GeoDataFrame({'grid_id': grid_ids, 'geometry': grid_polygons}, crs=county_gdf.crs),
        county_gdf,
    )
    .reset_index(drop=True)
    .to_crs(epsg=4326)
)

# ----------------------------
# MAP 1: subdivisions shaded by YEAR (light → dark green)
year_min = ABT_vis['year'].min()
year_max = ABT_vis['year'].max()
colormap_year = cm.LinearColormap(
    colors=['#d9f0a3', '#addd8e', '#78c679', '#31a354', '#006837'],
    vmin=year_min,
    vmax=year_max,
)
colormap_year.caption = "Subdivision Year (Light → Dark Green)"

map_year = folium.Map(location=[35.2265, -80.8409], zoom_start=11, tiles="CartoDB positron")


def _grid_cell_style(_feature):
    """Return the faint, dashed outline style used for every grid cell."""
    return {
        'fillColor': 'white',
        'color': 'gray',
        'weight': 1,
        'dashArray': '5,5',
        'fillOpacity': 0.01,
    }


def _year_style(feature):
    """Return fill and outline in the colormap shade of the feature's year."""
    shade = colormap_year(feature['properties']['year'])
    return {
        'fillColor': shade,
        'color': shade,
        'weight': 0.5,
        'fillOpacity': 0.6,
    }


def _year_highlight(_feature):
    """Return the style applied to the subdivision under the cursor."""
    return {"color": "black", "weight": 3, "fillOpacity": 0.8}


# --- Grid layer (added first, so the subdivision layer is added after it)
grid_layer_year = folium.GeoJson(
    grid_gdf.to_json(),
    name="County Grid",
    style_function=_grid_cell_style,
    tooltip=folium.GeoJsonTooltip(fields=['grid_id'], aliases=['Grid ID:'], sticky=True),
)
grid_layer_year.add_to(map_year)

# --- Subdivision layer: only the id, year and geometry columns are serialized
layer_year = folium.GeoJson(
    ABT_vis[['subd_id', 'year', 'geometry']].to_json(),
    name="Subdivisions by Year",
    style_function=_year_style,
    highlight_function=_year_highlight,
    tooltip=folium.GeoJsonTooltip(fields=['subd_id', 'year'], labels=True, sticky=True),
)
layer_year.add_to(map_year)

# Search box keyed on subd_id, plus the colour legend.
Search(
    layer=layer_year,
    search_label="subd_id",
    placeholder="Search by Sub ID",
    collapsed=False,
).add_to(map_year)
colormap_year.add_to(map_year)

# ----------------------------
# MAP 2: Color by LAND AREA (YlOrRd)
# `area_acre` is computed at load time as geometry area / 43560, so the legend
# unit is acres (the caption previously said "sq meters").
area_min, area_max = ABT_vis['area_acre'].min(), ABT_vis['area_acre'].max()
# Linear colour scale stretched between the smallest and largest subdivision area.
colormap_area = cm.linear.YlOrRd_09.scale(area_min, area_max)
colormap_area.caption = "Subdivision Land Area (acres)"

map_area = folium.Map(location=[35.2265, -80.8409], zoom_start=11, tiles="CartoDB positron")

# --- Add grid (faint dashed outline for every cell, tooltip shows the grid id)
folium.GeoJson(
    grid_gdf.to_json(),
    name="County Grid",
    style_function=lambda f: {
        'fillColor': 'white',
        'color': 'gray',
        'weight': 1,
        'dashArray': '5,5',
        'fillOpacity': 0.01
    },
    tooltip=folium.GeoJsonTooltip(fields=['grid_id'], aliases=['Grid ID:'], sticky=True)
).add_to(map_area)

# --- Add subdivisions (fill and outline both take the colormap shade of the area)
layer_area = folium.GeoJson(
    ABT_vis[['subd_id','area_acre','geometry']].to_json(),
    name="Subdivisions by Area",
    style_function=lambda feature: {
        'fillColor': colormap_area(feature['properties']['area_acre']),
        'color': colormap_area(feature['properties']['area_acre']),
        'weight': 0.5,
        'fillOpacity': 0.6
    },
    highlight_function=lambda f: {"color": "black", "weight": 3, "fillOpacity": 0.8},
    tooltip=folium.GeoJsonTooltip(fields=['subd_id','area_acre'], labels=True, sticky=True)
)
layer_area.add_to(map_area)
# Search box keyed on subd_id, plus the colour legend.
Search(layer=layer_area, search_label="subd_id", placeholder="Search by Sub ID", collapsed=False).add_to(map_area)
colormap_area.add_to(map_area)

# ----------------------------
# Display maps (year map first, then area map)
display(map_year)
display(map_area)

## __EDA of the 1st measure (ci_intra)__

In [None]:
# ----------------------------
# ci_intra: summary statistics, outliers and yearly trends
# ----------------------------
# The calls below are kept in their original order, since the EDA helper may hold state between calls.
measure_ci_intra = SubdivisionMeasureEDA(ABT, "ci_intra")
measure_ci_intra.summary_stats()
outliers = measure_ci_intra.detect_outliers()
yearly_stats = measure_ci_intra.yearly_trends()

# Distribution plot
measure_ci_intra.plot_distribution()

# Interactive Folium map of the measure
map_ci = measure_ci_intra.folium_map()
display(map_ci)

### __Handling outliers__


In [None]:
# Outlier review for ci_intra. The return value of detect_outliers() is discarded here
# (the previous cell already stored it in `outliers`).
# NOTE(review): presumably the call is repeated because the plotting methods rely on state it sets — confirm in SubdivisionMeasureEDA.
measure_ci_intra.detect_outliers()
measure_ci_intra.plot_outlier_trends()
# The building footprints loaded in the data cell are passed to the outlier map.
map_ = measure_ci_intra.folium_outlier_map(building_footprint_dataset=building_footprint_dataset)
# Last expression of the cell, so the map is rendered as the cell output.
map_

**Reasons behind outliers:** Very large buildings, or buildings that are very far apart.

**Possible solutions:** Log-transform the measure? Exclude building size from the index (i.e., use a compactness measure that is not area-weighted)?


### __Handling missing values__
For the Cohesion Index, a missing value means that the subdivision contains no building footprints.

**Note:** I think this measure is not reliable for recent years, because construction in those subdivisions may not be fully complete yet.



In [None]:
# Missing-value analysis for ci_intra; what is reported is defined by SubdivisionMeasureEDA.missing_analysis.
measure_ci_intra.missing_analysis()

### __EDA after removing missing values and log-transforming the data__

In [None]:
# Distribution of ci_intra with log_transform=True, followed by a KDE plot.
# NOTE(review): whether missing values are dropped inside these methods is not visible from this notebook — confirm.
measure_ci_intra.plot_distribution(log_transform=True)
measure_ci_intra.plot_kde()

## __EDA of the 2nd measure (HAC_dist)__


In [None]:
# ----------------------------
# HAC_dist: summary statistics, outliers and yearly trends
# ----------------------------
# The calls below are kept in their original order, since the EDA helper may hold state between calls.
measure_hac_dist = SubdivisionMeasureEDA(ABT, "HAC_dist")
measure_hac_dist.summary_stats()
outliers = measure_hac_dist.detect_outliers()
yearly_stats = measure_hac_dist.yearly_trends()

# Distribution plot
measure_hac_dist.plot_distribution()

# Interactive Folium map of the measure
map_hac_dist = measure_hac_dist.folium_map()
display(map_hac_dist)

### __Handling outliers__


In [None]:
# Outlier review for HAC_dist. The return value of detect_outliers() is discarded here
# (the previous cell already stored it in `outliers`).
# NOTE(review): presumably the call is repeated because the plotting methods rely on state it sets — confirm in SubdivisionMeasureEDA.
measure_hac_dist.detect_outliers()
measure_hac_dist.plot_outlier_trends()
# The building footprints loaded in the data cell are passed to the outlier map.
map_ = measure_hac_dist.folium_outlier_map(building_footprint_dataset=building_footprint_dataset)
# Last expression of the cell, so the map is rendered as the cell output.
map_

### __EDA after removing missing values and log-transforming the data__


In [None]:
# Distribution of HAC_dist with log_transform=True, followed by a KDE plot.
# NOTE(review): whether missing values are dropped inside these methods is not visible from this notebook — confirm.
measure_hac_dist.plot_distribution(log_transform=True)
measure_hac_dist.plot_kde()

## __EDA of the 3rd measure (ci_buffer)__

In [None]:
# ----------------------------
# EDA of ci_buffer 0.25 mile
# ----------------------------
# NOTE(review): "ci_025mi" is not among the columns visible in the ABT preview
# (the middle columns are elided there) — confirm the column exists before running.
measure_ci_buffer_025 = SubdivisionMeasureEDA(ABT, "ci_025mi")
measure_ci_buffer_025.summary_stats()
outliers = measure_ci_buffer_025.detect_outliers()
yearly_stats = measure_ci_buffer_025.yearly_trends()

# Plots
measure_ci_buffer_025.plot_distribution()

# Folium map — stored under its own name so that the HAC_dist map (`map_hac_dist`)
# created in the earlier cell is not overwritten by the ci_buffer map.
map_ci_buffer_025 = measure_ci_buffer_025.folium_map()
display(map_ci_buffer_025)

In [None]:
# Export the notebook to HTML with the code cells hidden (--no-input).
# NOTE(review): the notebook file name (EDA_copy.ipynb) and the output path are hard-coded; this converts the
# version saved on disk, so save the notebook first and confirm the name still matches this file.
!jupyter nbconvert --to html --no-input EDA_copy.ipynb --output ../../../output/Notebook_Outputs/EDA.html