In [32]:
import warnings,subprocess, os, pandas as pd, geopandas as gpd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, folium
from IPython.display import display
from run_analysis import buffer_list_mile, ctrl_vars
from classes import factor_loading_matrices
%matplotlib inline
np.set_printoptions(suppress=True)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", "{:.2f}".format)
nb = os.path.basename(__file__) if "__file__" in globals() else "spatial_analysis.ipynb"
subprocess.run(["jupyter", "trust", nb])

CompletedProcess(args=['jupyter', 'trust', 'spatial_analysis.ipynb'], returncode=0)

**User input.**


In [None]:
# Passed as `n_factors` to the PCA on the street-network variables.
pca_factors_network: int = 4
# Passed as `n_factors` to the PCA on the control variables.
pca_factors_ctrl: int = 9

**Loads the neighborhood shapefile into a GeoDataFrame.**


In [50]:
# Read the neighborhood shapefile into a GeoDataFrame.
# The path is relative, so it resolves against the kernel's working directory.
gdf = gpd.read_file('../data/neighborhoods/Final_dataset.shp')

In [None]:
def generate_street_network_columns(buffer_list_mile):
    """Build the street-network column names for every buffer distance.

    Each name is a metric prefix followed by a suffix derived from the buffer
    distance. Per the original note, the suffix rule mirrors the one used in
    StreetNetworkProcessor, so it must not be changed here in isolation.

    Parameters
    ----------
    buffer_list_mile : iterable of numbers
        Buffer distances to generate columns for.

    Returns
    -------
    list of str
        Names grouped by buffer (in input order); within each buffer the
        metrics appear in the fixed prefix order below.
    """
    prefixes = (
        "int_den", "st_den", "nd_deg", "cl_cof",
        "shtpth", "bt_cnt", "cl_cnt", "pr_cnt",
    )

    def _suffix(distance):
        # Distances below 1 (except 0.1) keep their digits with the decimal
        # point removed, e.g. 0.25 -> "025".
        if distance < 1 and distance != 0.1:
            return str(distance).replace(".", "")
        # Everything else is scaled by ten and truncated, e.g. 0.1 -> "1",
        # 1 -> "10", 1.5 -> "15".
        return str(int(distance * 10))

    columns = []
    for distance in buffer_list_mile:
        tag = _suffix(distance)
        for prefix in prefixes:
            columns.append(f"{prefix}{tag}")
    return columns

In [None]:
# Column names of all street-network metrics for the imported buffer distances.
final_list_net = generate_street_network_columns(buffer_list_mile)

**Draws an interactive map of the neighborhoods; hovering shows the ID and clicking a polygon lists its network and control variables.**


In [None]:
# Fields listed in each map popup: the ID, then network metrics, then control variables.
popup_vars = ["id"] + final_list_net + ctrl_vars
# Reproject to EPSG:4326 (longitude/latitude) for the web map built below.
gdf_vis = gdf.to_crs(4326)

def fmt(v):
    """Format a popup value: numbers get two decimals, anything else passes through.

    Parameters
    ----------
    v : object
        A cell value taken from a row.

    Returns
    -------
    str or object
        ``f"{v:.2f}"`` when ``v`` is a non-missing Python or NumPy real number;
        otherwise ``v`` unchanged (strings, None, NaN, geometries, ...).
    """
    # NumPy scalars such as np.int64 / np.float32 are not instances of the
    # built-in int / float, so they are listed explicitly to avoid being
    # silently left unformatted.
    numeric_types = (float, int, np.integer, np.floating)
    if isinstance(v, numeric_types) and pd.notna(v):
        return f"{v:.2f}"
    return v

# Turn one row into a GeoJSON feature whose "popup" property is an HTML
# snippet with one "<name>: <value>" line per popup field.
def _as_feature(row):
    popup_lines = [f"{name}: {fmt(row.get(name, 'NA'))}" for name in popup_vars]
    return {
        "type": "Feature",
        "geometry": row.geometry.__geo_interface__,
        "properties": {"id": row["id"], "popup": "<br>".join(popup_lines)},
    }

# Build FeatureCollection
features = [_as_feature(row) for _, row in gdf_vis.iterrows()]
geojson = {"type": "FeatureCollection", "features": features}

# Center the map on the mean of the polygon centroids (latitude first).
centroids = gdf_vis.geometry.centroid
m = folium.Map(
    [centroids.y.mean(), centroids.x.mean()],
    zoom_start=11,
    tiles="CartoDB positron",
)

# Hover shows the ID; clicking shows the pre-rendered popup HTML.
id_tooltip = folium.GeoJsonTooltip(fields=["id"], aliases=["ID:"])
detail_popup = folium.GeoJsonPopup(fields=["popup"], labels=False, max_width=350)
folium.GeoJson(geojson, tooltip=id_tooltip, popup=detail_popup).add_to(m)

# Last expression of the cell: renders the map.
m

**Performs exploratory data inspection.**


In [None]:
# Summary statistics for every analysis column, one row per variable.
# The bar-chart emoji is written as a Unicode escape so the header cannot be
# corrupted by a wrong file encoding (it previously appeared as mojibake).
print("\U0001F4CA Descriptive Statistics\n")
display(gdf[final_list_net + ctrl_vars].describe().T.round(3))

**Missing values.**

**Note: Any missing values must be handled before PCA analysis.**


In [51]:
# Count missing entries per analysis column, then keep only the columns that
# actually have some.
missing_values = gdf[final_list_net + ctrl_vars].isna().sum()
missing_values_filtered = missing_values.loc[missing_values.gt(0)]

if missing_values_filtered.empty:
    print("No missing values in any feature")
else:
    print(missing_values_filtered)
    # ANSI escape codes render the reminder in bold.
    print("\n \033[1mFor further processing, missing values must be handled!\033[0m")

home_own       1
sinfamrate     1
sinfamage     15
sinfamsize    13
salesprice     3
income         6
race_white     1
dtype: int64

 [1mFor further processing, missing values must be handled![0m


In [52]:
# Drop every row that has a missing value in any analysis column.
# NOTE: this rebinds `gdf`, so all later cells see only the complete rows;
# the surviving rows keep their original index labels (no reset here).
cols = final_list_net + ctrl_vars
gdf = gdf.dropna(subset=cols)

**Plots the distribution (histogram with density curve) and a boxplot of each numeric variable, and prints its skewness.**


In [None]:
# Plotting histogram and bar plots of each variable
numerical_columns = gdf[final_list_net + ctrl_vars].select_dtypes(include=['number']).columns

print("histogram and bar plots of each variable")

for col in numerical_columns:
    print(col)
    print('Skew :', round(gdf[final_list_net + ctrl_vars][col].skew(), 2))
    plt.figure(figsize = (10, 4))
    plt.subplot(1, 2, 1)
    sns.distplot(gdf[final_list_net + ctrl_vars][col], axlabel=col)
    plt.subplot(1, 2, 2)
    sns.boxplot(x = gdf[final_list_net + ctrl_vars][col])
    plt.show()

**Cell 9: Draws a boxplot for each numeric variable with every observation overlaid and labeled by its ID, so outliers can be traced to specific neighborhoods.**


In [None]:
# Define the subset of columns
selected_cols = ["id"] + final_list_net + ctrl_vars
sub = gdf[selected_cols]

# Identify numeric columns (excluding the ID if needed)
numerical_columns = sub.select_dtypes(include='number').columns.tolist()

if "id" in numerical_columns:
    numerical_columns.remove("id")

# Plotting boxplots
for col in numerical_columns:
    print(f"Boxplot for {col}")

    plt.figure(figsize=(10, 5))

    # Boxplot
    sns.boxplot(data=sub, x=col)

    # Overlaid raw points
    sns.stripplot(data=sub, x=col, color='black', jitter=True, alpha=0.5)

    # Add ID labels
    for i in range(sub.shape[0]):
        plt.text(
            x=sub[col].iloc[i],
            y=0.02,
            s=str(sub["id"].iloc[i]),
            horizontalalignment='center',
            size='small',
            color='blue',
            rotation=45
        )

    plt.title(f"Boxplot with Observation IDs for {col}")
    plt.tight_layout()
    plt.show()

**Cell 11: Computes correlation matrix for control variables.**


In [None]:
# Correlation heatmap for control variables (compact)
corr = gdf[ctrl_vars].corr()
plt.figure(figsize=(12,12))
sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, fmt=".2f")
plt.title("Control Variables Correlation")
plt.show()

**Cell 14: Runs PCA on street network variables and extracts components.**


In [None]:
# Run factor_loading_matrices on the street-network columns with
# `pca_factors_network` factors. The seven results are unpacked in the order
# returned; their meanings are inferred from these names only -- see
# classes.factor_loading_matrices for the authoritative description.
(
    unrotated_loading_net,
    rotated_loading_net,
    communalities_net,
    unrotated_net_scores,
    rotated_net_scores,
    unrot_net_eigenvalues,
    rotated_variance_share_net,
) = factor_loading_matrices(gdf, final_list_net, n_factors=pca_factors_network)

**Cell 15: Runs PCA on control variables and extracts components.**


In [None]:
# Run factor_loading_matrices on the control variables with
# `pca_factors_ctrl` factors. The seven results are unpacked in the order
# returned; their meanings are inferred from these names only -- see
# classes.factor_loading_matrices for the authoritative description.
(
    unrotated_loading_ctrl,
    rotated_loading_ctrl,
    communalities_ctrl,
    unrotated_ctrl_scores,
    rotated_ctrl_scores,
    unrot_ctrl_eigenvalues,
    rotated_variance_share_ctrl,
) = factor_loading_matrices(gdf, ctrl_vars, n_factors=pca_factors_ctrl)

In [66]:
# Rename the rotated score columns to PC1, PC2, ... with a suffix saying which
# variable set they come from, so the two sets stay distinct once combined.
rotated_net_scores.columns = [
    f"PC{k}_net" for k, _ in enumerate(rotated_net_scores.columns, start=1)
]
rotated_ctrl_scores.columns = [
    f"PC{k}_ctrl" for k, _ in enumerate(rotated_ctrl_scores.columns, start=1)
]

In [72]:
# Combine ID + geometry with both sets of rotated scores and write a shapefile.
# `reset_index()` (without drop) keeps the original row label as an "index"
# column, exactly as before.
id_geom = gdf[["id", "geometry"]].reset_index()
score_frames = [rotated_net_scores, rotated_ctrl_scores]

# pd.concat(axis=1) aligns on index labels. Rows were dropped from `gdf`
# earlier, so the score frames are aligned by position instead; this assumes
# factor_loading_matrices returns one score row per `gdf` row in the same
# order -- confirm against its implementation. A length mismatch is reported
# rather than silently padded with NaN.
for scores in score_frames:
    if len(scores) != len(id_geom):
        raise ValueError(
            f"Score table has {len(scores)} rows but gdf has {len(id_geom)}; "
            "cannot align PCA scores with neighborhoods."
        )

final_gdf = pd.concat(
    [id_geom] + [scores.reset_index(drop=True) for scores in score_frames],
    axis=1,
)

# Create the output folder if it is missing so the write cannot fail on it.
out_path = '../data/final_dataset/final_pca.shp'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
final_gdf.to_file(out_path)

In [None]:
!jupyter nbconvert --to html Spatial_Anaysis.ipynb