In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt
from skbio.stats.composition import ilr
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [None]:
# ## 1. Data Preparation & Compositional Setup


In [None]:
# Load the 2021 ward-level occupancy-rating sheet.
occupancy = pd.read_excel("../data/housing/occupancy_rating_bedrooms_wards.xlsx", sheet_name="2021")

# Filter out City of London (local authority code E09000001).
# .copy() gives an independent frame so the column assignments below do not
# write into a view of the unfiltered data (avoids SettingWithCopyWarning).
occupancy = occupancy[occupancy['local authority code'] != 'E09000001'].copy()

# Occupancy-rating count columns that together form the composition.
categories = ['Occupancy rating: +2 or more', '+1', '0', '-1', '-2 or less']
total_households = occupancy['All Households']

# Small offset so that no share is exactly zero; the log-ratio transform
# applied later takes logarithms of these shares.
ZERO_OFFSET = 1e-6

# Share of households per category (a fraction, not a 0-100 percentage).
# The offset is applied to the shares themselves, because these pct_* columns
# are what the downstream analysis reads; the raw count columns stay untouched.
# Each row therefore sums to at most 5 * ZERO_OFFSET above its raw total,
# far inside the 1e-3 tolerance of the composition check done after merging.
for cat in categories:
    occupancy[f'pct_{cat}'] = occupancy[cat] / total_households + ZERO_OFFSET


In [None]:
# ## 2. Burglary Data Processing


In [None]:
# Read every recorded burglary, then keep only the rows for 2021.
burglary = pd.read_csv("../data/burglary.csv").loc[lambda frame: frame['Year'] == 2021]

# One row per ward with the number of burglary records it contains.
burglary_df = (
    burglary
    .groupby('Ward Code')
    .size()
    .rename('burglary_count')
    .reset_index()
)

# Express each ward's count as a percentage (0-100) of all 2021 burglaries.
total = burglary_df['burglary_count'].sum()
burglary_df['burglary_pct'] = burglary_df['burglary_count'].div(total).mul(100)



In [None]:
# ## 3. Data Merging


In [None]:
# Inner join: keep only wards present in both the occupancy and burglary tables.
merged = occupancy.merge(
    burglary_df,
    how="inner",
    left_on="ward code",
    right_on="Ward Code",
)

# Sanity check: the occupancy shares of every ward should add up to ~1.
share_columns = [f'pct_{cat}' for cat in categories]
merged['check_sum'] = merged[share_columns].sum(axis=1)
assert np.allclose(merged['check_sum'], 1.0, atol=1e-3), "Composition percentages don't sum to 1!"


In [None]:
# ## 4. Compositional Data Analysis (CoDa)


In [None]:
# ILR Transformation
# Rows are wards, columns are the occupancy-category shares.
composition = merged[[f'pct_{cat}' for cat in categories]].values
ilr_transformed = ilr(composition)

# A zero share would surface here as a non-finite coordinate; fail early with a
# clear message instead of passing inf/NaN into the regression models.
assert np.isfinite(ilr_transformed).all(), "ILR coordinates contain non-finite values (zero shares?)"

# Create ILR coordinates
n_ilr = ilr_transformed.shape[1]
ilr_cols = [f'ILR_{i+1}' for i in range(n_ilr)]

# Build the coordinate frame on merged's own index so the join pairs rows by
# position of origin even if merged does not carry a default 0..n-1 index.
ilr_frame = pd.DataFrame(ilr_transformed, columns=ilr_cols, index=merged.index)
merged_ilr = merged.join(ilr_frame)


In [None]:
# ## 5. Multivariate Regression Analysis


In [None]:
# --- Ordinary least squares on the ILR coordinates ---
# Response: each ward's percentage of all burglaries.
y = merged_ilr['burglary_pct']
# Design matrix: ILR coordinates plus an intercept column.
X_ilr = sm.add_constant(merged_ilr[ilr_cols])

ols_spec = sm.OLS(y, X_ilr)
model_ilr = ols_spec.fit()
print("=== ILR Regression Results ===")
print(model_ilr.summary())

# --- Ridge regression on the standardised raw shares ---
share_columns = [f'pct_{cat}' for cat in categories]
X = merged[share_columns]

# Standardise each share to zero mean / unit variance before penalising.
scaler = StandardScaler()
scaler.fit(X)
X_standardised = scaler.transform(X)

ridge = Ridge(alpha=1.0)
ridge.fit(X_standardised, y)

print("\n=== Ridge Regression Coefficients ===")
ridge_coefficients = pd.Series(ridge.coef_, index=X.columns)
print(ridge_coefficients.sort_values(ascending=False))


In [None]:
# ## 6. Visualization & Interpretation


In [None]:
# Pairwise correlations between the occupancy shares and the burglary percentage.
heatmap_columns = [f'pct_{cat}' for cat in categories]
heatmap_columns.append('burglary_pct')
corr_matrix = merged[heatmap_columns].corr()

# Annotated heatmap on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix: Occupancy Categories vs Burglary Percentage")
plt.show()


In [None]:
# ## 7. Advanced Compositional Visualization


In [None]:
# Ternary view of three occupancy shares, coloured by burglary percentage.
# Drawn with numpy/matplotlib (already imported at the top of the notebook), so
# the cell does not need an extra plotting package imported mid-notebook.

# Focus on three main components
ternary_columns = ['pct_Occupancy rating: +2 or more', 'pct_0', 'pct_-2 or less']
ternary_names = ['+2', '0', '-2']

# These three shares are only part of the five-part composition, so they do not
# sum to 1 on their own. Re-close each row so it is a valid point on the simplex.
# NOTE(review): a ward whose three shares are all zero would yield NaN here.
ternary_raw = merged[ternary_columns].to_numpy(dtype=float)
ternary_data = ternary_raw / ternary_raw.sum(axis=1, keepdims=True)

# Barycentric -> Cartesian. Triangle vertices:
#   '+2' at (0, 0), '0' at (1, 0), '-2' at (0.5, sqrt(3)/2).
triangle_height = np.sqrt(3) / 2
x_coords = ternary_data[:, 1] + 0.5 * ternary_data[:, 2]
y_coords = triangle_height * ternary_data[:, 2]

fig, ax = plt.subplots(figsize=(10, 8))

# Triangle outline.
ax.plot([0, 1, 0.5, 0], [0, 0, triangle_height, 0], color='black', linewidth=1)

# One point per ward.
points = ax.scatter(x_coords, y_coords, c=merged['burglary_pct'], cmap='viridis')
fig.colorbar(points, ax=ax, label='Burglary Percentage')

# Vertex labels, placed just outside the corners.
ax.text(0, -0.03, ternary_names[0], ha='center', va='top')
ax.text(1, -0.03, ternary_names[1], ha='center', va='top')
ax.text(0.5, triangle_height + 0.03, ternary_names[2], ha='center', va='bottom')

# Equal aspect keeps the triangle equilateral; Cartesian axes carry no meaning here.
ax.set_aspect('equal')
ax.axis('off')
ax.set_title("Occupancy Composition vs Burglary Risk")
plt.show()


In [None]:
# ## 8. Ethical Consideration Analysis


In [None]:
# Ward-level scatter of the most overcrowded share against burglary percentage,
# coloured by borough to show whether the pattern is driven by a few authorities.
# NOTE(review): 'pct_-2 or less' is computed upstream as count / total households,
# i.e. a fraction rather than a 0-100 percentage, despite the axis wording.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged,
    x='pct_-2 or less',
    y='burglary_pct',
    hue='local authority name',
    palette='tab20',
    s=100,
    ax=ax,
)
ax.set_title("High Overcrowding (-2) vs Burglary Percentage by Borough")
ax.set_xlabel("Percentage of Households with -2 Occupancy Rating")
ax.set_ylabel("Burglary Percentage Contribution")
# Place the (long) borough legend outside the plotting area.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# ## 9. Mixed Effects Modeling


In [None]:
import statsmodels.formula.api as smf

# The raw share columns contain spaces, ':', '+' and '-', which the formula
# parser reads as operators / invalid Python rather than as variable names.
# Rename them to identifier-safe names on a copy used only for this model.
formula_safe_names = {
    'pct_Occupancy rating: +2 or more': 'pct_plus2',
    'pct_-1': 'pct_minus1',
    'pct_-2 or less': 'pct_minus2',
}
mixed_data = merged.rename(columns=formula_safe_names)

# Account for borough-level variability: random intercept per local authority.
# The '+1' share is left out of the fixed effects, as in the original formula
# (the five shares sum to ~1, so one category acts as the reference).
mixed_model = smf.mixedlm(
    "burglary_pct ~ pct_plus2 + pct_0 + pct_minus1 + pct_minus2",
    data=mixed_data,
    groups=mixed_data["local authority name"]
).fit()

print("\n=== Mixed Effects Model Results ===")
print(mixed_model.summary())