In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, chi2_contingency
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [None]:
# Load the burglary records and keep only the incidents recorded for 2015.
burglary = pd.read_csv("data/burglary.csv")
burglary = burglary[burglary['Year'] == 2015]

# One row per ward: the number of 2015 burglary records and that ward's share
# (in percent) of all 2015 burglary records in the file.
# NOTE: burglary_pct is a share of the total, not a per-capita rate.
burglary_df = burglary.groupby('Ward name').size().reset_index(name='burglary_count')
total = burglary_df['burglary_count'].sum()
burglary_df['burglary_pct'] = (burglary_df['burglary_count'] / total) * 100

# Load the demographic data
demographics_df = pd.read_excel("data/ward-profiles-excel-version.xls", sheet_name="Data")

# Drop the rows whose "Ward name" is one of these non-ward labels.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the original
# (which would trigger pandas' SettingWithCopyWarning).
to_filter = ["City of London", "London", "England", "Source:"]
demographics_df = demographics_df[~demographics_df["Ward name"].isin(to_filter)].copy()

# Convert the 2015 population columns to numeric dtypes.
# errors='coerce' turns any value that cannot be parsed into NaN.
numeric_columns = [
    'Population - 2015',
    'Children aged 0-15 - 2015',
    'Working-age (16-64) - 2015',
    'Older people aged 65+ - 2015',
    '% All Children aged 0-15 - 2015',
    '% All Working-age (16-64) - 2015',
    '% All Older people aged 65+ - 2015',
]
for column in numeric_columns:
    demographics_df[column] = pd.to_numeric(demographics_df[column], errors='coerce')

# Attach the burglary count and share to every demographic row.
# how='left' keeps all demographic rows; a ward with no matching burglary
# row gets NaN in burglary_count / burglary_pct.
# NOTE(review): the join key is the ward name only -- if the same name occurs
# in more than one borough the counts would be conflated; confirm, or join on
# a ward code if both datasets carry one.
analysis_df = pd.merge(demographics_df, burglary_df, on='Ward name', how='left')

analysis_df.head(20)


In [None]:
# Columns analysed in this cell.
children_col = '% All Children aged 0-15 - 2015'
burglary_col = 'burglary_pct'

# Keep only the wards that have both values. NaN can appear in either column:
# the demographic column was converted with errors='coerce', and the left
# merge leaves NaN for wards without a matching burglary row. The correlation
# and regression calls below need complete pairs, so every statistic and the
# plot are computed on the same NaN-free sample.
corr_df = analysis_df[[children_col, burglary_col]].dropna()

# Pearson correlation (linear association)
children_correlation, p_value = pearsonr(corr_df[children_col], corr_df[burglary_col])

print(f"Pearson correlation between % children and burglary rate: {children_correlation:.4f} (p-value: {p_value:.4f})")

# Spearman correlation (rank-based, less sensitive to outliers)
spearman_corr, spearman_p = spearmanr(corr_df[children_col], corr_df[burglary_col])

print(f"Spearman correlation between % children and burglary rate: {spearman_corr:.4f} (p-value: {spearman_p:.4f})")

# Linear regression of burglary share on % children
X = corr_df[[children_col]]
y = corr_df[burglary_col]

# statsmodels' OLS does not add an intercept on its own, so add a constant column
X_with_const = sm.add_constant(X)
model = sm.OLS(y, X_with_const).fit()

print("\nLinear Regression Results: Burglary Rate vs % Children")
print(model.summary())

# Scatter plot with the fitted regression line
plt.figure(figsize=(10, 6))
sns.regplot(
    x=children_col,
    y=burglary_col,
    data=corr_df,
    scatter_kws={'alpha':0.5},
    line_kws={'color':'red'}
)
plt.title('Relationship Between Child Population % and Burglary Rate')
plt.xlabel('Children aged 0-15 (% of population)')
# The plotted column is each ward's percentage of all 2015 burglaries,
# not a per-1000-population rate, so the axis is labelled accordingly.
plt.ylabel('Share of All 2015 Burglaries (%)')
plt.tight_layout()
plt.show()