# Crime data in the context of immigration and population data

Three things affect the data:

* From 2015 to 2020 there was a change in policing strategy to Frontline 2020.
* In 2020 there was a sharp dip due to COVID-19.
* There appears to be a seasonal swing in the data, but we could not find the cause; it does not seem to be weather related.

https://www.police.wa.gov.au/Crime/CrimeStatistics
https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/latest-release
https://www.abs.gov.au/statistics/people/population/migration-australia/latest-release

https://www.perthnow.com.au/news/wa/public-satisfaction-with-police-up-but-new-model-needs-improving-ng-36a3bbb3119fc33453283c45345f159a
https://www.abc.net.au/news/2016-02-11/wa-police-commissioner-backs-down-on-new-policing-model/7159736
https://wamnnews.com.au/news/wa-police-overhauls-front-line-model-to-combat-rising-crime-rate/
https://thewest.com.au/politics/law-and-order/secret-police-files-liza-harvey-says-frontline-2020-policing-model-was-never-given-a-chance-ng-b881236411z
https://www.abc.net.au/news/2020-04-08/coronavirus-shutdown-sees-crime-rate-drop-in-wa/12132410

In [50]:
# Dependencies
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from scipy import stats
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import linregress
import requests
import hvplot.pandas

# Import the Geoapify API key
from api_keys import geoapify_key

In [51]:
# functions ##

# load and clean crime datasets
def load_crime_data(datasets, path):
    """Load one CSV per dataset name and stack them into a single DataFrame.

    Each file is read from ``f"{path}{dataset}.csv"`` and gets a ``region``
    column holding the dataset name it was loaded from.

    Args:
        datasets: Iterable of dataset names (the variable part of the file
            name).
        path: Prefix placed in front of every dataset name, for example
            ``'crime-data/wa_crime_'``.

    Returns:
        The per-file frames concatenated in the order given. Row labels of
        the individual files are kept, so they repeat across files. An empty
        DataFrame is returned when ``datasets`` is empty.
    """
    frames = []
    for dataset in datasets:
        # load the csv file and tag every row with the dataset it came from
        temp_df = pd.read_csv(Path(f"{path}{dataset}.csv"))
        temp_df["region"] = dataset
        frames.append(temp_df)

    # pd.concat raises on an empty list, so keep the empty-input result explicit
    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the accumulated frame per file
    return pd.concat(frames)

In [53]:
# Load the crime data sets into pandas

# One CSV file per entry, named "<prefix><entry>.csv".
# NOTE(review): "plibara" looks like a misspelling of "pilbara" - confirm it
# matches the actual file name before changing it.
regions = [
    "goldfields-esperance",
    "mandurah",
    "mirrabooka",
    "south_west",
    "armadale",
    "great_southern",
    "perth",
    "wheatbelt",
    "cannington",
    "joondalup",
    "mid_west-gascoyne",
    "plibara",
    "fremantle",
    "kimberley",
    "midland",
]
# The two aggregate data sets, loaded into their own frame
metro_region = ["metropolitan", "regional"]

crime_file_prefix = 'crime-data/wa_crime_'
crime_all_df = load_crime_data(regions, crime_file_prefix)
crime_metro_region_all_df = load_crime_data(metro_region, crime_file_prefix)

crime_metro_region_all_df.head()

In [54]:
# Parse the "Mon-yy" labels, move each one to the last day of its month,
# and use the result as the 'Date' index
dates = pd.to_datetime(crime_metro_region_all_df['Month and Year'], format='%b-%y')
last_days = dates + pd.offsets.MonthEnd(1)
crime_metro_region_all_df = (
    crime_metro_region_all_df
    .assign(**{'Month and Year': last_days})
    .rename(columns={'Month and Year': 'Date'})
    .set_index('Date')
)

In [55]:
# Load the immigration figures, keeping only the date and WA columns
file_path = Path("other-data/immigration-data.csv")
immigration_df = pd.read_csv(file_path)[['DateTime', 'WA']]

# Parse the ISO 8601 timestamps and give the columns descriptive names
dates = pd.to_datetime(immigration_df['DateTime'], format='ISO8601')
immigration_df = (
    immigration_df
    .assign(DateTime=dates)
    .rename(columns={'DateTime': 'Date', 'WA': 'Total WA immigration'})
)

# Keep the dates as an ordinary column too; 'Date' itself becomes the index
immigration_df['Date1'] = immigration_df['Date'].copy()
immigration_df = immigration_df.set_index('Date')

immigration_df.head()

In [56]:
# Sum the monthly rows into quarters, then trim to the date window used below
crime_quarterly_df = (
    crime_metro_region_all_df
    .resample('Q')
    .sum()
    .loc['2010-06-01':'2022-07-01']
)

# Columns to discard. The commented-out "... Total" names are left out of the
# list on purpose, so presumably those are the columns that remain.
# (crime_metro_region_all_df.columns.to_list() shows all the columns)
unwanted_columns = [
    #'Homicide Total',
    'Murder',
    'Attempted / Conspiracy to Murder',
    'Manslaughter',
    'Driving Causing Death',
    #'Recent Sexual Offence Total',
    'Sexual Assault',
    'Non-Assaultive Sexual Offences',
    #'Historical Sexual Offence Total',
    'Sexual Assault.1',
    'Non-Assaultive Sexual Offences.1',
    #'Assault (Family) Total',
    'Serious Assault (Family)',
    'Common Assault (Family)',
    #'Assault (Non-Family) Total',
    'Serious Assault (Non-Family)',
    'Common Assault (Non-Family)',
    'Assault Police Officer',
    #'Threatening Behaviour (Family) Total',
    'Threatening Behaviour (Family)',
    'Possess Weapon to Cause Fear (Family)',
    #'Threatening Behaviour (Non-Family) Total',
    'Threatening Behaviour (Non-Family)',
    'Possess Weapon to Cause Fear (Non-Family)',
    #'Deprivation of Liberty Total',
    'Kidnapping / Child Stealing',
    'Deprivation of Liberty',
    #'Robbery Total',
    'Robbery (Business)',
    'Robbery (Non-Business)',
    'Unnamed: 31',
    #'Burglary Total',
    'Burglary (Dwelling)',
    'Burglary (Non-Dwelling)',
    'Stealing of Motor Vehicle',
    #'Stealing Total',
    'Stealing From Motor Vehicle (Contents or Parts)',
    'Stealing From Retail Premises (Shoplift)',
    'Stealing From Dwelling',
    'Stealing From Other Premises or Place',
    'Stealing as a Servant',
    'Stealing (Not Elsewhere Classified)',
    #'Property Damage Total',
    'Criminal Damage',
    'Damage',
    #'Arson Total',
    'Cause Bushfire',
    'Cause Damage by Fire',
    'Other Fire Related Offences',
    'Unnamed: 50',
    'Unnamed: 51',
    #'Drug Offences Total',
    'Drug Dealing',
    'Drug Possession',
    'Possession of Drug Paraphernalia',
    'Cultivate or Manufacture Drugs',
    'Other Drug Offences',
    #'Receiving and Possession of Stolen Property Total',
    'Possess Stolen Property',
    'Receiving Stolen Property',
    'Regulated Weapons Offences',
    'Unnamed: 62',
    'Graffiti',
    #'Fraud & Related Offences Total',
    'Forgery',
    'Fraud (Credit Card)',
    'Fraud (Not Elsewhere Classified)',
    #'Breach of Violence Restraint Order Total',
    'Breach of Family Violence Restraint Order',
    'Breach of Violence Restraint Order',
    'Breach of Police Order',
    'Total Selected Miscellaneous Offences',
    'region',
]
crime_quarterly_df = crime_quarterly_df.drop(columns=unwanted_columns)

# Row-wise sum of every remaining column
crime_quarterly_df['Total Crime'] = crime_quarterly_df.sum(axis=1)
crime_quarterly_df.head()

In [57]:
# Summary statistics
# max, min, mean, median, variance and standard deviation of the quarterly
# 'Total Crime' values within each calendar year
crime_yearly_descriptive_stats_df = crime_quarterly_df.copy()
yearly_groups = crime_yearly_descriptive_stats_df.groupby(
    crime_yearly_descriptive_stats_df.index.year
)
crime_yearly_descriptive_stats_df = yearly_groups['Total Crime'].agg(
    ['max', 'min', 'mean', 'median', 'var', 'std']
)
crime_yearly_descriptive_stats_df

In [58]:
# Join the quarterly crime totals with the immigration figures on 'Date',
# keep the three columns needed for plotting, and store the crime total as int
immigration_vs_crime_df = (
    crime_quarterly_df
    .merge(immigration_df, on="Date")
    [['Total Crime', 'Total WA immigration', 'Date1']]
    .assign(**{'Total Crime': lambda frame: frame['Total Crime'].astype(int)})
)
immigration_vs_crime_df.head()


In [59]:
# Line graph of total crime and WA immigration over time, one y-axis each
fig, ax1 = plt.subplots(figsize=(20, 5))

quarter_dates = immigration_vs_crime_df['Date1']

# Left axis: total crime in red
ax1.plot(quarter_dates, immigration_vs_crime_df['Total Crime'], color='red')
ax1.set_xlabel('Date')
ax1.set_ylabel('Total Crime', color='red')
ax1.tick_params(axis='y', labelcolor='red')

# Right axis (twin of the first, same x-axis): immigration in blue
ax2 = ax1.twinx()
ax2.plot(quarter_dates, immigration_vs_crime_df['Total WA immigration'], color='blue')
ax2.set_ylabel('WA Immigration', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# Major tick every six months, labelled like "Jun-2015", with rotated labels
ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%Y'))
fig.autofmt_xdate()

# Show plot
plt.show()

In [60]:
# Scatter plot of WA immigration (x) against total crime (y) with a fitted line
x_values = immigration_vs_crime_df['Total WA immigration']
y_values = immigration_vs_crime_df['Total Crime']

# Least-squares fit; the r-squared value is printed along with the plot
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# The equation label sits at fixed data coordinates
plt.annotate(line_eq, (30000, 73000), fontsize=10, color="red")
plt.xlabel('Total WA Immigration')
plt.ylabel('Total WA Crime')
print(f"The r-squared is: {rvalue**2}")
plt.show()

In [61]:
# Load the population figures and give the two columns of interest short names
file_path = Path("other-data/population-data.csv")
column_names = {
    'Unnamed: 0': 'Date',
    'Estimated Resident Population ;  Persons ;  Western Australia ;': 'WA Population',
}
# The first 10 rows are skipped - presumably header/metadata rows; confirm against the CSV
population_df = pd.read_csv(file_path).rename(columns=column_names).iloc[10:]

# Parse the "Mon-YYYY" labels and move each one to the last day of its month
dates = pd.to_datetime(population_df['Date'], format='%b-%Y')
last_days = dates + pd.offsets.MonthEnd(1)

# 'Date' becomes the index; 'Date1' keeps the same dates as an ordinary column
population_df = population_df.assign(Date=last_days, Date1=last_days.copy())
population_df = population_df[['Date', 'WA Population', 'Date1']].set_index('Date')

population_df.head()

In [62]:
# Join the quarterly crime totals with the population figures on 'Date' and
# keep only the columns needed for plotting. The copy is taken after the column
# selection so the assignments below work on an independent frame.
population_vs_crime_df = (
    crime_quarterly_df
    .merge(population_df, on="Date")
    [['Total Crime', 'WA Population', 'Date1']]
    .copy()
)

# Store both measures as integers
population_vs_crime_df['Total Crime'] = population_vs_crime_df['Total Crime'].astype(int)
population_vs_crime_df['WA Population'] = population_vs_crime_df['WA Population'].astype(int)

population_vs_crime_df.head()


In [63]:
# Line graph of total crime and WA population over time, one y-axis each
fig, ax1 = plt.subplots(figsize=(20, 5))

quarter_dates = population_vs_crime_df['Date1']

# Left axis: total crime in red
ax1.plot(quarter_dates, population_vs_crime_df['Total Crime'], color='red')
ax1.set_xlabel('Date')
ax1.set_ylabel('Total Crime', color='red')
ax1.tick_params(axis='y', labelcolor='red')

# Right axis (twin of the first, same x-axis): population in blue
ax2 = ax1.twinx()
ax2.plot(quarter_dates, population_vs_crime_df['WA Population'], color='blue')
ax2.set_ylabel('WA Population', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# Major tick every six months, labelled like "Jun-2015", with rotated labels
ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%Y'))
fig.autofmt_xdate()

# Show plot
plt.show()

In [64]:
# Scatter plot of WA population (x) against total crime (y) with a fitted line
x_values = population_vs_crime_df['WA Population']
y_values = population_vs_crime_df['Total Crime']

# Least-squares fit; the r-squared value is printed along with the plot
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# The equation label sits at fixed data coordinates
plt.annotate(line_eq, (2300000, 73000), fontsize=10, color="red")
plt.xlabel('Total WA Population')
plt.ylabel('Total WA Crime')
print(f"The r-squared is: {rvalue**2}")
plt.show()

In [65]:
# Load the combined crime file
csv_file = Path("crime-data/combined_wacrime.csv")
city_data_df = pd.read_csv(csv_file)

# Remove the last row from the data frame, then treat missing values as zero
last_row_label = city_data_df.index[-1:]
city_data_df.drop(last_row_label, inplace=True)
new_crime_data = city_data_df.fillna(0)

# Display sample data
new_crime_data

In [66]:
# List the column names to decide which ones need to be removed
new_crime_data.columns

In [67]:
# Clean data - drop the date, the pre-computed "... Total" columns and the
# unnamed columns before using groupby
columns_to_drop = [
    'Month and Year', 'Homicide Total', 'Recent Sexual Offence Total',
    'Historical Sexual Offence Total', 'Assault (Family) Total', 'Assault (Non-Family) Total',
    'Threatening Behaviour (Family) Total', 'Threatening Behaviour (Non-Family) Total',
    'Deprivation of Liberty Total', 'Robbery Total', 'Burglary Total', 'Stealing Total',
    'Property Damage Total', 'Arson Total', 'Drug Offences Total',
    'Receiving and Possession of Stolen Property Total', 'Fraud & Related Offences Total',
    'Breach of Violence Restraint Order Total', 'Total Selected Miscellaneous Offences',
    'Unnamed: 32', 'Unnamed: 52', 'Unnamed: 51', 'Unnamed: 63',
]
no_date_df = new_crime_data.drop(columns=columns_to_drop)

# Group by region; 'Region' becomes the index of the result, not a column
grouped_data = no_date_df.groupby(['Region']).sum()

# Add a final column that is the sum of all crimes in the row.
# 'Region' is the index and 'Total' does not exist yet, so every numeric column
# is summed. The earlier iloc[:, 1:-1] slice left out the first and last columns.
# NOTE(review): assumes every remaining numeric column is an offence count -
# confirm against combined_wacrime.csv.
grouped_data['Total'] = grouped_data.sum(axis=1, numeric_only=True)

# Display sample data
grouped_data

In [68]:
# List the remaining columns to check that the cleaning worked as intended
grouped_data.columns

In [69]:
# Add empty-string placeholder columns for latitude and longitude;
# they are filled in by the geocoding loop
for coordinate_column in ("Lat", "Lon"):
    grouped_data[coordinate_column] = ""
grouped_data

In [70]:
# Turn 'Region' back into an ordinary column, then swap in a name that
# Geoapify can find for the Mid West-Gascoyne entry
indexed_data = grouped_data.reset_index()
geocoder_friendly_names = {'Mid_West_Gascoyne': 'West_Gascoyne'}
fixed_data = indexed_data.replace(geocoder_friendly_names)
fixed_data

In [71]:
# Set the base URL of the Geoapify geocoding search endpoint
base_url = "https://api.geoapify.com/v1/geocode/search"

# Define the API parameters shared by every request
params = dict(apiKey=geoapify_key, format="json")

In [72]:
# Loop through fixed_data and look up coordinates for each region
for index, row in fixed_data.iterrows():

    # Append ", WA, Australia" to the region name so Geoapify searches within
    # Western Australia
    city = row["Region"] + ", WA, Australia"

    # Add the current region to the parameters
    params["text"] = city

    # Make the API request; the timeout stops a stalled connection from hanging
    # the notebook, and HTTP errors are raised instead of surfacing later as a
    # missing "results" key
    response = requests.get(base_url, params=params, timeout=10)
    response.raise_for_status()

    # Convert response to JSON and take the list of matches
    results = response.json().get("results", [])

    if not results:
        # No match: record missing coordinates rather than failing on results[0]
        print(f"No coordinates found for {city}")
        fixed_data.loc[index, ["Lat", "Lon"]] = np.nan
        continue

    # Extract latitude and longitude from the first match
    fixed_data.loc[index, "Lat"] = results[0]["lat"]
    fixed_data.loc[index, "Lon"] = results[0]["lon"]

# Display sample data to confirm that the coordinates appear
fixed_data

In [73]:
%%capture --no-display
# Configure the map plot: one point per row of fixed_data on OSM tiles,
# with size taken from the 'Total' column and colour from 'Region'
map_options = dict(
    geo=True,
    tiles="OSM",
    frame_width=840,
    frame_height=600,
    size="Total",
    scale=0.01,
    color="Region",
)
map_plot = fixed_data.hvplot.points("Lon", "Lat", **map_options)

# Display the map plot
map_plot

In [74]:
# Drop the rows that are outside main Perth.
# NOTE(review): the numbers are row labels from the reset index, so they depend
# on the row order of fixed_data - confirm they still match the intended regions.
dropped_data = fixed_data.drop(index=[3, 6, 9, 13, 16])
dropped_data

In [75]:
%%capture --no-display
# Configure the map plot: one point per row of dropped_data on OSM tiles,
# with size taken from the 'Total' column and colour from 'Region'
map_options = dict(
    geo=True,
    tiles="OSM",
    frame_width=840,
    frame_height=600,
    size="Total",
    scale=0.05,
    color="Region",
)
map_plot_2 = dropped_data.hvplot.points("Lon", "Lat", **map_options)

# Display the map plot
map_plot_2

In [76]:
# Sort by 'Total' (ascending), then drop rows 8 and 14 - presumably the
# 'Metropolitan' and 'Regional' aggregate rows; confirm against fixed_data
bigsmall_data = fixed_data.sort_values(by=['Total'])
small_big = bigsmall_data.drop(index=[8, 14])
small_big

In [77]:
%%capture --no-display
# Configure the map plot: one point per row of small_big on OSM tiles,
# with size taken from the 'Total' column and colour from 'Region'
map_options = dict(
    geo=True,
    tiles="OSM",
    frame_width=840,
    frame_height=600,
    size="Total",
    scale=0.01,
    color="Region",
)
map_plot_3 = small_big.hvplot.points("Lon", "Lat", **map_options)

# Display the map plot
map_plot_3

In [78]:
# Drop the regional rows.
# NOTE(review): the numbers are row labels carried over from fixed_data - confirm
# they still match the intended regions if the input data changes.
dropped_data2 = small_big.drop(index=[3, 4, 6, 9, 13, 16])
dropped_data2

In [79]:
%%capture --no-display
# Configure the map plot: one point per row of dropped_data2 on OSM tiles,
# with size taken from the 'Total' column and colour from 'Region'
map_options = dict(
    geo=True,
    tiles="OSM",
    frame_width=840,
    frame_height=600,
    size="Total",
    scale=0.05,
    color="Region",
)
map_plot_4 = dropped_data2.hvplot.points("Lon", "Lat", **map_options)

# Display the map plot
map_plot_4

In [80]:
# Reverse the ascending sort so the rows run from largest to smallest total
big_small = small_big.iloc[::-1, :]
big_small

In [81]:
# Set x axis and tick locations; bars are edge-aligned, so each tick is shifted
# right by 0.4 to sit under the middle of its bar
x_axis = np.arange(len(big_small))
tick_locations = [value+0.4 for value in x_axis]

# Set the figure size, draw the bars and write the region names as x labels
plt.figure(figsize=(8,5))
plt.bar(x_axis, big_small["Total"], color='r', alpha=0.5, align="edge")
plt.xticks(tick_locations, big_small["Region"], rotation="vertical")

# Set x and y limits
plt.xlim(-0.15, len(x_axis)-0.05)
plt.ylim(0, max(big_small["Total"])+10000)

# Set a Title and labels
plt.title("Total Crimes in Each Region")
plt.xlabel("Region")
plt.ylabel("Total Crimes")

# Lines for average and median, computed from the same frame that is plotted
mean = big_small["Total"].mean()
median = big_small["Total"].median()
plt.axhline(mean)
plt.axhline(median, color = "black")
print(mean)

# Call show(); the bare name only referenced the function without running it
plt.show()

In [82]:
# Drop the regional rows so only the metropolitan ones remain.
# NOTE(review): the numbers are row labels carried over from fixed_data - confirm
# they still match the intended regions if the input data changes.
metro_bigsmall = big_small.drop(index=[3, 4, 6, 9, 13, 16])

metro_bigsmall

In [83]:
# Set x axis and tick locations; bars are edge-aligned, so each tick is shifted
# right by 0.4 to sit under the middle of its bar
x_axis = np.arange(len(metro_bigsmall))
tick_locations = [value+0.4 for value in x_axis]

# Set the figure size, draw the bars and write the region names as x labels
plt.figure(figsize=(8,5))
plt.bar(x_axis, metro_bigsmall["Total"], color='r', alpha=0.5, align="edge")
plt.xticks(tick_locations, metro_bigsmall["Region"], rotation="vertical")

# Set x and y limits
plt.xlim(-0.15, len(x_axis)-0.05)
plt.ylim(0, max(metro_bigsmall["Total"])+10000)

# Set a Title and labels
plt.title("Total Crimes in Each Region")
plt.xlabel("Region")
plt.ylabel("Total Crimes")

# Lines for average and median
mean = metro_bigsmall["Total"].mean()
median = metro_bigsmall["Total"].median()
plt.axhline(mean)
plt.axhline(median, color = "black")
print(mean)

# Call show(); the bare name only referenced the function without running it
plt.show()