In [None]:
# Dependencies
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Location of the combined WA crime CSV, relative to the notebook's working directory.
crime_all_path = "crime-data/combined_wacrime.csv"

# Load the CSV, using its first column as the index.
crime_all_df = pd.read_csv(crime_all_path, index_col=0, encoding="UTF-8")

# Keep only columns whose name does not contain 'Unnamed'. `.loc` returns a new
# frame, so the result has to be assigned for the filter to take effect.
crime_all_df = crime_all_df.loc[:, ~crime_all_df.columns.str.contains('Unnamed')]

# Discard every row that still has at least one missing value.
crime_all_df = crime_all_df.dropna(how="any", axis=0)

In [None]:
# Keep only the date column and the per-offence total columns listed below;
# any other column of crime_all_df is left out of this frame.
crime_all_df_totals = crime_all_df.loc[:, [
    'Month and Year',
    'Homicide Total',
    'Recent Sexual Offence Total',
    'Historical Sexual Offence Total',
    'Assault (Family) Total',
    'Assault (Non-Family) Total',
    'Threatening Behaviour (Family) Total',
    'Threatening Behaviour (Non-Family) Total',
    'Deprivation of Liberty Total',
    'Robbery Total',
    'Burglary Total',
    'Stealing of Motor Vehicle',
    'Stealing Total',
    'Property Damage Total',
    'Arson Total',
    'Drug Offences Total',
    'Receiving and Possession of Stolen Property Total',
    'Regulated Weapons Offences',
    'Graffiti',
    'Fraud & Related Offences Total',
    'Breach of Violence Restraint Order Total',
]]
crime_all_df_totals

In [None]:
# Remove the date column so that only the offence totals remain, then cast
# every remaining column to int.
no_date = crime_all_df_totals.drop(columns=["Month and Year"]).astype(int)
no_date

In [None]:
# Column-wise sum over the label range from "Homicide Total" through
# "Breach of Violence Restraint Order Total": one overall total per crime type.
count_totals = no_date.loc[
    :, slice("Homicide Total", "Breach of Violence Restraint Order Total")
].sum(axis=0)
count_totals

In [None]:
# Descriptive statistics of the per-crime-type totals, rounded to 2 decimals.
count_totals.describe().round(2)

In [None]:
# Horizontal bar chart of the per-crime-type totals. The values are sorted
# ascending because barh draws the first row at the bottom, which puts the
# largest total at the top of the chart.
totals_plot = count_totals.sort_values().plot.barh(figsize=(8, 10), alpha=1, align="edge")
totals_plot.set_ylabel("Type of crime")
totals_plot.set_xlabel("Total number")
totals_plot.set_title("Highest to lowest type of crimes in WA from Jan 07 - Sep 23")

# Write the figure to disk before it is shown.
plt.savefig("crime-data/Fig1.png", bbox_inches="tight")

plt.show()

In [None]:
# Five largest per-crime-type totals, in descending order.
high_five = count_totals.nlargest(5)

# Combined total of every crime type outside the top five. Computed by
# subtraction so it stays correct however many crime types count_totals holds,
# instead of relying on a hard-coded tail length.
other_highfive = count_totals.sum() - high_five.sum()

In [None]:
# Pie chart of the five largest crime-type totals plus one combined slice for
# all remaining types. Labels and values are taken from high_five and
# other_highfive so the chart always reflects the loaded data rather than
# numbers typed in by hand.
slices = [*high_five.index, 'Others']
numbers_top = [*high_five.to_list(), other_highfive]
# One colour per slice: five for the top crime types, the last one for 'Others'.
colors = ["yellowgreen", "red", "deepskyblue", "lightcoral", "purple", "slategrey"]

plt.pie(numbers_top, labels=slices, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis("equal")
plt.title("Top 5 crimes in WA 2007-23")
plt.savefig("crime-data/Fig2.png", bbox_inches="tight")
plt.show()

In [None]:
# Sum every crime-type column per 'Region' group, transpose so that crime types
# become rows and regions become columns, then drop the "Metropolitan" and
# "Regional" columns because they overlap with the other areas.
# NOTE(review): 'Region' is not one of the selected columns, so this presumably
# groups on an index level named 'Region' - confirm against the CSV.
region_totals = (
    no_date.groupby('Region')
    .sum()
    .T
    .drop(columns=["Metropolitan", "Regional"])
)
region_totals

In [None]:
# Stacked horizontal bar chart: one bar per crime type (rows of region_totals),
# one coloured segment per area (columns of region_totals).
# region_totals is already a DataFrame, so it is plotted directly.
region_totals.plot(kind="barh", stacked=True, figsize=(10,12), alpha=1)
plt.title("Numbers of different types of crimes displayed by areas 2007-2023")
plt.xlabel("Number of each crime")
plt.ylabel("Types of crime")
plt.savefig("crime-data/Fig3.png", bbox_inches="tight")
plt.show()

In [None]:
# Rename 'Month and Year' to 'Date', parse it as a datetime using the '%b-%y'
# format, and derive numeric 'Month' and 'Year' columns from the parsed dates.
crime_all_df_totals = (
    crime_all_df_totals
    .rename(columns={'Month and Year': 'Date'})
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], format='%b-%y'),
        Month=lambda frame: frame['Date'].dt.month,
        Year=lambda frame: frame['Date'].dt.year,
    )
)

# The yearly analysis only needs the year: drop 'Month' and 'Date' and use
# 'Year' as the index.
new_df = crime_all_df_totals.drop(columns=["Month", "Date"]).set_index("Year")
new_df

In [None]:
# Cast every column to int before aggregating.
new_df = new_df.astype(int)

# Sum each crime type per year and keep the first 17 yearly rows.
years_df = new_df.groupby("Year").sum().head(17)
years_df

In [None]:
# Descriptive statistics of the yearly totals per crime type, rounded to 2 decimals.
years_df.describe().round(2)

In [None]:
# Stacked horizontal bar chart: one bar per crime type, one coloured segment
# per year. Transposing years_df turns crime types into rows and years into
# columns; the result is already a DataFrame and is plotted directly.
plotyear = years_df.transpose()

plotyear.plot(kind="barh", stacked=True, figsize=(10,12), alpha=1)
plt.title("Numbers of different types of crimes displayed by years from 2007-2023")
plt.xlabel("Number of each crime")
plt.ylabel("Types of crime")
plt.savefig("crime-data/Fig4.png", bbox_inches="tight")
plt.show()

In [None]:
# Summary statistics for the per-crime-type totals, with an IQR-based outlier
# check: a total is flagged when it lies more than 1.5 * IQR below the lower
# quartile or above the upper quartile.
quartiles = count_totals.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

print(f"The lower quartile of totals of crime types is: {lowerq}")
print(f"The upper quartile of totals of crime types is: {upperq}")
print(f"The interquartile range of totals of crime types is: {iqr}")
print(f"The the median of total number of types of crime is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

# Derive the conclusion from the data instead of asserting it: list every
# crime type whose total falls outside the two bounds.
outliers = count_totals[(count_totals < lower_bound) | (count_totals > upper_bound)]
if outliers.empty:
    print("There are no outliers")
else:
    print("Potential outliers:")
    print(outliers.to_string())