In [1]:
# Dependencies
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Location of the combined WA crime extract, relative to the notebook
crime_all_path = "crime-data/combined_wacrime.csv"

# Read the CSV, using its first column as the row index
crime_all_df = pd.read_csv(crime_all_path, index_col=0, encoding="UTF-8")

# Discard any auto-generated "Unnamed" columns. The result has to be assigned
# back: evaluating .loc on its own builds the filtered frame and throws it away.
crime_all_df = crime_all_df.loc[:, ~crime_all_df.columns.str.contains('Unnamed')]

# Keep only the rows that have a value in every remaining column
crime_all_df = crime_all_df.dropna(how="any", axis=0)

In [27]:
# Keep the date column plus the roll-up column for each crime type
# (every other column of crime_all_df is left out of this frame)
total_columns = [
    'Month and Year',
    'Homicide Total',
    'Recent Sexual Offence Total',
    'Historical Sexual Offence Total',
    'Assault (Family) Total',
    'Assault (Non-Family) Total',
    'Threatening Behaviour (Family) Total',
    'Threatening Behaviour (Non-Family) Total',
    'Deprivation of Liberty Total',
    'Robbery Total',
    'Burglary Total',
    'Stealing of Motor Vehicle',
    'Stealing Total',
    'Property Damage Total',
    'Arson Total',
    'Drug Offences Total',
    'Receiving and Possession of Stolen Property Total',
    'Regulated Weapons Offences',
    'Graffiti',
    'Fraud & Related Offences Total',
    'Breach of Violence Restraint Order Total',
]
crime_all_df_totals = crime_all_df.loc[:, total_columns]
crime_all_df_totals

In [28]:
# Remove the date column so only the counts remain, then cast every count to int
no_date = crime_all_df_totals.drop(columns=["Month and Year"]).astype(int)
no_date

In [29]:
# Add up every row for each crime type; the label slice runs from the first
# crime-type column through the last one (both ends inclusive)
first_type, last_type = "Homicide Total", "Breach of Violence Restraint Order Total"
count_totals = no_date.loc[:, first_type:last_type].sum()
count_totals

In [30]:
# Descriptive statistics of the per-type totals, rounded to 2 decimal places
count_totals.describe().round(2)

In [31]:
# Horizontal bar chart of the per-type totals, sorted ascending before plotting
totals_plot = count_totals.sort_values().plot(kind='barh', figsize=(8,10), alpha=1, align="edge")
totals_plot.set_ylabel("Type of crime")
totals_plot.set_xlabel("Total number")
totals_plot.set_title("Highest to lowest type of crimes in WA from Jan 07 - Sep 23")

# Write the figure to disk before displaying it
plt.savefig("crime-data/Fig1.png", bbox_inches="tight")

plt.show()

In [33]:
# Rank the crime types once, largest total first
ranked_totals = count_totals.sort_values(ascending=False)

# The five crime types with the largest totals
high_five = ranked_totals.head(5)

# Combined total of every type outside the top five. Slicing from position 5
# onward (rather than tail(15)) stays correct if the number of crime types
# is ever different from 20.
other_highfive = ranked_totals.iloc[5:].sum()

In [32]:
# Pie chart of the five largest crime types plus one slice for everything else.
# Labels and values are derived from count_totals so the chart cannot drift
# from the data (hand-copied numbers go stale whenever the CSV changes).
ranked_totals = count_totals.sort_values(ascending=False)
top_five = ranked_totals.head(5)

slices = list(top_five.index) + ['Others']
numbers_top = top_five.tolist() + [ranked_totals.iloc[5:].sum()]
# One colour per slice: five named types followed by 'Others'
colors = ["yellowgreen", "red", "deepskyblue", "lightcoral", "purple", "slategrey"]

plt.pie(numbers_top, labels=slices, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)

# Equal aspect ratio so the pie is drawn as a circle
plt.axis("equal")
plt.title("Top 5 crimes in WA 2007-23")
plt.savefig("crime-data/Fig2.png", bbox_inches="tight")
plt.show()

In [34]:
# Total each crime type per region, then flip the table so crime types are
# rows and regions are columns.
# NOTE(review): 'Region' is not one of the columns selected into no_date, so
# this groupby presumably resolves it as the name of the row index -- confirm
# against the CSV layout.
region_grouped = no_date.groupby(['Region'])
region_totals = region_grouped.sum().T

# Metropolitan and Regional overlap with the individual areas, so leave them out
region_totals = region_totals.drop(columns=["Metropolitan", "Regional"])
region_totals

In [35]:
# Stacked horizontal bars: one bar per crime type, one coloured segment per area
region_totals = pd.DataFrame(region_totals)

region_ax = region_totals.plot(kind="barh", stacked=True, figsize=(10,12), alpha=1)
region_ax.set_title("Numbers of different types of crimes displayed by areas 2007-2023")
region_ax.set_xlabel("Number of each crime")
region_ax.set_ylabel("Types of crime")

# Save to disk, then display
plt.savefig("crime-data/Fig3.png", bbox_inches="tight")
plt.show()

In [36]:
# Rename the date column to 'Date', parse its '%b-%y' text into datetimes,
# then derive numeric Month and Year columns from the parsed dates
crime_all_df_totals = (
    crime_all_df_totals
    .rename(columns={'Month and Year': 'Date'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%b-%y'))
    .assign(
        Month=lambda frame: frame['Date'].dt.month,
        Year=lambda frame: frame['Date'].dt.year,
    )
)

# Only the year matters for the questions below: drop Month and Date and
# index the rows by Year
new_df = crime_all_df_totals.drop(columns=["Month", "Date"]).set_index("Year")
new_df

In [37]:
# Cast every count to int
new_df = new_df.astype(int)

# Total each crime type per year and keep the first 17 yearly rows
years_df = new_df.groupby(["Year"]).sum().head(17)
years_df

In [38]:
# Descriptive statistics of the yearly totals, rounded to 2 decimal places
years_df.describe().round(2)

In [39]:
# Flip the yearly totals so crime types become rows and years become columns.
# (The misspelled, never-used `plottear` re-wrap of this frame was removed:
# transpose() already returns a DataFrame.)
plotyear = years_df.transpose()

# Stacked horizontal bars: one bar per crime type, one coloured segment per year
plotyear.plot(kind="barh", stacked=True, figsize=(10,12), alpha=1)
plt.title("Numbers of different types of crimes displayed by years from 2007-2023")
plt.xlabel("Number of each crime")
plt.ylabel("Types of crime")

# Save to disk, then display
plt.savefig("crime-data/Fig4.png", bbox_inches="tight")
plt.show()

In [40]:
# Summary statistics for the per-type crime totals: quartiles, interquartile
# range, and the 1.5 * IQR fences used to flag potential outliers
quartiles = count_totals.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of totals of crime types is: {lowerq}")
print(f"The upper quartile of totals of crime types is: {upperq}")
print(f"The interquartile range of totals of crime types is: {iqr}")
print(f"The the median of total number of types of crime is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

# Check the totals against the fences instead of printing a fixed conclusion,
# so the message always reflects the data that was actually loaded
outliers = count_totals[(count_totals < lower_bound) | (count_totals > upper_bound)]
if outliers.empty:
    print("There are no outliers")
else:
    print(f"Potential outliers: {', '.join(map(str, outliers.index))}")

In [41]:
# Read the combined crime CSV
combined_wacrime = Path('crime-data/combined_wacrime.csv')
combined_wacrime_raw = pd.read_csv(combined_wacrime)

# Rows for the 'Regional' and 'Metropolitan' regions are excluded
is_rollup_region = combined_wacrime_raw['Region'].isin(['Regional', 'Metropolitan'])

# Build the cleaned frame in one chain. Using .assign on the filtered result
# (instead of assigning columns onto a boolean-filtered slice) avoids pandas'
# SettingWithCopyWarning and leaves the raw frame untouched.
combined_wacrime_df = (
    combined_wacrime_raw
    .loc[~is_rollup_region]
    # Parse the '%b-%y' text in 'Month and Year' into a new Date column
    .assign(Date=lambda frame: pd.to_datetime(frame['Month and Year'], format='%b-%y'))
    # Numeric month and year taken from the parsed date
    .assign(
        Month=lambda frame: frame['Date'].dt.month,
        Year=lambda frame: frame['Date'].dt.year,
    )
    # Drop every row that has a NaN in any column
    .dropna(how='any', axis=0)
)

combined_wacrime_df.head()

In [42]:
# Narrow the frame to Month, Year and the individual offence columns listed
# below; the '... Total' roll-up columns are not part of this selection
offence_columns = [
    'Month', 'Year',
    'Murder', 'Attempted / Conspiracy to Murder', 'Manslaughter',
    'Driving Causing Death',
    'Sexual Assault', 'Non-Assaultive Sexual Offences',
    'Serious Assault (Family)', 'Common Assault (Family)',
    'Serious Assault (Non-Family)', 'Common Assault (Non-Family)',
    'Assault Police Officer',
    'Threatening Behaviour (Family)', 'Possess Weapon to Cause Fear (Family)',
    'Threatening Behaviour (Non-Family)', 'Possess Weapon to Cause Fear (Non-Family)',
    'Kidnapping / Child Stealing', 'Deprivation of Liberty',
    'Robbery (Business)', 'Robbery (Non-Business)',
    'Burglary (Dwelling)', 'Burglary (Non-Dwelling)',
    'Stealing of Motor Vehicle',
    'Stealing From Motor Vehicle (Contents or Parts)',
    'Stealing From Retail Premises (Shoplift)', 'Stealing From Dwelling',
    'Stealing From Other Premises or Place', 'Stealing as a Servant',
    'Stealing (Not Elsewhere Classified)',
    'Criminal Damage', 'Damage',
    'Cause Bushfire', 'Cause Damage by Fire', 'Other Fire Related Offences',
    'Drug Dealing', 'Drug Possession', 'Possession of Drug Paraphernalia',
    'Cultivate or Manufacture Drugs', 'Other Drug Offences',
    'Possess Stolen Property', 'Receiving Stolen Property',
    'Regulated Weapons Offences', 'Graffiti',
    'Forgery', 'Fraud (Credit Card)', 'Fraud (Not Elsewhere Classified)',
    'Breach of Family Violence Restraint Order', 'Breach of Violence Restraint Order',
    'Breach of Police Order',
]
combined_wacrime_df = combined_wacrime_df.loc[:, offence_columns]
combined_wacrime_df

In [43]:
# Replace any NaN with 0, then cast every column to int
combined_wacrime_df = combined_wacrime_df.replace(np.nan, 0).astype(int)

# Turn the month number into a three-letter month name
combined_wacrime_df['Month'] = pd.to_datetime(combined_wacrime_df['Month'], format='%m').dt.month_name().str.slice(stop=3)

# Group the rows by year. (Printing the GroupBy object itself was removed: it
# only shows an object repr, not data.)
year_wacrime_df = combined_wacrime_df.groupby(['Year'])

# Yearly sums for the first 17 years. numeric_only=True keeps the text Month
# column out of the aggregation; without it, pandas >= 2.0 includes object
# columns in groupby sums.
year_wacrime_df.sum(numeric_only=True).head(17)

In [44]:
# Sum every numeric column per year, then append a 'Total Crimes' column
# holding the row-wise sum across those columns
total_crimes_df = (
    combined_wacrime_df
    .groupby('Year')
    .sum(numeric_only=True, min_count=0)
    .assign(**{'Total Crimes': lambda yearly: yearly.sum(axis=1)})
)

# Thousands-separated view of the first 17 yearly totals.
# Note: from here on years_df is bound to a Styler, not a DataFrame.
years_df = total_crimes_df[['Total Crimes']].head(17).style.format("{:,.0f}")
years_df

In [45]:
# Line chart of the 'Total Crimes' column against the frame's Year index
year_line = total_crimes_df.plot(kind='line', color='green', grid=True, y='Total Crimes', title=('Crime Rate Statistics For Each Year (2007 - 2023)'))
year_line.set_ylabel('Total Crimes')
year_line.set_xlabel('Year')
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [46]:
# Bar-chart version of the yearly totals, one bar per year
total_crimes = total_crimes_df['Total Crimes']

# Take the year labels from the data's own index so labels and bars always
# line up (a hand-typed list breaks as soon as the set of years changes)
years = [str(year) for year in total_crimes.index]
x_axis = np.arange(0, len(years))
tick_locations = list(x_axis)

plt.title('Crime Rate Statistics For Each Year (2007 - 2023)')
plt.xlabel('Year')
plt.ylabel('Total Crimes')

# Pad the x-range slightly around the bars; the y-axis starts at 200,000
# (not zero) and ends a little above the tallest bar
plt.xlim(-0.75, len(years)-.25)
plt.ylim(200000, max(total_crimes) + 10000)

plt.bar(x_axis, total_crimes, facecolor='lightblue', alpha=0.75, align='center')
plt.xticks(tick_locations, years, rotation=45, rotation_mode="anchor", ha="right", wrap=True)
plt.show()

In [49]:
# Remove the Year column so it is not part of the per-month sums.
# errors='ignore' lets this cell be re-run after Year has already been
# dropped instead of raising a KeyError.
combined_wacrime_df = combined_wacrime_df.drop(columns=['Year'], errors='ignore')

# Group the rows by month name. (Printing the GroupBy object itself was
# removed: it only shows an object repr, not data.)
months_wacrime_df = combined_wacrime_df.groupby(['Month'])

# Per-month sums, first 12 rows
months_wacrime_df.sum().head(12)


In [50]:
# Calendar order for the month rows (a plain sort of the names would be alphabetical)
new_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Sum every numeric column per month, append a 'Total Crimes' column holding
# the row-wise sum across those columns, then put the rows in calendar order
total_crimes_months_df = (
    combined_wacrime_df
    .groupby('Month')
    .sum(numeric_only=True, min_count=0)
    .assign(**{'Total Crimes': lambda monthly: monthly.sum(axis=1)})
    .reindex(new_order, axis=0)
)

# Thousands-separated view of the 12 monthly totals (a Styler, for display only)
months_df = total_crimes_months_df[['Total Crimes']].head(12).style.format('{:,.0f}')

months_df


In [51]:
# Line chart of the 'Total Crimes' column against the frame's Month index
month_plt = total_crimes_months_df.plot(kind='line', grid=True, color='blue', y='Total Crimes', title=('Crime Rate Statistics For Each Month Collectively (2007 - 2023)'))
month_plt.set_ylabel('Total Crimes')
month_plt.set_xlabel('Month')
month_plt.legend(loc='best')
# plt.show must be called; the bare name only echoes the function object
plt.show()


In [52]:
# Bar-chart version of the monthly totals, one bar per month label below
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
total_crimes = total_crimes_months_df['Total Crimes']

# One x position per month; the ticks sit at those same positions
x_axis = np.arange(0, len(months))
tick_locations = [position for position in x_axis]

# Pad the x-range slightly around the bars; the y-axis starts at 330,000
# (not zero) and ends a little above the tallest bar
plt.xlim(-0.75, len(months)-.25)
plt.ylim(330000, max(total_crimes) + 10000)

plt.bar(x_axis, total_crimes, facecolor='purple', alpha=0.75, align='center')
plt.xticks(tick_locations, months)

plt.title('Crime Rate Statistics For Each Month Collectively (2007 - 2023)')
plt.xlabel("Month")
plt.ylabel('Total Crimes')
plt.show()