# Statistical Analysis for Chicago crime dataset



# Data ingestion

In [1]:
# %pip install gdown dask pyarrow
# import dask.dataframe as dd

In [2]:
# # import lib for loading the dataset 
import gdown
import zipfile

# # Importing the dataset from google drive
# raw_link = "https://drive.google.com/file/d/1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X/view?usp=sharing"
# id = "1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X"
# file_path = "crime.zip"

# # Loading the dataset
# gdown.download(f"https://drive.google.com/uc?id={id}",file_path, quiet=False)

# uncomment this code to download the data.


In [3]:
# Data wrangling libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# lib for datetime
from datetime import datetime, timedelta

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Set up views
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [4]:
# Extracting and listing the files in the zipped dataset
# with zipfile.ZipFile(file_path, "r") as z:
#     # List files
#     print(z.namelist()) 
#     z.extractall("crime_dataset")


# Commenting this out because I have read/loaded the dataset to my workspace.
    

In [5]:
import pandas as pd
# Define data types to reduce memory usage
dtype_dict = {
    'ID' : 'Int32',
    'Case Number': 'string',
    'Block': 'string',
    'IUCR': 'category',
    'Primary Type': 'category',
    'Description': 'category',
    'Location Description': 'category',
    'Arrest': 'boolean',
    'Domestic': 'boolean',
    'Beat': 'Int64',
    'District': 'Int64',
    'Ward': 'Int64',
    'Community Area': 'Int64',
    'FBI Code': 'category',
    'X Coordinate': 'float32',
    'Y Coordinate': 'float32',
    'Year': 'float64',
    'Latitude': 'float32',
    'Longitude': 'float32',
    'Location': 'string'
}

# Loading the dataset with dask to handle the huge csv files
crime_data = pd.read_csv('crime_dataset/Crimes_-_2001_to_Present.csv',dtype=dtype_dict, parse_dates=['Date', 'Updated On'],date_format="%m/%d/%y %I:%M:%S %p",low_memory=False,keep_default_na=True)



# Print the first five rows
crime_data.head()




Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015.0,02/10/2018 03:50:01 PM,41.815117,-87.669998,"(41.815117282, -87.669999562)"
1,10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015.0,02/10/2018 03:50:01 PM,41.895081,-87.765404,"(41.895080471, -87.765400451)"
2,11646166,JC213529,09/01/2018 12:01:00 AM,082XX S INGLESIDE AVE,810,THEFT,OVER $500,RESIDENCE,False,True,631,6,8,44,06,,,2018.0,04/06/2019 04:04:43 PM,,,
3,10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015.0,02/10/2018 03:50:01 PM,41.937405,-87.716652,"(41.937405765, -87.716649687)"
4,10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015.0,02/10/2018 03:50:01 PM,41.881905,-87.755119,"(41.881903443, -87.755121152)"


In [6]:
# Creating a copy of the dataset
crime_dataset = crime_data.copy()
crime_dataset.tail()


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
7784659,12847575,JF420478,09/01/2022 05:00:00 AM,005XX W SURF ST,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,1934,19,44,6,26,1172497.0,1919410.0,2022.0,01/03/2023 03:46:28 PM,41.934303,-87.641487,"(41.934304581, -87.641484982)"
7784660,12847801,JF420319,07/08/2022 12:00:00 AM,114XX S PRAIRIE AVE,1130,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,STREET,False,False,531,5,9,49,11,1179966.0,1828818.0,2022.0,01/03/2023 03:46:28 PM,41.685543,-87.616814,"(41.685543881, -87.616812541)"
7784661,12847324,JF420102,09/27/2022 11:00:00 AM,023XX E 70TH ST,0810,THEFT,OVER $500,RESIDENCE,False,False,331,3,5,43,6,1193181.0,1859005.0,2022.0,01/03/2023 03:46:28 PM,41.768066,-87.567451,"(41.768068052, -87.567452932)"
7784662,12847570,JF420427,09/03/2022 10:25:00 AM,052XX W CARMEN AVE,2021,NARCOTICS,POSSESS - BARBITURATES,RESIDENCE - YARD (FRONT / BACK),True,False,1623,16,45,11,18,1140553.0,1933418.0,2022.0,01/03/2023 03:46:28 PM,41.973392,-87.758537,"(41.973391184, -87.758534512)"
7784663,12840464,JF411839,09/26/2022 07:20:00 PM,0000X N MASON AVE,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,SIDEWALK,True,False,1513,15,29,25,15,1136773.0,1899652.0,2022.0,01/03/2023 03:46:28 PM,41.880802,-87.773247,"(41.880802263, -87.773245737)"


# Preliminary data analysis

In [7]:
# Converting the date to datetime
crime_dataset['Date'] = pd.to_datetime(crime_dataset['Date'])
crime_dataset['Date'].dtypes

dtype('<M8[ns]')

In [8]:
# Checking the data type
crime_data_type = crime_dataset.dtypes
print("Data type\n:", crime_data_type)

Data type
: ID                               Int32
Case Number             string[python]
Date                    datetime64[ns]
Block                   string[python]
IUCR                          category
Primary Type                  category
Description                   category
Location Description          category
Arrest                         boolean
Domestic                       boolean
Beat                             Int64
District                         Int64
Ward                             Int64
Community Area                   Int64
FBI Code                      category
X Coordinate                   float32
Y Coordinate                   float32
Year                           float64
Updated On                      object
Latitude                       float32
Longitude                      float32
Location                string[python]
dtype: object


In [9]:
# Printing the columns 
crime_data_cols = pd.read_csv('crime_dataset/Crimes_-_2001_to_Present.csv', nrows=5)
print(crime_data_cols.columns)


Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location'], dtype='object')


In [10]:
# Renaming the columns (removing whitespaces, convert to lower case and replace " " with ",") 
crime_dataset = crime_data.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))
first_five_rows = crime_dataset.head()
print("First five rows in the dataset:", first_five_rows)

First five rows in the dataset:          id case_number                    date                  block  iucr primary_type              description location_description  arrest  domestic  beat  district  ward  community_area fbi_code  x_coordinate  y_coordinate    year              updated_on   latitude  longitude                       location
0  10224738    HY411648  09/05/2015 01:30:00 PM        043XX S WOOD ST  0486      BATTERY  DOMESTIC BATTERY SIMPLE            RESIDENCE   False      True   924         9    12              61      08B     1165074.0     1875917.0  2015.0  02/10/2018 03:50:01 PM  41.815117 -87.669998  (41.815117282, -87.669999562)
1  10224739    HY411615  09/04/2015 11:30:00 AM    008XX N CENTRAL AVE  0870        THEFT           POCKET-PICKING              CTA BUS   False     False  1511        15    29              25       06     1138875.0     1904869.0  2015.0  02/10/2018 03:50:01 PM  41.895081 -87.765404  (41.895080471, -87.765400451)
2  11646166    JC213529  0

In [11]:
# Checking the dataset for missing values
crime_dataset.isna().sum()

id                           0
case_number                  4
date                         0
block                        0
iucr                         0
primary_type                 0
description                  0
location_description     10381
arrest                       0
domestic                     0
beat                         0
district                    47
ward                    614848
community_area          613476
fbi_code                     0
x_coordinate             86848
y_coordinate             86848
year                         0
updated_on                   0
latitude                 86848
longitude                86848
location                 86848
dtype: int64

In [12]:
# Checking the dataset for duplicated values
duplicated = crime_dataset.duplicated().sum()
print("Duplicated values:", duplicated)

Duplicated values: 0


# Exploratory Data Analysis

In [13]:
# Convert the year column to Integer
crime_dataset['year'] = crime_dataset['year'].astype('Int64')

# Checking the number of years in the dataset
crime_dataset_years_no = crime_dataset['year'].nunique()
print(f"Chicago crime dataset for {crime_dataset_years_no} years\n")

# Checking the years in the dataset
crime_dataset_years = crime_dataset['year'].unique()
print(f"The years in the dataset are\n{crime_dataset_years}")

Chicago crime dataset for 23 years

The years in the dataset are
<IntegerArray>
[2015, 2018, 2016, 2014, 2001, 2020, 2019, 2021, 2012, 2017, 2013, 2011, 2007, 2003, 2010, 2008, 2009, 2002, 2005, 2006, 2004, 2023, 2022]
Length: 23, dtype: Int64


In [14]:
# Checking the size of the dataset
crime_dataset.shape

(7784664, 22)

In [15]:
# Checking the data type
crime_dataset.info

<bound method DataFrame.info of                id case_number                    date                  block  iucr        primary_type                    description             location_description  arrest  domestic  beat  district  ward  community_area fbi_code  x_coordinate  y_coordinate  year              updated_on   latitude  longitude                       location
0        10224738    HY411648  09/05/2015 01:30:00 PM        043XX S WOOD ST  0486             BATTERY        DOMESTIC BATTERY SIMPLE                        RESIDENCE   False      True   924         9    12              61      08B     1165074.0     1875917.0  2015  02/10/2018 03:50:01 PM  41.815117 -87.669998  (41.815117282, -87.669999562)
1        10224739    HY411615  09/04/2015 11:30:00 AM    008XX N CENTRAL AVE  0870               THEFT                 POCKET-PICKING                          CTA BUS   False     False  1511        15    29              25       06     1138875.0     1904869.0  2015  02/10/2018 03:5

# Descriptive Data Analysis

In [16]:
# Checking the Statistical summary of the data
# fetching numeric columns only

crime_data_numeric = crime_dataset.select_dtypes(include=['number']).columns
crime_data_numeric_summary = crime_dataset[crime_data_numeric].describe()
print("\nSummary Statistics:\n", crime_data_numeric_summary)



Summary Statistics:
                    id         beat   district       ward  community_area  x_coordinate  y_coordinate         year      latitude     longitude
count       7784664.0    7784664.0  7784617.0  7169816.0       7171188.0  7.697816e+06  7.697816e+06    7784664.0  7.697816e+06  7.697816e+06
mean   7026439.050899  1185.918271  11.294659  22.754197       37.484723  1.164601e+06  1.885783e+06  2009.944267  4.184219e+01 -8.767149e+01
std    3502656.769728    703.13349   6.952422  13.851058       21.541153  1.684658e+04  3.227531e+04     6.260628  8.879600e-02  6.108257e-02
min             634.0        111.0        1.0        1.0             0.0  0.000000e+00  0.000000e+00       2001.0  3.661945e+01 -9.168657e+01
25%         3792573.5        621.0        6.0       10.0            23.0  1.152976e+06  1.859073e+06       2005.0  4.176871e+01 -8.771367e+01
50%         7029327.0       1034.0       10.0       23.0            32.0  1.166110e+06  1.890730e+06       2009.0  4.185591e+0

In [17]:
# Extracting the year, month and day from the dataset
crime_dataset['date'] = pd.to_datetime(crime_dataset['date'])
crime_dataset['year'] = crime_dataset['date'].dt.year
crime_dataset['month'] = crime_dataset['date'].dt.month_name()
crime_dataset['day'] = crime_dataset['date'].dt.day_name()

crime_dataset.year

0          2015
1          2015
2          2018
3          2015
4          2015
           ... 
7784659    2022
7784660    2022
7784661    2022
7784662    2022
7784663    2022
Name: year, Length: 7784664, dtype: int32

In [18]:
# Setting the index using date
# crime_index =crime_dataset.set_index('date', inplace=True)
# crime_index

# Certain Crimes in the last ten Years

In [19]:
# Subsetting the crime for the last ten years
crime_dataset_ten_yrs = crime_dataset[crime_dataset['year'] >= 2013]
crime_dataset_ten_yrs.tail(1000)

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,month,day
7783662,12744927,JF297821,2022-06-28 19:07:00,012XX W GRENSHAW ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,1231,12,28,28,08B,1168451.0,1895145.0,2022,01/03/2023 03:46:28 PM,41.867809,-87.657059,"(41.8678085, -87.657056856)",June,Tuesday
7783663,12741708,JF293976,2022-06-25 19:40:00,028XX W 19TH ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,SIDEWALK,False,True,1022,10,12,30,08B,1157372.0,1890584.0,2022,01/03/2023 03:46:28 PM,41.855526,-87.697853,"(41.855524939, -87.6978538)",June,Saturday
7783664,12741082,JF293272,2022-06-24 17:30:00,088XX S CONSTANCE AVE,0820,THEFT,$500 AND UNDER,RESIDENCE,False,False,412,4,8,48,06,1190053.0,1846820.0,2022,01/03/2023 03:46:28 PM,41.734707,-87.579308,"(41.734707185, -87.579309729)",June,Friday
7783665,12743517,JF295891,2022-06-27 11:30:00,008XX W 115TH ST,0810,THEFT,OVER $500,PARKING LOT / GARAGE (NON RESIDENTIAL),False,False,524,5,34,53,06,1172764.0,1828586.0,2022,01/03/2023 03:46:28 PM,41.685070,-87.643181,"(41.685068588, -87.643184156)",June,Monday
7783666,12751422,JF304919,2022-07-04 18:30:00,005XX E 51ST ST,0810,THEFT,OVER $500,VACANT LOT / LAND,False,False,233,2,4,40,06,1180910.0,1871322.0,2022,01/03/2023 03:46:28 PM,41.802158,-87.612053,"(41.80215809, -87.612052585)",July,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7784659,12847575,JF420478,2022-09-01 05:00:00,005XX W SURF ST,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,1934,19,44,6,26,1172497.0,1919410.0,2022,01/03/2023 03:46:28 PM,41.934303,-87.641487,"(41.934304581, -87.641484982)",September,Thursday
7784660,12847801,JF420319,2022-07-08 00:00:00,114XX S PRAIRIE AVE,1130,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,STREET,False,False,531,5,9,49,11,1179966.0,1828818.0,2022,01/03/2023 03:46:28 PM,41.685543,-87.616814,"(41.685543881, -87.616812541)",July,Friday
7784661,12847324,JF420102,2022-09-27 11:00:00,023XX E 70TH ST,0810,THEFT,OVER $500,RESIDENCE,False,False,331,3,5,43,06,1193181.0,1859005.0,2022,01/03/2023 03:46:28 PM,41.768066,-87.567451,"(41.768068052, -87.567452932)",September,Tuesday
7784662,12847570,JF420427,2022-09-03 10:25:00,052XX W CARMEN AVE,2021,NARCOTICS,POSSESS - BARBITURATES,RESIDENCE - YARD (FRONT / BACK),True,False,1623,16,45,11,18,1140553.0,1933418.0,2022,01/03/2023 03:46:28 PM,41.973392,-87.758537,"(41.973391184, -87.758534512)",September,Saturday


In [20]:
# Checking the number of crimes rate in the last ten years
crime_ten_yrs = crime_dataset_ten_yrs['primary_type'].nunique()
print(f"There were {crime_ten_yrs} crimes recorded in the last ten years\n")

# Checking the type of crimes rate in the last ten years
type_crime_ten_yrs = crime_dataset_ten_yrs['primary_type'].unique()
print(f"The types of crimes in the last ten years:\n {type_crime_ten_yrs}")

There were 35 crimes recorded in the last ten years

The types of crimes in the last ten years:
 ['BATTERY', 'THEFT', 'NARCOTICS', 'ASSAULT', 'BURGLARY', ..., 'OTHER NARCOTIC VIOLATION', 'NON-CRIMINAL', 'HOMICIDE', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'RITUALISM']
Length: 35
Categories (36, object): ['ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY', ..., 'SEX OFFENSE', 'STALKING', 'THEFT', 'WEAPONS VIOLATION']


In [21]:
# # Checking the crimes in the last ten years
ten_yrs_crimes = crime_dataset_ten_yrs.index.value_counts()
ten_yrs_crimes

0          1
1          1
2          1
3          1
4          1
          ..
7784659    1
7784660    1
7784661    1
7784662    1
7784663    1
Name: count, Length: 2648847, dtype: int64

`Data Visualization for all the Crimes recorded in the last ten years`

In [None]:
# Plotting a barchart for the crimes committed in the last ten years
plt.figure(figsize=(20,10))
bars = plt.bar(ten_yrs_crimes.index, ten_yrs_crimes.values, color='red')
plt.xlabel('Crimes', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title("Crimes committed in the last ten Years(2013-2023)", weight='bold')
plt.xticks(rotation=90)
plt.bar_label(bars, fmt='%.0f', padding=3, fontsize=6)
plt.show()

In [None]:
# Top ten crimes in the last ten years
top_ten_crimes = crime_dataset_ten_yrs['primary_type'].value_counts().sort_values(ascending=False).head(10)
print(f"Top ten crimes in the last ten years:\n{top_ten_crimes}")

In [None]:
# Plotting a barchart for the crimes committed in the last ten years
plt.figure(figsize=(10,6))
bars = plt.bar(top_ten_crimes.index, top_ten_crimes.values, color='purple')
plt.xlabel('Crimes', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title("Top ten crimes in the last ten Years(2013-2023)", weight='bold')
plt.xticks(rotation=90)
plt.bar_label(bars, fmt='%.0f', padding=3, fontsize=6)
plt.show()

# Theft in the last ten years

In [None]:
# Selecting the theft in the last ten years
theft = top_ten_crimes.loc['THEFT']
theft


In [None]:
# Grouping theft by year for the last ten years
theft_ten_yrs = theft.groupby('year').size()
print(f"Theft recorded from 2013-2023:\n {theft_ten_yrs}")

In [None]:
# Plotting the theft recorded over the last ten years using barchart in Pandas
theft_ten_yrs.plot(kind='bar', xlabel='Year', ylabel='Frequency', title="Theft recorded in the last ten Years(2013-2023)", color='black')

In [None]:
# Grouping the Theft by location over the last ten years
theft_by_location = theft.groupby('block')['year'].size().sort_values(ascending=True)
theft_by_location

In [None]:
# Plotting the theft recorded over the last ten years using barchart in Pandas
# theft_by_location.plot(kind='bar', xlabel='Year', ylabel='Frequency', title="Theft recorded in the last ten Years(2013-2023)", color='black')
# Plotting a barchart for the crimes committed in the last ten years
plt.figure(figsize=(10,6))
plt.bar(theft_by_location.index, theft_by_location.values, color='red')
plt.xlabel('Location')
plt.ylabel('Year')
plt.title("Theft recorded by location in the last ten Years(2013-2023)")
plt.xticks(rotation=90)
plt.show()