# Statistical Analysis for Chicago crime dataset



# Data ingestion

In [None]:
# %pip install gdown dask pyarrow
# import dask.dataframe as dd

In [None]:
# Data wrangling libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# lib for datetime
from datetime import datetime, timedelta

# Silence every warning so library notices do not clutter the cell outputs
import warnings
warnings.filterwarnings(action="ignore")

# Display settings: never truncate columns and allow wide (1000-character) output
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# # # import lib for loading the dataset 
# import gdown
# import zipfile

# # Importing the dataset from google drive
# raw_link = "_https://www.kaggle.com/datasets/utkarshx27/crimes-2001-to-present?resource=download_"
# id = "1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X"
# file_path = "crime.zip"

# # Loading the dataset
# gdown.download(f"https://drive.google.com/uc?id={id}",file_path, quiet=False)

# uncomment this code to download the data.


In [None]:
# Fetch the Chicago crime dataset through kagglehub and point at its CSV file.
# Dataset page: https://www.kaggle.com/datasets/utkarshx27/crimes-2001-to-present

import kagglehub
from pathlib import Path

# Step 1: download the dataset and wrap the returned location in a Path
downloaded_to = kagglehub.dataset_download("utkarshx27/crimes-2001-to-present")
dataset_path = Path(downloaded_to)
print("Dataset Path", dataset_path)

# Step 2: the CSV is looked up directly under the dataset location
csv_file = dataset_path.joinpath("Crimes_-_2001_to_Present.csv")

In [None]:
# Resolve the CSV inside the kagglehub cache of whoever runs the notebook
# instead of hard-coding one machine's absolute path. Set the CRIME_CSV_PATH
# environment variable to load the file from a different location.
import os
from pathlib import Path

csv_file = Path(
    os.environ.get(
        "CRIME_CSV_PATH",
        Path.home().joinpath(
            ".cache",
            "kagglehub",
            "datasets",
            "utkarshx27",
            "crimes-2001-to-present",
            "versions",
            "1",
            "Crimes_-_2001_to_Present.csv",
        ),
    )
)

In [None]:
# crime_data = pd.read_csv(csv_file, low_memory=True)

In [None]:
# Extracting and listing the files in the zipped dataset
# with zipfile.ZipFile(file_path, "r") as z:
#     # List files
#     print(z.namelist()) 
#     z.extractall("crime_dataset")


# Commenting this out because I have read/loaded the dataset to my workspace.
    

In [None]:
# Explicit column dtypes to reduce memory usage: repeated labels become
# categoricals, coordinates are stored as 32-bit floats, and the nullable
# Int*/boolean dtypes allow missing values in integer and flag columns.
dtype_dict = {
    'ID' : 'Int32',
    'Case Number': 'string',
    'Block': 'string',
    'IUCR': 'category',
    'Primary Type': 'category',
    'Description': 'category',
    'Location Description': 'category',
    'Arrest': 'boolean',
    'Domestic': 'boolean',
    'Beat': 'Int64',
    'District': 'Int64',
    'Ward': 'Int64',
    'Community Area': 'Int64',
    'FBI Code': 'category',
    'X Coordinate': 'float32',
    'Y Coordinate': 'float32',
    'Year': 'float64',
    'Latitude': 'float32',
    'Longitude': 'float32',
    'Location': 'string'
}

# Load the CSV with pandas (pd is imported in the setup cell at the top).
# The format uses %Y (four-digit year); the two-digit %y used before does not
# match timestamps such as "MM/DD/YYYY hh:mm:ss AM".
# NOTE(review): four-digit years are what this dataset is expected to contain -
# confirm against the first rows of the file.
crime_data = pd.read_csv(
    csv_file,
    dtype=dtype_dict,
    parse_dates=['Date', 'Updated On'],
    date_format="%m/%d/%Y %I:%M:%S %p",
    low_memory=False,
    keep_default_na=True,
)

# Show the first five rows
crime_data.head()




In [None]:
# Work on an independent deep copy so the freshly loaded frame stays untouched
crime_dataset = crime_data.copy(deep=True)
crime_dataset.tail(5)


# Preliminary data analysis

In [None]:
# Ensure the Date column holds datetime values, then show the resulting dtype
parsed_dates = pd.to_datetime(crime_dataset['Date'])
crime_dataset['Date'] = parsed_dates
crime_dataset['Date'].dtype

In [None]:
# Inspect the dtype pandas assigned to every column.
# The label ends with ":" before the line break so the listing starts on its own line.
crime_data_type = crime_dataset.dtypes
print(f"Data types:\n{crime_data_type}")

In [None]:
# Print the raw column names by reading only the first five rows.
# Reads from csv_file (the path the dataset was loaded from) rather than the
# 'crime_dataset/' folder, which is only created by the commented-out zip
# extraction step and therefore does not exist on a fresh run.
crime_data_cols = pd.read_csv(csv_file, nrows=5)
print(crime_data_cols.columns)


In [None]:
# Normalise the column names: strip surrounding whitespace, lower-case them and
# replace spaces with underscores (e.g. "Primary Type" -> "primary_type").
# The rename is applied to crime_dataset (the working copy) so that changes
# already made to it, such as the datetime conversion of Date, are kept.
crime_dataset = crime_dataset.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))
first_five_rows = crime_dataset.head()
print("First five rows in the dataset:\n", first_five_rows)

In [None]:
# Count the missing entries in each column
crime_dataset.isnull().sum(axis=0)

In [None]:
# Count rows that are exact repeats of an earlier row
duplicate_mask = crime_dataset.duplicated(keep='first')
duplicated = duplicate_mask.sum()
print("Duplicated values:", duplicated)

# Exploratory Data Analysis

In [None]:
# Store the year as a nullable integer column
year_column = crime_dataset['year'].astype('Int64')
crime_dataset['year'] = year_column

# How many distinct years the dataset spans
crime_dataset_years_no = year_column.nunique()
print(f"Chicago crime dataset for {crime_dataset_years_no} years\n")

# Which years those are
crime_dataset_years = year_column.unique()
print(f"The years in the dataset are\n{crime_dataset_years}")

In [None]:
# Report the dataset dimensions as (rows, columns)
n_rows, n_cols = crime_dataset.shape
(n_rows, n_cols)

In [None]:
# Summarise the columns, non-null counts, dtypes and memory usage.
# info is a method: without the parentheses the cell only shows the bound
# method object instead of the summary.
crime_dataset.info()

# Descriptive Data Analysis

In [None]:
# Statistical summary restricted to the numeric columns
numeric_frame = crime_dataset.select_dtypes(include='number')

crime_data_numeric = numeric_frame.columns
crime_data_numeric_summary = numeric_frame.describe()
print("\nSummary Statistics:\n", crime_data_numeric_summary)


In [None]:
# Derive calendar features (year, month name, weekday name) from the timestamp
event_time = pd.to_datetime(crime_dataset['date'])

crime_dataset['date'] = event_time
crime_dataset['year'] = event_time.dt.year
crime_dataset['month'] = event_time.dt.month_name()
crime_dataset['day'] = event_time.dt.day_name()

crime_dataset.head()


In [None]:
# # Setting the index using date
# crime_index =crime_dataset.set_index('primary_type', inplace=True)
# crime_index

# Certain Crimes in the last ten Years

In [None]:
# Keep only the records dated 2013 or later
in_last_decade = crime_dataset['year'].ge(2013)
crime_dataset_ten_yrs = crime_dataset.loc[in_last_decade]
crime_dataset_ten_yrs.tail(1000)

In [None]:
# Number of distinct crime types (not individual crimes) seen in the last ten years.
# nunique() counts categories of primary_type, so the message says "crime types".
crime_ten_yrs = crime_dataset_ten_yrs['primary_type'].nunique()
print(f"There were {crime_ten_yrs} distinct crime types recorded in the last ten years\n")

# The crime types themselves
type_crime_ten_yrs = crime_dataset_ten_yrs['primary_type'].unique()
print(f"The types of crimes in the last ten years:\n {type_crime_ten_yrs}")

In [None]:
# Count the recorded crimes per crime type over the last ten years.
# The counts come from the primary_type column: the frame still has its
# default row index, so counting index values would report every row label once.
ten_yrs_crimes = (
    crime_dataset_ten_yrs['primary_type']
    .value_counts()
    # primary_type is categorical, so types with no records in this period
    # would otherwise appear with a count of zero
    .loc[lambda counts: counts > 0]
    .head(100)
)
ten_yrs_crimes

### Data visualization of all crimes recorded in the last ten years

In [None]:
# Bar chart of how often each crime was recorded in the last ten years
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(ten_yrs_crimes.index, ten_yrs_crimes.values, color='red')
ax.set_xlabel('Crimes')
ax.set_ylabel('Frequency')
ax.set_title("Crimes committed in the last ten Years(2013-2023)")
# Vertical tick labels keep the long crime names from overlapping
ax.tick_params(axis='x', labelrotation=90)
plt.show()

# Theft in the last ten years

In [None]:
# Select the theft records from the last ten years.
# The rows are filtered on the primary_type column: the frame keeps its default
# row index, so a label lookup such as .loc['THEFT'] raises a KeyError.
theft = crime_dataset_ten_yrs[crime_dataset_ten_yrs['primary_type'] == 'THEFT']
# Report the row count rather than printing the whole frame
print(f"Number of thefts recorded from 2013-2023: {len(theft)}")

In [None]:
# Yearly theft counts for the last ten years, ordered from fewest to most
thefts_per_year = theft.groupby('year').size()
theft_ten_yrs = thefts_per_year.sort_values(ascending=True)
print(f"Theft recorded from 2013-2023:\n {theft_ten_yrs}")

In [None]:
# Bar chart (pandas plotting accessor) of the yearly theft counts
theft_ten_yrs.plot.bar(
    xlabel='Year',
    ylabel='Frequency',
    title="Theft recorded in the last ten Years(2013-2023)",
    color='black',
)

In [None]:
# Theft counts per block over the last ten years, ordered from fewest to most
thefts_per_block = theft.groupby('block')['year'].size()
theft_by_location = thefts_per_block.sort_values(ascending=True)
theft_by_location

In [None]:
# Bar chart of the blocks with the most thefts in the last ten years.
# Only the busiest blocks are drawn: one bar per distinct block would be
# unreadable and slow to render.
TOP_N_LOCATIONS = 20
busiest_blocks = theft_by_location.sort_values(ascending=False).head(TOP_N_LOCATIONS)

plt.figure(figsize=(10,6))
# Block labels are cast to plain str so matplotlib treats them as category labels
plt.bar(busiest_blocks.index.astype(str), busiest_blocks.values, color='red')
plt.xlabel('Location')
# The bar heights are theft counts, not years
plt.ylabel('Frequency')
plt.title(f"Top {TOP_N_LOCATIONS} theft locations in the last ten Years(2013-2023)")
plt.xticks(rotation=90)
plt.show()