# Statistical Analysis for Chicago crime dataset



# Data ingestion

In [1]:
# %pip install gdown dask pyarrow
# import dask.dataframe as dd

In [2]:
# # import lib for loading the dataset 
# import gdown
# import zipfile

# # Importing the dataset from google drive
# raw_link = "https://drive.google.com/file/d/1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X/view?usp=sharing"
# id = "1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X"
# file_path = "crime.zip"

# # Loading the dataset
# gdown.download(f"https://drive.google.com/uc?id={id}",file_path, quiet=False)

# Uncomment the code above to download the dataset archive from Google Drive.


In [3]:
# Data wrangling libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# lib for datetime
from datetime import datetime, timedelta

# Silence every warning for the rest of the session
import warnings
warnings.simplefilter("ignore")

# Display settings: show all columns and allow wide console output
pd.options.display.max_columns = None
pd.options.display.width = 1000

In [4]:
# Extracting and listing the files in the zipped dataset
# with zipfile.ZipFile(file_path, "r") as z:
#     # List files
#     print(z.namelist()) 
#     z.extractall("crime_dataset")


# Commenting this out because I have read/loaded the dataset to my workspace.
    

In [None]:
# Explicit column dtypes to reduce memory usage while loading the CSV.
# (pandas is already imported as `pd` in the imports cell, so it is not
# re-imported here.)
dtype_dict = {
    'ID' : 'Int32',
    'Case Number': 'string',
    'Block': 'string',
    'IUCR': 'category',
    'Primary Type': 'category',
    'Description': 'category',
    'Location Description': 'category',
    'Arrest': 'boolean',
    'Domestic': 'boolean',
    'Beat': 'Int64',
    'District': 'Int64',
    'Ward': 'Int64',
    'Community Area': 'Int64',
    'FBI Code': 'category',
    'X Coordinate': 'float32',
    'Y Coordinate': 'float32',
    'Year': 'float64',
    'Latitude': 'float32',
    'Longitude': 'float32',
    'Location': 'string'
}

# Load the dataset with pandas, parsing both timestamp columns while reading.
# The year directive is %Y (four digits); the previous %y only matches a
# two-digit year and therefore cannot parse timestamps such as
# "09/05/2015 01:30:00 PM".
# NOTE(review): assumes the file uses the MM/DD/YYYY hh:mm:ss AM/PM layout -
# confirm against a few raw rows of the CSV.
crime_data = pd.read_csv(
    'crime_dataset/Crimes_-_2001_to_Present.csv',
    dtype=dtype_dict,
    parse_dates=['Date', 'Updated On'],
    date_format="%m/%d/%Y %I:%M:%S %p",
    low_memory=False,
    keep_default_na=True,
)

# Show the first five rows
crime_data.head()




In [None]:
# Work on an independent (deep) copy so the loaded frame stays untouched
crime_dataset = crime_data.copy(deep=True)
crime_dataset.head(5)


# Preliminary data analysis

In [None]:
# Make sure the Date column holds datetime values, then show its dtype
crime_dataset['Date'] = crime_dataset['Date'].pipe(pd.to_datetime)
crime_dataset['Date'].dtype

In [None]:
# Inspect the dtype of every column in the working copy.
crime_data_type = crime_dataset.dtypes
# The label now ends with the colon and the listing starts on its own line;
# previously the colon was printed after the newline, in front of the first row.
print(f"Data types:\n{crime_data_type}")

In [None]:
# Read a five-row sample of the CSV and list its column labels
crime_data_cols = pd.read_csv(
    'crime_dataset/Crimes_-_2001_to_Present.csv',
    nrows=5,
)
print(crime_data_cols.columns)


In [None]:
# Normalize the column names: strip surrounding whitespace, convert to lower
# case and replace " " with "_".
# The rename is applied to the working copy (crime_dataset) rather than to the
# raw crime_data frame, so changes already made to crime_dataset (such as the
# Date conversion) are kept instead of being overwritten.
crime_dataset = crime_dataset.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))
first_five_rows = crime_dataset.head()
print("First five rows in the dataset:", first_five_rows)

In [None]:
# Count the missing entries in each column
crime_dataset.isnull().sum()

In [None]:
# Count the rows that exactly repeat an earlier row
duplicated = crime_dataset.duplicated(keep='first').sum()
print("Duplicated values:", duplicated)

# Exploratory Data Analysis

In [None]:
# Cast the year column to the nullable 64-bit integer dtype
crime_dataset['year'] = crime_dataset['year'].astype(pd.Int64Dtype())

# Gather both figures first: the count of distinct years and the years themselves
crime_dataset_years_no = crime_dataset['year'].nunique()
crime_dataset_years = crime_dataset['year'].unique()

# Report them in the same order as before
print(f"Chicago crime dataset for {crime_dataset_years_no} years\n")
print(f"The years in the dataset are\n{crime_dataset_years}")

# Descriptive Data Analysis

In [None]:
# Statistical summary of the data, restricted to the numeric columns

# Labels of the numeric columns
crime_data_numeric = crime_dataset.select_dtypes(include='number').columns
# Descriptive statistics for exactly those columns
crime_data_numeric_summary = crime_dataset.loc[:, crime_data_numeric].describe()