## **Chicago Crime Data Analysis** ##

In [7]:
# Install required packages
# %pip install gdown dask pyarrow

## **Dataset Setup and Loading**

In [8]:
# Import the necessary libraries
import gdown
import zipfile


#libraries for data wrangling
import pandas as pd
import numpy as np

#libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns


# import dask.dataframe as dd

In [9]:
# Pandas display settings: allow wide console output and never truncate columns
pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", None)

In [10]:
# Download dataset from Google Drive
# rawlink= "https://drive.google.com/file/d/1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X/view?usp=sharing"
# file_id = "1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X"
# output = "crime.zip"
# gdown.download(f"https://drive.google.com/uc?id={file_id}", output, quiet=False)

In [11]:
# with zipfile.ZipFile(output, "r") as z:
#     # List files
#     print(z.namelist())
#     z.extractall("crime_data")

In [None]:
# Explicit per-column dtypes for the Crimes CSV, chosen to reduce memory usage.
# The capitalised integer names ("Int8", "Int16", "Int32") and "boolean" are
# pandas' nullable extension dtypes, so missing entries can be held as <NA>.
dtypes_dict = {
    # identifiers and free text
    "ID": "Int32",
    "Case Number": "string",
    "Block": "string",
    "IUCR": "string",
    # repeated labels stored as categoricals
    "Primary Type": "category",
    "Description": "category",
    "Location Description": "category",
    # flags
    "Arrest": "boolean",
    "Domestic": "boolean",
    # administrative areas
    "Beat": "Int16",
    "District": "Int8",
    "Ward": "Int8",
    "Community Area": "Int8",
    "FBI Code": "category",
    # coordinates and year
    "X Coordinate": "float32",
    "Y Coordinate": "float32",
    "Year": "float64",
    "Latitude": "float32",
    "Longitude": "float32",
    "Location": "string",
}

# Read the extracted CSV, parsing both timestamp columns with one shared
# 12-hour format (month/day/year hour:minute:second AM/PM).
crime_data = pd.read_csv(
    "crime_data/Crimes_-_2001_to_Present.csv",
    dtype=dtypes_dict,
    parse_dates=["Date", "Updated On"],
    date_format="%m/%d/%Y %I:%M:%S %p",
    keep_default_na=True,
    low_memory=False,
)

# Preview the first 5 rows of the dataset
crime_data.head()


## **Preliminary Data Analysis**

In [None]:
# Copy the loaded data into a new variable for wrangling, so the frame
# returned by read_csv stays unmodified; show one row to confirm the copy
crime_data2 = crime_data.copy()
crime_data2.head(1)

In [None]:
# Print the DataFrame.info() summary of the working copy
crime_data2.info()

In [None]:
# Check the data type of each column
crime_data2.dtypes




In [None]:
# Check the column names as loaded (before they are renamed)
crime_data2.columns

In [None]:
# Preview the first 5 rows
crime_data2.head()


In [None]:
# Normalise every column label: trim surrounding whitespace, lowercase it,
# and swap spaces for underscores
crime_data2 = crime_data2.rename(
    columns=lambda label: label.strip().lower().replace(" ", "_")
)

In [None]:
# Preview the updated column names
crime_data2.columns

In [None]:
# Preview the first 5 rows under the new column names
crime_data2.head()

In [None]:
# Check for missing values in each column of the working copy
# (crime_data2 carries the renamed columns used by the rest of the analysis)
crime_data2.isnull().sum()


In [None]:
# Missing values per column of the working copy, via isna()
# (isna() is the pandas alias of isnull())
crime_data2.isna().sum()

In [None]:
# Count the rows flagged as duplicates by DataFrame.duplicated()
crime_data2.duplicated().sum()

In [None]:

# Check whether the DataFrame itself is empty (contains no elements);
# note this is not a per-cell check for empty or missing values
crime_data2.empty

## **Exploratory Data Analysis**

In [None]:
# Check which years the data covers.
# `Year` is read as float64, so cast it to integer once before listing the values.
crime_data2["year"] = crime_data2["year"].astype("int")
crime_data2["year"].unique()

In [None]:
# Count the distinct years present in the data
crime_data2["year"].nunique()

## Descriptive Analysis (Numerical variables)
- Using the .describe() function.

In [None]:
# Select numeric columns only and compute their summary statistics.
# NOTE(review): the float64 cast is presumably there so describe() works on
# plain floats rather than the nullable-integer / float32 columns — confirm.
numeric_cols = crime_data.select_dtypes(include=["number"]).columns
numeric_summary = crime_data[numeric_cols].astype("float64").describe()

# Leave the frame as the cell's last expression so it renders as a rich table
numeric_summary


In [None]:
# Dimensions of the working copy as (rows, columns)
crime_data2.shape

In [None]:
# Extract calendar features from the parsed `date` column
crime_data2["year"] = crime_data2["date"].dt.year
crime_data2["month"] = crime_data2["date"].dt.month_name()
crime_data2["day"] = crime_data2["date"].dt.day
crime_data2["hour"] = crime_data2["date"].dt.hour
crime_data2["day_of_week"] = crime_data2["date"].dt.dayofweek  # 0=Monday, 6=Sunday
crime_data2["day_name"] = crime_data2["date"].dt.day_name()
crime_data2["month_name"] = crime_data2["date"].dt.month_name()  # same values as `month`
crime_data2["quarter"] = crime_data2["date"].dt.quarter
crime_data2["is_weekend"] = crime_data2["day_of_week"].isin([5, 6])  # Saturday, Sunday

# Time periods by hour of day.
# The bins are left-closed (right=False) so each boundary hour starts the next
# period instead of ending the previous one:
#   Night 00-05, Morning 06-11, Afternoon 12-17, Evening 18-23
crime_data2["time_period"] = pd.cut(
    crime_data2["hour"],
    bins=[0, 6, 12, 18, 24],
    labels=["Night", "Morning", "Afternoon", "Evening"],
    right=False,
)
print("Data Extracted Successfully,New Columns added\n")
crime_data2.head()

In [None]:
# Show the `year` column derived from the incident date
crime_data2["year"]

In [None]:
# Check the columns in the dataset for the new columns added.
# `columns` is an attribute (an Index), not a method, so it must not be called.
crime_data2.columns

# Crime Types in the Last Ten Years (2013–2023)

In [None]:
# Filter the dataset to the ten-year window analysed below (2013–2023, inclusive).
# The explicit upper bound keeps any later years out of the subset, so it
# matches the "(2013–2023)" labels used in the printed output and chart titles.
crime_ten_yrs = crime_data2[crime_data2["year"].between(2013, 2023)]

# Preview only the last few rows rather than dumping a thousand of them
crime_ten_yrs.tail()

In [None]:
# Setting 'primary_type' as the index
# crime_ten_yrs = crime_ten_yrs.set_index("primary_type")

In [None]:

# Count how many distinct crime types exist in the ten-year subset
unique_count = crime_ten_yrs["primary_type"].nunique()
print("\nThe Number of unique crime types recorded in (2013–2023):", unique_count)


In [None]:
# List the distinct crime types that occur in the ten-year subset
unique_crime_types = crime_ten_yrs["primary_type"].unique()
print("The Unique crime types recorded from 2013–2023:\n", unique_crime_types)


In [None]:

# Count the records for each crime type in the ten-year subset
crime_type_counts = crime_ten_yrs["primary_type"].value_counts()
print("\nCrime type counts (2013–2023):\n", crime_type_counts)

# Data Visualization of All Crime Types Recorded (2013–2023)

In [None]:
# Count the records for each crime type (2013–2023).
# `primary_type` is a regular column here (the set_index step is disabled), so
# count that column; counting the row index would yield one entry per row.
crime_type_counts = crime_ten_yrs["primary_type"].value_counts()
print("\nCrime type counts (2013–2023):\n", crime_type_counts)

In [None]:
# Vertical bar chart with one bar per crime type
fig, ax = plt.subplots(figsize=(20, 10))
bars = ax.bar(crime_type_counts.index, crime_type_counts.values, color="#1f77b4")

# Title and axis labels
ax.set_title("All Crime Types Recorded (2013–2023)", fontsize=16, weight="bold")
ax.set_xlabel("Crime Type", fontsize=12)
ax.set_ylabel("Number of Records", fontsize=12)

# Tilt the x tick labels so long crime names stay readable
plt.setp(ax.get_xticklabels(), rotation=75, ha="right")

# Write each bar's count just above it
ax.bar_label(bars, fmt="%.0f", padding=3)

# Fit everything inside the figure, then render it
fig.tight_layout()
plt.show()

# Top Ten Crimes in the Last Ten Years (2013-2023)

In [None]:
# Rank crime types by record count and keep the ten most frequent
top_crimes = (
    crime_ten_yrs["primary_type"]
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
print("Top ten crimes in the last ten years\n", top_crimes)

In [None]:
# Ten most frequent crime types; value_counts() already sorts in descending order.
# (A Series has no `.value` attribute, so the counts are computed explicitly.)
top_crimes = crime_ten_yrs['primary_type'].value_counts().head(10)

In [None]:
# Vertical bar chart of the ten most frequent crime types
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(top_crimes.index, top_crimes.values, color="#47b41f")

# Title and axis labels
ax.set_title("Top 10 Crime Types Recorded (2013–2023)", fontsize=16, weight="bold")
ax.set_xlabel("Crime Type", fontsize=12)
ax.set_ylabel("Number of Records", fontsize=12)

# Tilt the x tick labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# Write each bar's count just above it
ax.bar_label(bars, fmt="%.0f", padding=3)

# Render the chart
plt.show()

**Theft (2013-2023)**

In [None]:
# Select the theft records in the last ten years.
# `primary_type` is a regular column, not the index, so filter with a boolean
# mask instead of a label lookup (which would raise KeyError).
theft = crime_ten_yrs.loc[crime_ten_yrs["primary_type"] == "THEFT"]

In [None]:
# Number of theft records per year in the last 10 years.
# Only the `year` column is counted: DataFrame.value_counts() on the whole frame
# would tally unique full rows and drop every row that contains a missing value.
# The result is a Series of counts whose index is explicitly named "year".
theft_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs["primary_type"] == "THEFT", "year"]
    .value_counts()
    .rename_axis("year")
)
print("\nTheft recorded from (2013–2023):\n", theft_ten_yrs)

In [None]:
# Grouping the theft counts by year, ordered from the smallest total to the largest
theft_grouped_by_10yr = (
    theft_ten_yrs
    .groupby("year")
    .sum()
    .sort_values(ascending=True)
)
theft_grouped_by_10yr

In [None]:
#Select only the THEFT crimes
# theft_crime = crime_ten_yrs.loc["THEFT"]

In [None]:
# #Grouping by year and count
# theft_by_year = theft_crime.groupby("year").size()

# # Plotting the line chart
# plt.figure(figsize=(10,6))
# sns.lineplot(x=theft_by_year.index, y=theft_by_year.values, 
#              marker="o", color="crimson", linewidth=2)

# # Adding the chart details
# plt.title("Theft Cases Recorded (2013–2023)", fontsize=16, weight="bold")
# plt.xlabel("Year", fontsize=12)
# plt.ylabel("Number of Thefts", fontsize=12)

# # Add grid 
# plt.grid(True, linestyle="--", alpha=0.6)

# plt.show()