## **Chicago Crime Data Analysis** ##

In [None]:
# Install required packages
# %pip install gdown dask pyarrow

## **Dataset Setup and Loading**

In [None]:
# Import the necessary libraries
import gdown
import zipfile


#libraries for data wrangling
import pandas as pd
import numpy as np

#libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns


# import dask.dataframe as dd

In [None]:
# Notebook display settings: show every column of wide frames and widen the
# text repr so rows are not wrapped.
pd.options.display.max_columns = None
pd.options.display.width = 1000

In [None]:
# Download dataset from Google Drive
# rawlink= "https://drive.google.com/file/d/1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X/view?usp=sharing"
# file_id = "1ib1PWK_3oaaSfThqfnfSoPZq7vA1g33X"
# output = "crime.zip"
# gdown.download(f"https://drive.google.com/uc?id={file_id}", output, quiet=False)

In [None]:
# with zipfile.ZipFile(output, "r") as z:
#     # List files
#     print(z.namelist())
#     z.extractall("crime_data")

In [None]:
# Explicit per-column dtypes for the Crimes CSV, chosen to reduce memory usage.
# The capitalised 'Int8'/'Int16'/'Int32' and 'boolean' names are pandas' nullable
# extension dtypes, so columns with missing entries can still be loaded.
# NOTE(review): the narrow integer widths assume every value in the file fits
# (e.g. Int8 holds at most 127) — confirm against the data.
dtypes_dict = {
    'ID': 'Int32',
    'Case Number': 'string',
    'Block': 'string',
    'IUCR': 'string',
    # Repeated text labels are stored as categoricals.
    'Primary Type': 'category',
    'Description': 'category',
    'Location Description': 'category',
    'Arrest': 'boolean',
    'Domestic': 'boolean',
    'Beat': 'Int16',
    'District': 'Int8',
    'Ward': 'Int8',
    'Community Area': 'Int8',
    'FBI Code': 'category',
    'X Coordinate': 'float32',
    'Y Coordinate': 'float32',
    # Loaded as float here; cast to integer later in the notebook, after rows
    # with missing values have been dropped.
    'Year': 'float64',
    'Latitude': 'float32',
    'Longitude': 'float32',
    'Location': 'string'
}

# Read the extracted CSV. 'Date' and 'Updated On' are parsed as datetimes using
# the explicit 12-hour format given in `date_format`.
crime_data = pd.read_csv(
    "crime_data/Crimes_-_2001_to_Present.csv",
    dtype=dtypes_dict,
    parse_dates=['Date', 'Updated On'],
    date_format="%m/%d/%Y %I:%M:%S %p",
    keep_default_na=True,
    low_memory=False
)



# Preview the first 5 rows of the dataset
crime_data.head()


## **Preliminary Data Analysis**

In [None]:
# copying the data into a new variable for wrangling
crime_data2 = crime_data.copy()
crime_data2.head(1)

In [None]:
crime_data2.info()

In [None]:
# Checking the  data types
crime_data2.dtypes

In [None]:
# Check column names
crime_data2.columns

In [None]:
# Preview first 5 rows
crime_data2.head()


In [None]:
# Rename all columns to lowercase and replace spaces with underscores
crime_data2= crime_data2.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))

In [None]:
# Preview updated column names
crime_data2.columns

In [None]:
# Converting the date to datetime
crime_data2['date'] = pd.to_datetime(crime_data2['date'])

In [None]:
crime_data_type= crime_data2['date'].dtypes
print("Data type\n:", crime_data_type)

In [None]:
crime_data2['updated_on'] = pd.to_datetime(crime_data2['updated_on'])

In [None]:
updated_date_data_type= crime_data2['updated_on'].dtypes
print("Updated Date Data type\n:", updated_date_data_type)

In [None]:
crime_data2.head()

**Analyzing the missing data**

In [None]:
# Defining a function to analyze the missing data
def missing_data(crime_data2):
    """Print and plot the share of missing values per column.

    Builds a per-column table (column name, missing count, missing percentage,
    dtype), keeps only the columns that have at least one missing value, sorts
    them by missing percentage (highest first), prints the table and draws a
    bar chart of the percentages.

    Parameters
    ----------
    crime_data2 : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The table is printed and the chart shown; nothing is returned.
    """
    # Count nulls once and reuse the result for both the count and the percentage.
    null_counts = crime_data2.isnull().sum()

    summary = pd.DataFrame({
            'Columns': crime_data2.columns,
            'Missing_count': null_counts,
            'Missing_percentage': (null_counts / len(crime_data2) * 100),
            'Data_type': crime_data2.dtypes
    })

    # Keep only columns that actually have gaps, worst first.
    summary = summary[summary['Missing_count'] > 0].sort_values('Missing_percentage', ascending=False)

    print('_____________________\n')
    print('Missing DATA ANALYSIS')
    print('_____________________')

    print(summary)

    # Nothing is missing: an empty bar chart would carry no information.
    if summary.empty:
        return

    # Visualizing missing data
    plt.figure(figsize=(20,10))
    plt.bar(summary['Columns'], summary['Missing_percentage'])
    plt.title('Missing data by column')
    plt.xlabel('Columns')
    plt.ylabel('Missing Percentage (%)')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

missing_data(crime_data2)

In [None]:
# checking for missing values
crime_data2.isnull().sum()

In [None]:
# Dropping the missing values
crime_data2.dropna(inplace=True)


In [None]:
# Confirm that no missing values remain in the working copy after the drop.
# (The untouched raw frame `crime_data` still contains its original gaps.)
crime_data2.isna().sum()

In [None]:
# checking for duplicate values
crime_data2.duplicated().sum()

In [None]:

# Check whether the DataFrame itself is empty (has no rows or no columns).
# Note: `.empty` does not look for empty/blank cell values.
crime_data2.empty

## **Exploratory Data Analysis**

In [None]:
# Convert the year column (loaded as float) to integer; one cast is enough.
crime_data2["year"] = crime_data2["year"].astype("int")

In [None]:
# checking the years in the dataset
crime_data_years= crime_data2["year"].unique()
print(f"The years in the dataset are\n{crime_data_years}")

In [None]:
# checking the number of unique years in the data
crime_data_years_no= crime_data2["year"].nunique()
print(f"Chicago crime dataset for {crime_data_years_no} years\n")

In [None]:
# Checking the size of the dataset
crime_data2.shape

In [None]:
# Call the method: without parentheses the cell only shows the bound-method repr.
crime_data2.info()

## Descriptive Analysis (Numerical variables)
- Using the .describe() function.

In [None]:
# Select numeric columns only, from the cleaned working copy that the rest of
# the exploratory analysis uses (not the raw `crime_data` frame).
numeric_cols = crime_data2.select_dtypes(include=["number"]).columns

# Statistical summary of the numeric columns; cast to float64 first so the
# nullable/narrow integer and float32 columns are summarised uniformly.
numeric_summary = crime_data2[numeric_cols].astype("float64").describe()

# Print the label on its own line so the table header stays aligned.
print("Numeric Summary")
print(numeric_summary)



In [None]:
# Extract year, month, day, hour, weekday, quarter and a weekend flag from the
# incident timestamp.
crime_data2["year"]= crime_data2['date'].dt.year
# NOTE(review): "month" holds the month *name*, identical to "month_name" below.
crime_data2["month"] = crime_data2["date"].dt.month_name()
crime_data2['day'] = crime_data2['date'].dt.day
crime_data2['hour'] = crime_data2['date'].dt.hour
crime_data2['day_of_week'] = crime_data2['date'].dt.dayofweek  # 0=Monday, 6=Sunday
crime_data2['day_name'] = crime_data2['date'].dt.day_name()
crime_data2['month_name'] = crime_data2['date'].dt.month_name()
crime_data2['quarter'] = crime_data2['date'].dt.quarter
crime_data2['weekend'] = crime_data2['day_of_week'].isin([5, 6])#Saturday and Sunday

# Time periods. Bins are left-closed (right=False) so each boundary hour starts
# its own period: Night 0-5, Morning 6-11, Afternoon 12-17, Evening 18-23.
# With right-closed bins, hours 6, 12 and 18 would fall into the previous period.
crime_data2['time_period'] = pd.cut(crime_data2['hour'],
                              bins=[0, 6, 12, 18, 24],
                              labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                              right=False)

# Confirm the new date features were added and preview them
print("Data Extracted Successfully,New Columns added\n")
crime_data2.head()

In [None]:
# checking the columns in the dataset for the new columns added
crime_data2.columns

# Certain Crimes in the last ten years

In [None]:
# Keep only records from 2013 through 2023 (both ends inclusive), so the data
# matches the "2013–2023" labels used in every table and chart below.
crime_ten_yrs = crime_data2[crime_data2["year"].between(2013, 2023)]
# Preview the last few rows instead of dumping 1000 rows into the output.
crime_ten_yrs.tail()

In [None]:
# Setting 'primary_type' as the index
# crime_ten_yrs = crime_ten_yrs.set_index("primary_type")

In [None]:

# Count how many unique crime types exist in the last 10years
unique_crime_ten_yrs = crime_ten_yrs["primary_type"].nunique()
print("\nThe Number of unique crime types recorded in (2013–2023):", unique_crime_ten_yrs)


In [None]:
# Get the unique crime types within the last 10 years
unique_crime_types_ten_yrs = crime_ten_yrs["primary_type"].unique()
print("The Unique crime types recorded from 2013–2023:\n", unique_crime_types_ten_yrs)

In [None]:

# Getting the number of counts of each crime type in the last 10 years
crime_type_counts_10 = crime_ten_yrs["primary_type"].value_counts()
print("\nCrime type counts (2013–2023):\n", crime_type_counts_10)

# Data Visualization for all the crimes recorded in the last ten years

In [None]:
# Bar chart of the number of records per crime type over the period.
# Plot the per-type counts (`crime_type_counts_10`); the filtered DataFrame
# itself is row-level data, not one height per bar.
plt.figure(figsize=(20,10))
bars = plt.bar(crime_type_counts_10.index.astype(str), crime_type_counts_10.values, color='red')
plt.xlabel('Crimes', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title("Crimes committed in the last ten Years(2013-2023)", weight='bold')
plt.xticks(rotation=90)
plt.bar_label(bars, fmt='%.0f', padding=3, fontsize=6)
plt.show()

In [None]:
# Count the records of every crime type (most frequent first).
# The frame still has its default row index, so the counts must come from the
# 'primary_type' column rather than from the index.
crime_type_counts = crime_ten_yrs["primary_type"].value_counts()
print("\nCrime type counts (2013–2023):\n", crime_type_counts)

In [None]:
# Create vertical bar chart
plt.figure(figsize=(20,10))
bars = plt.bar(crime_type_counts.index, crime_type_counts.values, color="#1f77b4")

# Add chart title and labels
plt.title("All Crime Types Recorded (2013–2023)", fontsize=16, weight="bold")
plt.xlabel("Crime Type", fontsize=12)
plt.ylabel("Number of Records", fontsize=12)

# Rotate x labels for readability
plt.xticks(rotation=75, ha="right")

# Add labels on top of bars
plt.bar_label(bars, fmt="%.0f", padding=3)

# Show chart
plt.tight_layout()
plt.show()# 

# Top Ten Crimes in the Last Ten Years (2013-2023)

In [None]:
# Sort crime counts and take Top 10
top_crimes = crime_ten_yrs['primary_type'].value_counts().sort_values(ascending=False).head(10)
print("Top ten crimes in the last ten years\n",top_crimes)

In [None]:
# Create vertical bar chart
plt.figure(figsize=(10,6))
bars = plt.bar(top_crimes.index, top_crimes.values, color="#47b41f")

# Add chart title and labels
plt.title("Top 10 Crime Types Recorded (2013–2023)", fontsize=16, weight="bold")
plt.xlabel("Crime Type", fontsize=12)
plt.ylabel("Number of Records", fontsize=12)

# Rotate x labels
plt.xticks(rotation=45, ha="right")

# Add value labels on top of bars
plt.bar_label(bars, fmt="%.0f", padding=3)

# displaying chart
plt.show()

**Theft (2013-2023)**

In [None]:
# selecting the theft in the last ten years
theft = top_crimes.loc["THEFT"]

In [None]:
# Number of THEFT records per year.
# Count on the 'year' column only: calling value_counts() on the whole frame
# counts distinct full rows, not thefts. The index is named "year" so a later
# groupby("year") on this Series keeps working.
theft_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs["primary_type"] == "THEFT", "year"]
    .value_counts()
    .rename_axis("year")
)
print("\nTheft recorded from (2013–2023):\n", theft_ten_yrs)

In [None]:
# grouping the theft by year
theft_grouped_by_10yr = theft_ten_yrs.groupby("year").sum().sort_values(ascending= True)
print("Theft by Year:\n",theft_grouped_by_10yr)

In [None]:
# Count THEFT records per year directly from the filtered frame.
# (`theft` is only the scalar total taken from `top_crimes`, so it cannot be grouped.)
theft_by_year = (
    crime_ten_yrs.loc[crime_ten_yrs["primary_type"] == "THEFT"]
    .groupby("year")
    .size()
)

# Plotting the line chart
plt.figure(figsize=(10,6))
sns.lineplot(x=theft_by_year.index, y=theft_by_year.values,
             marker="o", color="crimson", linewidth=2)

# Adding the chart details
plt.title("Theft Cases Recorded (2013–2023)", fontsize=16, weight="bold")
plt.xlabel("Year", fontsize=12)
plt.ylabel("Number of Thefts", fontsize=12)

# Add grid
plt.grid(True, linestyle="--", alpha=0.6)

plt.show()

**BATTERY (2013-2023)**

In [None]:
# Total BATTERY records, taken from the top-ten table
battery = top_crimes.loc['BATTERY']
print(f"Total BATTERY crime recorded: {battery}\n")#

# BATTERY records per year. Counting on the 'year' column avoids counting
# distinct full rows, which is what value_counts() on the whole frame does.
battery_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs['primary_type']=='BATTERY', 'year']
    .value_counts(ascending=True)
    .rename_axis('year')
)
print(f"Battery recorded from 2013-2023\n {battery_ten_yrs}\n")

# Yearly totals ordered from the lowest to the highest count
battery_grouped_by_10years = battery_ten_yrs.sort_values(ascending=True)
print("Battery by Year:\n",battery_grouped_by_10years)

**CRIMINAL DAMAGE (2013-2023)**

In [None]:
# Total CRIMINAL DAMAGE records, taken from the top-ten table
criminal_damage = top_crimes.loc['CRIMINAL DAMAGE']
print(f"Total Criminal damage recorded: {criminal_damage}\n")

# CRIMINAL DAMAGE records per year. Counting on the 'year' column avoids
# counting distinct full rows, which is what value_counts() on the whole frame does.
criminal_damage_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs['primary_type']=='CRIMINAL DAMAGE', 'year']
    .value_counts(ascending=True)
    .rename_axis('year')
)
print(f"Criminal damage recorded from 2013-2023\n {criminal_damage_ten_yrs}\n")

# Yearly totals ordered from the lowest to the highest count
criminal_damage_grouped_by_10years = criminal_damage_ten_yrs.sort_values(ascending=True)
print("Criminal damage by Year:\n",criminal_damage_grouped_by_10years)

**ASSAULT (2013-2023)**

In [None]:
# Total ASSAULT records, taken from the top-ten table
assault =top_crimes.loc['ASSAULT']
print(f"Total Assault recorded: {assault}\n")

# ASSAULT records per year. Counting on the 'year' column avoids counting
# distinct full rows, which is what value_counts() on the whole frame does.
# The index is named 'year' so a later groupby('year') on this Series still works.
assault_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs['primary_type']=='ASSAULT', 'year']
    .value_counts(ascending=True)
    .rename_axis('year')
)
print(f"Assault recorded from 2013-2023\n {assault_ten_yrs}\n")

# Yearly totals ordered from the lowest to the highest count
assault_grouped_by_10years = assault_ten_yrs.sort_values(ascending=True)
print("Assault by Year:\n",assault_grouped_by_10years)

**OTHER OFFENSE (2013-2023)**

In [None]:
# Total OTHER OFFENSE records, taken from the top-ten table
# (this cell previously looked up and filtered on 'ASSAULT').
other_offence = top_crimes.loc['OTHER OFFENSE']
print(f"Total other offences recorded: {other_offence}\n")

# OTHER OFFENSE records per year, counted on the 'year' column
other_off_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs['primary_type']=='OTHER OFFENSE', 'year']
    .value_counts(ascending=True)
    .rename_axis('year')
)
print(f"Other offence  recorded from 2013-2023\n {other_off_ten_yrs}\n")

# Yearly totals ordered from the lowest to the highest count, built from this
# cell's own series rather than from the assault series.
other_off_grouped_by_10years = other_off_ten_yrs.sort_values(ascending=True)
print("Other offence  by Year:\n", other_off_grouped_by_10years)

**DECEPTIVE PRACTICE (2013-2023)**

In [None]:
# Total DECEPTIVE PRACTICE records, taken from the top-ten table
# (this cell previously looked up and filtered on 'ASSAULT').
decep_practice = top_crimes.loc['DECEPTIVE PRACTICE']
print(f"Total Deceptive Practice recorded: {decep_practice }\n")

# DECEPTIVE PRACTICE records per year, counted on the 'year' column
decep_practice_ten_yrs = (
    crime_ten_yrs.loc[crime_ten_yrs['primary_type']=='DECEPTIVE PRACTICE', 'year']
    .value_counts(ascending=True)
    .rename_axis('year')
)
print(f"Deceptive Practice recorded from 2013-2023\n {decep_practice_ten_yrs}\n")

# Yearly totals ordered from the lowest to the highest count, built from this
# cell's own series rather than from the assault series.
decep_practice_grouped_by_10years = decep_practice_ten_yrs.sort_values(ascending=True)
print("Deceptive Practice  by Year:\n", decep_practice_grouped_by_10years)

In [None]:
# Plotting six of the top crimes recorded over the last ten years using matplotlib
def create_crime_visualizations(crime_ten_yrs):
    """Draw a 2x3 dashboard of yearly record counts for six crime types.

    The yearly counts are computed from the frame passed in, so the function
    does not depend on summary variables created in other cells.

    Parameters
    ----------
    crime_ten_yrs : pandas.DataFrame
        Crime records with a 'primary_type' column (crime type label) and a
        'year' column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    ``plt.style.use('seaborn-v0_8')`` changes the matplotlib style globally,
    so it also affects figures drawn after this call.
    """
    import matplotlib.pyplot as plt

    # (crime type in the data, label used in titles, bar colour, pandas plot kind)
    panels = [
        ('THEFT', 'Theft', 'red', 'bar'),
        ('BATTERY', 'Battery', 'black', 'barh'),
        ('CRIMINAL DAMAGE', 'Criminal damage', 'orange', 'barh'),
        ('ASSAULT', 'Assault', 'blue', 'barh'),
        ('OTHER OFFENSE', 'Other Offence', 'green', 'barh'),
        ('DECEPTIVE PRACTICE', 'Deceptive Practice', 'maroon', 'barh'),
    ]

    plt.style.use('seaborn-v0_8')
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Chicago Crime Data Analysis Dashboard', fontsize=16, fontweight='bold')

    for ax, (crime_type, label, colour, kind) in zip(axes.flat, panels):
        # Records per year for this crime type, ordered from lowest to highest count.
        yearly_counts = (
            crime_ten_yrs.loc[crime_ten_yrs['primary_type'] == crime_type]
            .groupby('year')
            .size()
            .sort_values(ascending=True)
        )

        ax.set_title(f'{label} over ten years')

        # No records of this type in the frame: leave a labelled placeholder
        # instead of asking pandas to plot an empty series.
        if yearly_counts.empty:
            ax.text(0.5, 0.5, 'No records', ha='center', va='center',
                    transform=ax.transAxes)
            continue

        yearly_counts.plot(kind=kind, ax=ax, color=colour)

        # Horizontal bars put the counts on the x axis and the years on the
        # y axis, so the two axis labels swap relative to vertical bars.
        if kind == 'barh':
            ax.set_xlabel('Frequency')
            ax.set_ylabel(f'{label} by Year')
        else:
            ax.set_xlabel(f'{label} by Year')
            ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
create_crime_visualizations(crime_ten_yrs)