In [None]:
import requests
import pandas as pd

# Endpoint queried for the crime records (JSON resource).
API_URL = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without this a stalled connection would block the kernel
# indefinitely.
REQUEST_TIMEOUT = 30

# Query-string parameters sent with the request.
params = {
    "$limit": 5000,  # Number of records to fetch
    "$select": "date,primary_type,latitude,longitude,location_description,arrest,year",
    "$order": "date DESC",  # most recent records first
}

response = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT)
# Fail loudly on a 4xx/5xx status instead of trying to parse an error body.
response.raise_for_status()

# Decoded JSON payload; later cells build their DataFrame from this.
raw_data = response.json()

# Save raw JSON
# NOTE: the payload is written via a DataFrame, so the file holds the tabular
# form of the records (one object per row) rather than the response bytes.
pd.DataFrame(raw_data).to_json(
    "../data/raw/chicago_crime_raw.json",
    orient="records",
    indent=2
)

print(f"Fetched {len(raw_data)} records")

Fetched 5000 records


In [None]:

# Start from the fetched records every time, so re-running this cell repeats
# the cleaning from scratch instead of stacking on a previous run.
df = pd.DataFrame(raw_data)

# Convert date column
df["date"] = pd.to_datetime(df["date"])

# Convert lat/long to numeric; values that cannot be parsed become NaN
df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

# Drop rows without location (this also removes rows whose coordinates were
# coerced to NaN above). The explicit copy gives an independent frame, so the
# column assignment below cannot trigger SettingWithCopyWarning on pandas
# versions that track row-filtered results as potential copies.
df = df.dropna(subset=["latitude", "longitude"]).copy()

# Normalize crime type: lower-case and trim surrounding whitespace
df["primary_type"] = df["primary_type"].str.lower().str.strip()

# info() writes its report to stdout itself and returns None; wrapping it in
# print() would add a stray "None" line after the summary.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4984 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  4984 non-null   datetime64[ns]
 1   primary_type          4984 non-null   object        
 2   latitude              4984 non-null   float64       
 3   longitude             4984 non-null   float64       
 4   location_description  4966 non-null   object        
 5   arrest                4984 non-null   bool          
 6   year                  4984 non-null   object        
dtypes: bool(1), datetime64[ns](1), float64(2), object(3)
memory usage: 277.4+ KB
None


In [3]:
# Preview the first rows of the cleaned frame.
preview_rows = df.head()
print(preview_rows)

# Ten most frequent crime types, with their record counts.
print("\nCrime type distribution:")
top_crime_types = df["primary_type"].value_counts().head(10)
print(top_crime_types)

# Earliest and latest timestamps present in the data.
print("\nDate range:")
earliest, latest = df["date"].min(), df["date"].max()
print(earliest, "→", latest)


        date         primary_type   latitude  longitude location_description  \
0 2026-01-25              assault  41.731145 -87.574354            RESIDENCE   
1 2026-01-25   deceptive practice  41.873808 -87.694456      OTHER (SPECIFY)   
2 2026-01-25              battery  41.751810 -87.724634            RESIDENCE   
3 2026-01-25  motor vehicle theft  41.980779 -87.805438               STREET   
4 2026-01-25              battery  41.779998 -87.629295          GAS STATION   

   arrest  year  
0   False  2026  
1   False  2026  
2   False  2026  
3   False  2026  
4   False  2026  

Crime type distribution:
primary_type
theft                  1149
battery                 908
criminal damage         495
motor vehicle theft     441
assault                 432
other offense           376
deceptive practice      243
burglary                238
narcotics               163
criminal trespass       139
Name: count, dtype: int64

Date range:
2026-01-14 21:00:00 → 2026-01-25 00:00:00


In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
clean_csv_path = "../data/processed/chicago_crime_clean.csv"
df.to_csv(clean_csv_path, index=False)

print("Cleaned data saved successfully")


Cleaned data saved successfully
