In [1]:
1+8

9

In [2]:
import requests
import pandas as pd
import random
from tqdm import tqdm

# ==========================
# CONFIG
# ==========================
BASE_URL = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"
OUTPUT_FILE = "PatrolIQ_Chicago_Crime_500K.csv"

TARGET_ROWS = 500_000
PAGE_LIMIT = 50_000

IMPORTANT_COLUMNS = [
    "id",
    "case_number",
    "date",
    "primary_type",
    "description",
    "location_description",
    "arrest",
    "domestic",
    "district",
    "ward",
    "community_area",
    "latitude",
    "longitude"
]

# ==========================
# FETCH DATA
# ==========================
def fetch_page(offset):
    params = {
        "$limit": PAGE_LIMIT,
        "$offset": offset
    }
    response = requests.get(BASE_URL, params=params)
    response.raise_for_status()
    return response.json()

# ==========================
# MAIN LOGIC
# ==========================
def main():
    print("üöî PatrolIQ Dataset Builder Started...\n")

    all_data = []
    offset = 0

    while len(all_data) < TARGET_ROWS:
        print(f"Fetching records from offset {offset}")
        page = fetch_page(offset)

        if not page:
            print("No more data from API.")
            break

        all_data.extend(page)
        offset += PAGE_LIMIT

        if len(page) < PAGE_LIMIT:
            break

    print(f"\nTotal records fetched: {len(all_data)}")

    # ==========================
    # DATAFRAME
    # ==========================
    df = pd.DataFrame(all_data)

    # Keep only useful columns
    df = df[IMPORTANT_COLUMNS]

    # ==========================
    # DATA CLEANING (BEST DATA)
    # ==========================
    df.dropna(subset=["primary_type", "date", "location_description"], inplace=True)
    df.drop_duplicates(subset=["case_number"], inplace=True)

    # Convert date
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df.dropna(subset=["date"], inplace=True)

    print(f"Clean records available: {len(df)}")

    # ==========================
    # RANDOM SAMPLING
    # ==========================
    if len(df) > TARGET_ROWS:
        df = df.sample(n=TARGET_ROWS, random_state=42)

    # ==========================
    # SAVE CSV
    # ==========================
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n‚úÖ DATASET READY!")
    print(f"üìÅ File saved as: {OUTPUT_FILE}")
    print(f"üìä Rows: {len(df)} | Columns: {df.shape[1]}")

if __name__ == "__main__":
    main()


üöî PatrolIQ Dataset Builder Started...

Fetching records from offset 0
Fetching records from offset 50000
Fetching records from offset 100000
Fetching records from offset 150000
Fetching records from offset 200000
Fetching records from offset 250000
Fetching records from offset 300000
Fetching records from offset 350000
Fetching records from offset 400000
Fetching records from offset 450000

Total records fetched: 500000
Clean records available: 497793

‚úÖ DATASET READY!
üìÅ File saved as: PatrolIQ_Chicago_Crime_500K.csv
üìä Rows: 497793 | Columns: 13
