In [1]:
import requests
import pandas as pd
import io

In [2]:
# CSV endpoint of the dataset on data.sfgov.org; the dataset id is kept
# separate so it is easy to spot (and swap) when pointing at another resource.
DATASET_ID = "wg3w-h783"
base_url = f"https://data.sfgov.org/resource/{DATASET_ID}.csv"

In [3]:
# Paging state for the download loop:
#   limit  - how many records are requested per call
#   offset - index of the first record of the next page (starts at the beginning)
limit = 50_000
offset = 0

La API tiene un límite de devolver 50000 filas de datos por vez. Si queremos obtener más de 50000 registros, tenemos que manejar esta limitación en nuestro código.

Podemos usar los parámetros ‘$offset’ y ‘$limit’ que provee la API para obtener todos los registros. Empezamos con un offset de 0 y un límite de 50000 (o cualquier otro número hasta el límite máximo permitido por la API). Enviamos un pedido GET a la API con estos parámetros y obtenemos los primeros 50000 registros. Después aumentamos el offset por el límite (50000 en este caso) y enviamos otro pedido para obtener los siguientes 50000 registros. Repetimos este proceso hasta que la API no devuelva datos, lo que significa que hemos obtenido todos los registros.

In [4]:
# Accumulator for the downloaded records; starts with no rows and no columns.
df = pd.DataFrame(data=None)

In [5]:
# Download the dataset page by page and append the rows to `df`.
#
# Starts at the current `offset`, requests `limit` rows per call, and stops at
# the first empty page (everything fetched) or the first non-200 response.
# Pages are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all previously fetched rows every time.
pages = []

try:
    while True:
        # `$order=:id` gives the API a stable sort, so consecutive pages
        # neither overlap nor skip rows. Passing the query as `params` lets
        # requests take care of URL-encoding.
        query = {"$limit": limit, "$offset": offset, "$order": ":id"}

        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(base_url, params=query, timeout=120)

        # Anything other than 200 ends the download; rows fetched so far are kept.
        if response.status_code != 200:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            break

        # Parse the CSV payload of this page.
        data = pd.read_csv(io.StringIO(response.text))

        # If no data is returned, we've fetched all records.
        if data.empty:
            break

        pages.append(data)

        # Move on to the next page.
        offset += limit

        print(f"offset is {offset}")
finally:
    # Runs even if a request raises (timeout, interrupt, ...): `offset` has
    # already advanced past the collected pages, so they must land in `df`.
    # Empty frames are skipped so the initial empty `df` does not take part
    # in the concat.
    frames = [frame for frame in [df, *pages] if not frame.empty]
    if frames:
        df = pd.concat(frames, ignore_index=True)

offset is 50000
offset is 100000
offset is 150000
offset is 200000
offset is 250000
offset is 300000
offset is 350000
offset is 400000
offset is 450000
offset is 500000
offset is 550000
offset is 600000
offset is 650000
offset is 700000
offset is 750000
offset is 800000
offset is 850000


In [7]:
# Peek at two random records, flipped so each column becomes a row
# (easier to read for a wide table).
df.sample(n=2).T

Unnamed: 0,294935,282174
incident_datetime,2022-09-03T21:30:00.000,2022-05-31T11:30:00.000
incident_date,2022-09-03T00:00:00.000,2022-05-31T00:00:00.000
incident_time,21:30,11:30
incident_year,2022,2022
incident_day_of_week,Saturday,Tuesday
report_datetime,2022-09-03T21:48:00.000,2022-05-31T15:32:00.000
row_id,118899712030,115623228165
incident_id,1188997,1156232
incident_number,220598881,220356322
cad_number,222463041.0,221512039.0


Add the weather (temperature) for the date of each incident

In [None]:
import datetime
import os

# San Francisco coordinates used for every weather lookup.
SF_LAT, SF_LON = 37.7749, -122.4194

# Historical lookups need the One Call "timemachine" endpoint; the
# current-weather endpoint (/data/2.5/weather) does not accept a past `dt`.
WEATHER_URL = 'https://api.openweathermap.org/data/3.0/onecall/timemachine'


def fetch_weather(date):
    """Return the temperature in San Francisco at the start of ``date``.

    Parameters
    ----------
    date : str
        A date whose first ten characters are ``YYYY-MM-DD``. Both plain dates
        and timestamps such as ``2022-09-03T00:00:00.000`` (the form seen in
        ``incident_date``) are accepted; anything after the day is ignored.

    Returns
    -------
    The ``temp`` value reported by the API, in the API's default units
    (no ``units`` parameter is sent) -- presumably Kelvin, confirm against
    the OpenWeatherMap docs.

    Raises
    ------
    KeyError
        If the ``OPENWEATHERMAP_API_KEY`` environment variable is not set.
    ValueError
        If ``date`` does not start with a ``YYYY-MM-DD`` date.
    requests.HTTPError
        If the API answers with an error status.
    """
    # Convert date to Unix timestamp (midnight in the machine's local timezone).
    day = datetime.datetime.strptime(str(date)[:10], '%Y-%m-%d')
    timestamp = int(day.timestamp())

    response = requests.get(
        WEATHER_URL,
        params={
            'lat': SF_LAT,
            'lon': SF_LON,
            'dt': timestamp,
            # The key comes from the environment so it never ends up in the notebook.
            'appid': os.environ['OPENWEATHERMAP_API_KEY'],
        },
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of hitting a confusing KeyError below.
    response.raise_for_status()

    data = response.json()
    return data['data'][0]['temp']


# One API call per distinct day, not per incident.
unique_dates = df['incident_date'].dropna().unique()

# Fetch weather data for each unique date
weather_data = {date: fetch_weather(date) for date in unique_dates}

# Create a new DataFrame from the weather data (index = date, one 'weather' column)
weather_df = pd.DataFrame.from_dict(weather_data, orient='index', columns=['weather'])

# Merge the weather data with the original DataFrame.
# how='left' keeps every incident even when its date has no weather value, and
# dropping a previous 'weather' column makes the cell safe to re-run.
df = df.drop(columns='weather', errors='ignore').merge(
    weather_df, how='left', left_on='incident_date', right_index=True
)