In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import requests
import json
import math

In [52]:
# Load the raw sensor export, keep the three columns used below, and parse
# the timestamp column. Source timestamps look like "01/01/2023 12:00:00 AM";
# values that do not match the format become NaT (errors="coerce").
df = (
    pd.read_csv("aq_sensor_data.csv")
    .loc[:, ['IdSensore', 'Data', 'Valore']]
    .assign(
        Data=lambda frame: pd.to_datetime(
            frame['Data'],
            format="%d/%m/%Y %I:%M:%S %p",
            errors="coerce",
        )
    )
)

In [53]:
# Order the readings by timestamp, keep those from 2024-01-01 onwards,
# and renumber the rows from zero.
df = (
    df.sort_values(by='Data')
    .loc[lambda sorted_frame: sorted_frame['Data'] >= '2024-1-1']
    .reset_index(drop=True)
)
df

Unnamed: 0,IdSensore,Data,Valore
0,5579,2024-01-01 00:00:00,21.3
1,12017,2024-01-01 00:00:00,27.7
2,9971,2024-01-01 00:00:00,0.6
3,6246,2024-01-01 00:00:00,24.4
4,6280,2024-01-01 00:00:00,26.2
...,...,...,...
199517,20494,2024-01-29 10:00:00,187.4
199518,20493,2024-01-29 10:00:00,341.6
199519,20491,2024-01-29 10:00:00,41.1
199520,20465,2024-01-29 10:00:00,3.8


In [54]:
# Station/sensor table; its IdSensore column is compared against the readings below.
complete_air_quality_stations = pd.read_csv(
    filepath_or_buffer="complete_air_quality_stations.csv"
)

In [58]:
# Distinct sensor ids seen in the readings and in the station table, respectively.
sensors_in_data = [*df["IdSensore"].unique()]
stations_sensors = [*complete_air_quality_stations["IdSensore"].unique()]

In [62]:
# Find sensors that appear in the readings but have no entry in the station table.
# A set gives constant-time membership checks; the list comprehension keeps the
# ids in the same order as they appear in `sensors_in_data`.
known_station_sensors = set(stations_sensors)
missing_sensors = [
    data_sensor
    for data_sensor in sensors_in_data
    if data_sensor not in known_station_sensors
]

missing_sensors

[20525, 20564, 20565]

In [63]:
# Remove observations of sensors that have no matching station record.
for sensor in missing_sensors:
    print(sensor)

# One vectorised mask instead of re-filtering the whole frame once per sensor.
df = df.loc[~df["IdSensore"].isin(missing_sensors)]

df

20525
20564
20565


Unnamed: 0,IdSensore,Data,Valore
0,5579,2024-01-01 00:00:00,21.3
1,12017,2024-01-01 00:00:00,27.7
2,9971,2024-01-01 00:00:00,0.6
3,6246,2024-01-01 00:00:00,24.4
4,6280,2024-01-01 00:00:00,26.2
...,...,...,...
199517,20494,2024-01-29 10:00:00,187.4
199518,20493,2024-01-29 10:00:00,341.6
199519,20491,2024-01-29 10:00:00,41.1
199520,20465,2024-01-29 10:00:00,3.8


In [64]:
def arrange_timestamp(date_time):
    """Return the string form of ``date_time`` (whatever ``str()`` produces for it)."""
    timestamp_text = str(date_time)
    return timestamp_text

In [69]:
# Upload the readings to the measurements endpoint in fixed-size batches.
base_url_measurements = "http://127.0.0.1:8000/api/collections/airqualitymeasurement/"

batch_size = 1000
# Upper bound (seconds) per request so a stalled server cannot hang the notebook;
# requests raises an exception if it is exceeded.
request_timeout_s = 120
batches = math.ceil(len(df) / batch_size)
# batches = 1  # uncomment to smoke-test the upload with a single batch
print(f"Number of batches: {batches}")

# Indices of batches the server did not accept, reported after the loop.
failed_batches = []

for i in range(batches):
    i_start = i * batch_size
    i_end = min(len(df), i_start + batch_size)
    print(f'from {i_start} to {i_end}')

    timeseries_subset = df.iloc[i_start:i_end][['IdSensore', 'Data', 'Valore']]
    timeseries_subset = timeseries_subset.assign(
        Data=timeseries_subset['Data'].apply(arrange_timestamp)
    )
    # JSON has no NaN literal, so missing values are sent as null.
    timeseries_subset = timeseries_subset.replace({np.nan: None})

    # .tolist() yields plain Python scalars, which json.dumps can serialise.
    body = {
        "bulk": True,
        "items": [
            {"sensor_id_id": sensor_id, "date": date, "value": value}
            for sensor_id, date, value in zip(
                timeseries_subset['IdSensore'].tolist(),
                timeseries_subset['Data'].tolist(),
                timeseries_subset['Valore'].tolist(),
            )
        ],
    }

    body_json = json.dumps(body)
    req = requests.post(base_url_measurements, body_json, timeout=request_timeout_s)
    if not req.ok:
        # Surface every 4xx/5xx response, not only HTTP 500.
        print(req)
        failed_batches.append(i)
    print(f'request {i} - {req.status_code}')

if failed_batches:
    print(f'{len(failed_batches)} batch(es) failed: {failed_batches}')

Number of batches: 199
from 0 to 1000
request 1004 - 200
from 1000 to 2000
request 2008 - 200
from 2000 to 3000
request 3012 - 200
from 3000 to 4000
request 4015 - 200
from 4000 to 5000
request 5019 - 200
from 5000 to 6000
request 6022 - 200
from 6000 to 7000
request 7026 - 200
from 7000 to 8000
request 8031 - 200
from 8000 to 9000
request 9035 - 200
from 9000 to 10000
request 10038 - 200
from 10000 to 11000
request 11041 - 200
from 11000 to 12000
request 12045 - 200
from 12000 to 13000
request 13049 - 200
from 13000 to 14000
request 14053 - 200
from 14000 to 15000
request 15057 - 200
from 15000 to 16000
request 16061 - 200
from 16000 to 17000
request 17064 - 200
from 17000 to 18000
request 18068 - 200
from 18000 to 19000
request 19071 - 200
from 19000 to 20000
request 20074 - 200
from 20000 to 21000
request 21078 - 200
from 21000 to 22000
request 22084 - 200
from 22000 to 23000
request 23087 - 200
from 23000 to 24000
request 24090 - 200
from 24000 to 25000
request 25094 - 200
from 250

In [35]:
# Build the payload for the first row only, to inspect the JSON sent to the API.
body = {
    "bulk": True,
    "items": []
}

timeseries_subset = df.iloc[0:1]
timeseries_subset = timeseries_subset[['IdSensore','Data','Valore']]
timeseries_subset['Data'] = timeseries_subset['Data'].apply(arrange_timestamp)
# JSON has no NaN literal, so missing values are sent as null.
timeseries_subset = timeseries_subset.replace({np.nan: None})

for row_index, sensor in timeseries_subset.iterrows():
    # Key is "sensor_id_id", matching the bulk-upload cell and this cell's saved output.
    item = { "sensor_id_id": sensor.IdSensore, "date": sensor.Data, "value": sensor.Valore }
    body["items"].append(item)
body_json = json.dumps(body)
body_json

'{"bulk": true, "items": [{"sensor_id_id": 5504, "date": "2023-01-01 00:00:00", "value": 42.1}]}'

In [55]:
# Per-row storage breakdown. All sizes are byte counts entered by hand;
# NOTE(review): their origin is not shown in this notebook — presumably
# database size queries; confirm before relying on them.
count_elements = 2816286

size_b = 431521792
size_id = 22530288
size_date = 22530288
size_value = 18191588
size_fk = 11265144
index_size = 83369984

print(f'{size_b / count_elements} bytes per element in table')
print()

column_sizes = (
    ("id", size_id),
    ("date", size_date),
    ("value", size_value),
    ("fk", size_fk),
)
for column_label, column_bytes in column_sizes:
    print(f'{column_bytes / count_elements} bytes per element in column {column_label}')

print()
total_size = sum(column_bytes for _label, column_bytes in column_sizes)
print(f'{total_size} total size in bytes')
print(f'{total_size / count_elements} bytes per data element')
print(f'{total_size/1000000} MB')
print()

print(f'{index_size / count_elements} bytes per element in index')

153.22371094412998 bytes per element in table

8.0 bytes per element in column id
8.0 bytes per element in column date
6.45942493056458 bytes per element in column value
4.0 bytes per element in column fk

74517308 total size in bytes
26.45942493056458 bytes per data element
74.517308 MB

29.60281164626036 bytes per element in index


In [52]:
# Estimated database footprint: total row count (sum of four hand-entered
# counts) times an estimated number of bytes per stored row.
total_data_AQ = 2800000 + 14800000 + 37900000 + 42900000

est_b_per_element = 154
# Use the named estimate rather than repeating the literal, so changing it
# in one place updates the result.
est_in_db = total_data_AQ * est_b_per_element
print(f'{est_in_db} bytes')
print(f'{est_in_db / 1000000} MB')
print(f'{est_in_db / 1000000000} GB')

15153600000 bytes
15153.6 MB
15.1536 GB


In [54]:
# Bare literal; same value as `index_size` in the storage breakdown cell.
# NOTE(review): the saved output of this cell does not match this expression —
# presumably the cell was edited after it last ran; re-run or remove it.
83369984

4278260.869565218