In [13]:
import os
import random
from datetime import datetime
import csv
import pandas as pd
from sklearn.linear_model import LinearRegression

# Folder that receives the generated CSV files (created if it does not exist yet)
output_dir = "sensor_data/csv"
os.makedirs(output_dir, exist_ok=True)

# Ask interactively how many readings to simulate; non-integer input makes int() raise ValueError
num_iterations = int(input("Enter number of air quality readings to generate: "))

# One timestamp per reading, spaced one minute apart, starting at 2025-01-01
timestamps = pd.date_range("2025-01-01", periods=num_iterations, freq="1min")

In [14]:
def classify_health_environment(voc, nh3, h2s, no2):
    """Classify one reading as "Fresh", "Noticeable" or "Stinky".

    Every value that is present (not ``None``) and strictly greater than its
    threshold counts as one exceedance.

    Args:
        voc: VOC reading, or None if missing.
        nh3: NH3 reading, or None if missing.
        h2s: H2S reading, or None if missing.
        no2: NO2 reading, or None if missing.

    Returns:
        "Fresh" for no exceedance, "Noticeable" for exactly one,
        "Stinky" for two or more.
    """
    # (reading, threshold) pairs, checked in this order
    limits = (
        (nh3, 5),       # NH3 (ammonia), e.g. urine smell
        (h2s, 0.005),   # H2S (hydrogen sulfide), foul-smelling gas
        (voc, 450),     # VOC (volatile organic compounds), general odor source
        (no2, 300),     # NO2 (nitrogen dioxide), poor ventilation; never exceeded
                        # in practice, NO2 is odorless (threshold was 250 before)
    )
    exceeded = sum(
        1 for reading, threshold in limits
        if reading is not None and reading > threshold
    )

    if exceeded >= 2:
        return "Stinky"
    return "Noticeable" if exceeded == 1 else "Fresh"

In [None]:
# Build one synthetic air quality reading for a given timestamp index
def generate_air_quality_data(index, sensor_id=3):
    """Create a single random air quality reading.

    Args:
        index: Position in the module-level ``timestamps`` index whose
            timestamp is assigned to the reading.
        sensor_id: Identifier stored in the reading (defaults to 3).

    Returns:
        dict with the keys ``sensor_id``, ``timestamp`` (ISO 8601 string),
        ``nh3``, ``h2s``, ``voc``, ``no2`` (``None`` in roughly 20% of the
        readings) and ``health_environment`` (lower-cased classification).
    """
    reading = {
        "sensor_id": sensor_id,
        "timestamp": timestamps[index].to_pydatetime().isoformat(),
    }

    reading["nh3"] = round(random.uniform(0, 10), 2)
    reading["h2s"] = round(random.uniform(0, 0.01), 4)
    reading["voc"] = round(random.uniform(50, 600), 2)

    # Decide first whether the NO2 value is present, then draw it; about 20%
    # of the readings carry no NO2 value at all.
    if random.random() > 0.2:
        reading["no2"] = round(random.uniform(0, 300), 2)
    else:
        reading["no2"] = None

    label = classify_health_environment(
        reading["voc"], reading["nh3"], reading["h2s"], reading["no2"]
    )
    reading["health_environment"] = label.lower()
    return reading

In [16]:
# Impute only the no2 column using the mean, median or mode of its observed values
def impute_missing_values(df, strategy='mean'):
    """Fill missing ``no2`` values and round the whole column to 2 decimals.

    The frame is modified in place (its ``no2`` column is reassigned) and the
    same object is returned.

    Args:
        df: DataFrame with a numeric ``no2`` column that may contain NaN.
        strategy: ``'mean'``, ``'median'`` or ``'mode'``. Any other value
            leaves ``df`` completely unchanged (no filling, no rounding).

    Returns:
        The DataFrame that was passed in.

    Raises:
        KeyError: if ``df`` has no ``no2`` column.
    """
    no2 = df['no2']

    if strategy == 'mean':
        filled = no2.fillna(no2.mean())
    elif strategy == 'median':
        filled = no2.fillna(no2.median())
    elif strategy == 'mode':
        # Series.mode() returns a Series (one entry per tied mode, sorted).
        # Passing that Series to fillna would align on index labels and fill
        # only rows labelled 0..k-1, so a single scalar is used instead: the
        # smallest mode. With no observed values there is nothing to fill.
        modes = no2.mode()
        filled = no2.fillna(modes.iloc[0]) if not modes.empty else no2
    else:
        # Regression-based imputation was deliberately left out: there is no
        # direct relationship between no2 and the other values to predict from.
        return df

    df['no2'] = filled.round(2)
    return df

Daten in CSV speichern

In [17]:
# Write a list of reading dicts to a CSV file inside the output directory
def save_air_quality_csv(filename, data):
    """Save ``data`` as a CSV file named ``filename`` under ``output_dir``.

    Nothing is written (and nothing is printed) when ``data`` is empty.

    Args:
        filename: Name of the CSV file, relative to the module-level
            ``output_dir``.
        data: Sequence of dicts; the keys of the first dict define the
            header and column order.
    """
    if data:
        filepath = os.path.join(output_dir, filename)
        columns = list(data[0].keys())

        # newline="" lets the csv module control line endings itself
        with open(filepath, mode="w", newline="", encoding="utf-8") as csv_file:
            dict_writer = csv.DictWriter(csv_file, fieldnames=columns)
            dict_writer.writeheader()
            dict_writer.writerows(data)

        print(f"Saved Air Quality CSV: {filepath}")

# Simulate one reading per timestamp and deliberately inject duplicate rows
readings = []
for i in range(num_iterations):
    air_quality_data = generate_air_quality_data(i)

    # Readings at index 3, 6, 9, ... are echoed and stored twice so that the
    # raw data contains duplicate rows; all other readings appear once.
    occurrences = 2 if (i != 0 and i % 3 == 0) else 1
    for copy_number in range(occurrences):
        print(air_quality_data)
        readings.append(air_quality_data)

# Collect all readings (including the injected duplicates) in a DataFrame
df = pd.DataFrame(readings)

# Write the raw, still uncleaned data (with duplicates) to disk
raw_records = df.to_dict(orient='records')
save_air_quality_csv("air_quality_readings_errors.csv", raw_records)

{'sensor_id': 3, 'timestamp': '2025-01-01T00:00:00', 'nh3': 2.68, 'h2s': 0.0058, 'voc': 198.11, 'no2': 243.08, 'health_environment': 'noticeable'}
{'sensor_id': 3, 'timestamp': '2025-01-01T00:01:00', 'nh3': 2.98, 'h2s': 0.003, 'voc': 54.63, 'no2': 273.27, 'health_environment': 'fresh'}
{'sensor_id': 3, 'timestamp': '2025-01-01T00:02:00', 'nh3': 6.02, 'h2s': 0.0051, 'voc': 435.33, 'no2': 172.12, 'health_environment': 'stinky'}
{'sensor_id': 3, 'timestamp': '2025-01-01T00:03:00', 'nh3': 5.79, 'h2s': 0.0024, 'voc': 580.43, 'no2': None, 'health_environment': 'stinky'}
{'sensor_id': 3, 'timestamp': '2025-01-01T00:03:00', 'nh3': 5.79, 'h2s': 0.0024, 'voc': 580.43, 'no2': None, 'health_environment': 'stinky'}
{'sensor_id': 3, 'timestamp': '2025-01-01T00:04:00', 'nh3': 9.21, 'h2s': 0.0055, 'voc': 356.79, 'no2': 176.04, 'health_environment': 'stinky'}
{'sensor_id': 3, 'timestamp': '2025-01-01T00:05:00', 'nh3': 8.96, 'h2s': 0.0017, 'voc': 164.31, 'no2': 298.6, 'health_environment': 'noticeable'}

Doppelte Werte entfernen und fehlende ergänzen:

In [18]:
# Drop rows that are exact duplicates of an earlier row
df = df.drop_duplicates()

# Fill the missing no2 values with the column mean
# (better, but more susceptible to outliers)
df = impute_missing_values(df, "mean")

# Write the cleaned data (duplicates removed, no2 imputed) to disk
cleaned_records = df.to_dict(orient='records')
save_air_quality_csv("air_quality_readings.csv", cleaned_records)

Saved Air Quality CSV: sensor_data/csv\air_quality_readings.csv
