In [33]:
import requests
import time
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from dotenv import load_dotenv
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

load_dotenv()

True

### Verbindung zur Datenbank aufbauen und Daten aus der Tabelle laden

In [34]:
host = os.getenv('HOST')
port = os.getenv('PORT')
user = os.getenv('USER')
password = os.getenv('PASSWORD')
database= os.getenv('DATABASE')

In [44]:
# Verbindung zur Datenbank herstellen
connection = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)
print("Verbindung zur Datenbank erfolgreich hergestellt.")

# Einen Cursor erstellen, um SQL-Abfragen auszuführen
cursor = connection.cursor()

# SQL SELECT-Abfrage
select_query = "SELECT * FROM value;"

# Abfrage ausführen
cursor.execute(select_query)

# Ergebnis abrufen
rows = cursor.fetchall()

# Ergebnis in ein Pandas DataFrame umwandeln
df = pd.DataFrame(rows, columns=[desc[0] for desc in cursor.description])

# Cursor und Verbindung schließen
cursor.close()
connection.close()


Verbindung zur Datenbank erfolgreich hergestellt.


In [47]:
df_new = df

### Random Forest Modell

In [42]:
# 1. Convert the time column to datetime objects
df['time'] = pd.to_datetime(df['time'])

# 2. Feature Engineering with Lags
lags = [1, 2, 3]  # Lags for 10, 20, and 30 minutes
for lag in lags:
    df[f'Temperatur_lag_{lag*10}'] = df.groupby('senseid_fk')['temperature'].shift(lag)

# 3. Create target variables for each prediction horizon
for minutes in [10, 20, 30]:
    shift = minutes // 10  # Calculate the shift based on 10-minute intervals
    df[f'Temperatur_in_{minutes}_Minuten'] = df.groupby('senseid_fk')['temperature'].shift(-shift)

# 4. Remove rows with missing values (introduced by lagging and shifting)
#df.dropna(inplace=True)

In [37]:
# 5. Define target variables and create a dictionary for models
target_variables = ['Temperatur_in_10_Minuten', 'Temperatur_in_20_Minuten', 'Temperatur_in_30_Minuten']
models = {}

# 6. Train and evaluate models for each target variable
for target in target_variables:
    # 6.1. Select features and target variable
    X = df.drop(target_variables + ["time","index"], axis=1)
    y = df[target]
    
    # 6.2. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 6.3. Define the preprocessing for the 'senseid_fk' column
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['senseid_fk'])
        ],
        remainder='passthrough'
    )
    
    # 6.4. Create the pipeline with preprocessing and model training
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=100, max_depth=40, random_state=42, n_jobs=-1))
    ])
    
    # 6.5. Train the model
    pipeline.fit(X_train, y_train)
    
    # 6.6. Make predictions and evaluate
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"\n{target}:")
    print(f"  Mean Squared Error: {mse}")
    print(f"  Mean Absolute Error: {mae}")
    
    # 6.7. Store the trained model
    models[target] = pipeline


Temperatur_in_10_Minuten:
  Mean Squared Error: 0.11370882209558632
  Mean Absolute Error: 0.11445594621593094

Temperatur_in_20_Minuten:
  Mean Squared Error: 0.2736110550013331
  Mean Absolute Error: 0.1825184812926819

Temperatur_in_30_Minuten:
  Mean Squared Error: 0.4031769946027392
  Mean Absolute Error: 0.23847418424732283


In [38]:
minuten_10 = models["Temperatur_in_10_Minuten"]
minuten_20 = models["Temperatur_in_20_Minuten"]
minuten_30 = models["Temperatur_in_30_Minuten"]

### Predicten

In [61]:
station = '5ae4a726223bd80019a367a6'

In [68]:
# Verbindung zur Datenbank herstellen
connection = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)
print("Verbindung zur Datenbank erfolgreich hergestellt.")

# Einen Cursor erstellen, um SQL-Abfragen auszuführen
cursor = connection.cursor()

# SQL SELECT-Abfrage
select_query = """
    SELECT *
    FROM (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY senseid_fk ORDER BY time DESC) AS r
        FROM value
    ) sub
    WHERE r <= 4;
"""

# Abfrage ausführen
cursor.execute(select_query)

# Ergebnis abrufen
rows = cursor.fetchall()

# Ergebnis in ein Pandas DataFrame umwandeln
df_new = pd.DataFrame(rows, columns=[desc[0] for desc in cursor.description])

# Cursor und Verbindung schließen
cursor.close()
connection.close()


Verbindung zur Datenbank erfolgreich hergestellt.


In [69]:
df_new

Unnamed: 0,index,time,temperature,luftfeuchtigkeit,pm10,pm2_5,senseid_fk,r
0,1195389,2024-06-07 15:20:06,18.3,96.8,10.1,9.2,590e0b0a51d3460011c725c4,1
1,1195381,2024-06-07 15:10:07,18.7,95.9,9.53,8.33,590e0b0a51d3460011c725c4,2
2,1195373,2024-06-07 15:00:07,19.0,94.8,9.4,8.5,590e0b0a51d3460011c725c4,3
3,1195365,2024-06-07 14:50:05,19.2,94.2,10.03,8.9,590e0b0a51d3460011c725c4,4
4,1195384,2024-06-07 15:20:03,26.2,42.0,7.37,6.0,5ae4a726223bd80019a367a6,1
5,1195376,2024-06-07 15:10:04,26.3,43.5,6.7,5.13,5ae4a726223bd80019a367a6,2
6,1195368,2024-06-07 15:00:05,26.5,43.9,6.73,5.9,5ae4a726223bd80019a367a6,3
7,1195360,2024-06-07 14:50:03,26.5,45.8,7.7,6.07,5ae4a726223bd80019a367a6,4
8,1195387,2024-06-07 15:20:05,18.33,76.93,1.3,1.2,5b4d11485dc1ec001b5452c7,1
9,1195379,2024-06-07 15:10:06,18.43,78.26,2.17,1.27,5b4d11485dc1ec001b5452c7,2


In [73]:
# Zeitspalte in datetime-Objekte umwandeln
df_new['time'] = pd.to_datetime(df_new['time'])

# Berechne die Lags für die neuen Daten
lags = [1, 2, 3]  # Lags für 10, 20 und 30 Minuten
for lag in lags:
    df_new[f'Temperatur_lag_{lag*10}'] = df_new.groupby('senseid_fk')['temperature'].shift(lag)

# Fehlende Werte durch geeignete Werte ersetzen, falls notwendig
#df_all.fillna(method='ffill', inplace=True)

# Liste der einzigartigen Stationen
stations = df_new['senseid_fk'].unique()

# Dictionary zum Speichern der Vorhersagen für jede Station
all_predictions = {}

for station in stations:
    # Daten für die aktuelle Station auswählen
    df_station = df_new[df_new['senseid_fk'] == station]
    
    # Neueste Zeile für Vorhersagen auswählen
    latest_row = df_station.iloc[[-1]]
    
    # Vorhersagen treffen
    predictions = {}
    for target in target_variables:
        model = models[target]
        # Die Features für das Modell auswählen (identisch zu den Trainingsdaten)
        X_new = latest_row.drop(target_variables + ["time", "index"], axis=1, errors='ignore')
        
        # Vorhersage durchführen
        prediction = model.predict(X_new)
        predictions[target] = prediction[0]
    
    # Ergebnisse für die aktuelle Station speichern
    all_predictions[station] = predictions

# Ergebnisse in ein DataFrame umwandeln
df_predictions = pd.DataFrame(all_predictions)

# Ergebnisse anzeigen
df_predictions

Unnamed: 0,590e0b0a51d3460011c725c4,5ae4a726223bd80019a367a6,5b4d11485dc1ec001b5452c7,5c08379b1c28f9001a3f580c,5dde9523ba7944001da4150e,605f498077a88b001bba3dc0,64722d1c9be0580007f776d9,65e8d93acbf5700007f920ca
Temperatur_in_10_Minuten,19.2897,26.6162,18.517162,30.285,24.908987,22.770002,22.4979,25.400009
Temperatur_in_20_Minuten,19.5756,26.6542,18.514278,30.452,24.949254,22.757584,22.536,25.399911
Temperatur_in_30_Minuten,19.5565,26.8739,18.458549,30.68,24.956299,22.747073,22.4522,25.40003


### Mit GridSearch und CV

In [39]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import mean_squared_error, mean_absolute_error

# # 5. Define target variables and create a dictionary for models
# target_variables = ['Temperatur_in_10_Minuten', 'Temperatur_in_20_Minuten', 'Temperatur_in_30_Minuten']
# models = {}

# # Hyperparameter grid for Random Forest
# param_grid = {
#     'n_estimators': [300, 400],
#     'max_depth': [30, 40, 50, 60, 70],
# }

# # 6. Train and evaluate models for each target variable
# for target in target_variables:
#     # 6.1. Select features and target variable
#     X = df.drop(target_variables + ["time", "senseid_fk"], axis=1)
#     y = df[target]

#     # 6.2. Split data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     # 6.3. Perform GridSearchCV to find the best hyperparameters
#     rf = RandomForestRegressor(random_state=42)
#     grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
#     grid_search.fit(X_train, y_train)

#     # 6.4. Get the best model from grid search
#     best_model = grid_search.best_estimator_

#     # 6.5. Make predictions and evaluate
#     y_pred = best_model.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)
#     print(f"\n{target}:")
#     print(f"  Best Parameters: {grid_search.best_params_}")
#     print(f"  Mean Squared Error: {mse}")
#     print(f"  Mean Absolute Error: {mae}")

#     # 6.6. Store the trained model
#     models[target] = best_model
