In [1]:
from datetime import datetime
from meteostat import Point, Daily, Hourly
import pandas as pd
from meteostat import Stations

In [2]:
current_date = datetime.now()
start = datetime(current_date.year -1, current_date.month, current_date.day)
end = datetime(current_date.year, current_date.month, current_date.day, current_date.hour-1, current_date.minute, current_date.second)

print(start)
print(end)

2024-08-28 00:00:00
2025-08-28 18:46:33


In [3]:
from retrieval import Location

In [4]:
loc = Location("Calw")

In [5]:
station = loc.fetch_station()

station

Unnamed: 0_level_0,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7M2XN,Calw-hirsau,DE,BW,,,48.745,8.7301,329.0,Europe/Berlin,2018-05-23,2025-08-14,NaT,NaT,NaT,NaT,3704.963251


In [6]:
id = station.index.to_numpy()[0]
id

'7M2XN'

In [57]:
data = Hourly(id, start, end)
data = data.fetch()

df = pd.DataFrame(data)
df.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-08-28 00:00:00,16.6,14.2,86.0,0.0,,140.0,7.6,,1017.8,,
2024-08-28 01:00:00,,,,0.0,,,,,,,
2024-08-28 02:00:00,,,,0.0,,,,,,,
2024-08-28 03:00:00,,,,0.0,,,,,,,
2024-08-28 04:00:00,,,,0.0,,,,,,,


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8776 entries, 2024-08-28 00:00:00 to 2025-08-28 18:00:00
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   temp    8422 non-null   Float64
 1   dwpt    8422 non-null   Float64
 2   rhum    8422 non-null   Float64
 3   prcp    8775 non-null   Float64
 4   snow    0 non-null      Float64
 5   wdir    8422 non-null   Float64
 6   wspd    8422 non-null   Float64
 7   wpgt    0 non-null      Float64
 8   pres    8422 non-null   Float64
 9   tsun    0 non-null      Float64
 10  coco    8349 non-null   Float64
dtypes: Float64(11)
memory usage: 917.0 KB


In [59]:
na_cols = [col for col in df.columns if df[col].count() < 1000]
df = df.drop(na_cols, axis = 1)

In [60]:
# coco and temperature can be the dependent variables

In [61]:
coco_dict = {
    1: "Clear",
    2: "Fair",
    3: "Cloudy",
    4: "Overcast",
    5: "Fog",
    6: "Freezing Fog",
    7: "Light Rain",
    8: "Rain",
    9: "Heavy Rain",
    10: "Freezing Rain",
    11: "Heavy Freezing Rain",
    12: "Sleet",
    13: "Heavy Sleet",
    14: "Light Snowfall",
    15: "Snowfall",
    16: "Heavy Snowfall",
    17: "Rain Shower",
    18: "Heavy Rain Shower",
    19: "Sleet Shower",
    20: "Heavy Sleet Shower",
    21: "Snow Shower",
    22: "Heavy Snow Shower",
    23: "Lightning",
    24: "Hail",
    25: "Thunderstorm",
    26: "Heavy Thunderstorm",
    27: "Storm"
}


In [127]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [87]:
horizon = 24

In [88]:
def create_lagged_feature(df, lag = horizon):
    for i in range(1, horizon):
        df[f"coco_t+{i}"] = df["coco"].shift(i)
    
    return df

In [89]:
df_lagged = create_lagged_feature(df)

In [90]:
df_lagged.dropna(inplace = True)

In [91]:
df_lagged

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco,coco_t+1,coco_t+2,...,coco_t+14,coco_t+15,coco_t+16,coco_t+17,coco_t+18,coco_t+19,coco_t+20,coco_t+21,coco_t+22,coco_t+23
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-06 00:00:00,17.4,16.3,93.0,7.2,267.0,16.2,1010.5,7.0,3.0,9.0,...,1.0,2.0,2.0,3.0,5.0,3.0,3.0,3.0,3.0,3.0
2024-09-06 01:00:00,17.4,16.3,93.0,0.2,241.0,14.0,1010.6,9.0,7.0,3.0,...,2.0,1.0,2.0,2.0,3.0,5.0,3.0,3.0,3.0,3.0
2024-09-06 02:00:00,17.2,16.2,94.0,0.0,227.0,11.5,1010.8,9.0,9.0,7.0,...,17.0,2.0,1.0,2.0,2.0,3.0,5.0,3.0,3.0,3.0
2024-09-06 03:00:00,16.6,15.8,95.0,0.0,190.0,9.0,1010.6,9.0,9.0,9.0,...,17.0,17.0,2.0,1.0,2.0,2.0,3.0,5.0,3.0,3.0
2024-09-06 04:00:00,16.6,15.6,94.0,0.0,244.0,6.1,1010.3,3.0,9.0,9.0,...,18.0,17.0,17.0,2.0,1.0,2.0,2.0,3.0,5.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-08-28 02:00:00,18.0,17.7,98.0,1.7,180.0,13.0,1007.5,9.0,9.0,8.0,...,7.0,17.0,1.0,1.0,1.0,3.0,3.0,3.0,18.0,3.0
2025-08-28 03:00:00,18.0,17.4,96.0,1.3,189.0,8.6,1006.8,9.0,9.0,9.0,...,3.0,7.0,17.0,1.0,1.0,1.0,3.0,3.0,3.0,18.0
2025-08-28 04:00:00,17.7,17.4,98.0,1.0,196.0,10.8,1006.1,9.0,9.0,9.0,...,8.0,3.0,7.0,17.0,1.0,1.0,1.0,3.0,3.0,3.0
2025-08-28 05:00:00,17.9,17.4,97.0,0.3,202.0,12.6,1005.9,8.0,9.0,9.0,...,3.0,8.0,3.0,7.0,17.0,1.0,1.0,1.0,3.0,3.0


In [134]:
models = {
    "RFC" : RandomForestClassifier(random_state=42, n_estimators=50),
    "CATB" : CatBoostClassifier(iterations = 100),
    "LGBM" : LGBMClassifier(objective="multiclass", num_class=27),
    }

In [135]:
def train_and_predict(horizon, models):
    
    results = {name: {} for name in  models}
    
    for name, model in models.items():
        
        X = df.drop([f"coco_t+{i}" for i in range(1, horizon)], axis = 1)

        for j in range(1, horizon):
            y = df[f"coco_t+{j}"]
            
            split = int(len(X) * 0.8)
            X_train, X_test = X.iloc[:split], X.iloc[split:]
            y_train, y_test = y.iloc[:split], y.iloc[split:]
            
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            
            acc = accuracy_score(y_test, predictions)
            
            results[name][j] = acc
            
    return results
    
    

In [136]:
train_and_predict(24, models)

Learning rate set to 0.5
0:	learn: 1.1844867	total: 7.96ms	remaining: 788ms
1:	learn: 1.3380145	total: 14.7ms	remaining: 720ms
2:	learn: 1.0537908	total: 19.1ms	remaining: 617ms
3:	learn: 0.9340309	total: 25.7ms	remaining: 617ms
4:	learn: 0.8891845	total: 32.2ms	remaining: 611ms
5:	learn: 0.8690305	total: 38.6ms	remaining: 605ms
6:	learn: 0.8442025	total: 45.3ms	remaining: 602ms
7:	learn: 0.8338628	total: 51.9ms	remaining: 597ms
8:	learn: 0.8197459	total: 57.9ms	remaining: 586ms
9:	learn: 0.8117783	total: 64.4ms	remaining: 580ms
10:	learn: 0.8010829	total: 71.9ms	remaining: 582ms
11:	learn: 0.7956627	total: 78.6ms	remaining: 577ms
12:	learn: 0.7918808	total: 84.9ms	remaining: 569ms
13:	learn: 0.7840972	total: 91.9ms	remaining: 564ms
14:	learn: 0.7809569	total: 98.3ms	remaining: 557ms
15:	learn: 0.7736373	total: 105ms	remaining: 553ms
16:	learn: 0.7690527	total: 113ms	remaining: 554ms
17:	learn: 0.7646350	total: 120ms	remaining: 549ms
18:	learn: 0.7599177	total: 129ms	remaining: 550ms
1

{'RFC': {1: 0.6373025516403402,
  2: 0.574726609963548,
  3: 0.5455650060753341,
  4: 0.5218712029161604,
  5: 0.4933171324422843,
  6: 0.4866342648845686,
  7: 0.4732685297691373,
  8: 0.4647630619684083,
  9: 0.48298906439854195,
  10: 0.4890643985419198,
  11: 0.49817739975698666,
  12: 0.47448359659781286,
  13: 0.4866342648845686,
  14: 0.4842041312272175,
  15: 0.4599027946537059,
  16: 0.44835965978128794,
  17: 0.42345078979343864,
  18: 0.4313487241798299,
  19: 0.42770352369380316,
  20: 0.39185905224787365,
  21: 0.370595382746051,
  22: 0.4113001215066829,
  23: 0.39003645200486026},
 'CATB': {1: 0.6324422843256379,
  2: 0.5814094775212637,
  3: 0.49635479951397327,
  4: 0.488456865127582,
  5: 0.4775212636695018,
  6: 0.47509113001215064,
  7: 0.4726609963547995,
  8: 0.4398541919805589,
  9: 0.4574726609963548,
  10: 0.46051032806804376,
  11: 0.4781287970838396,
  12: 0.4580801944106926,
  13: 0.44835965978128794,
  14: 0.44775212636695016,
  15: 0.425273390036452,
  16: