# Stream Learning

In [18]:
import numpy as np

import pandas as pd
from river import stream
import rich

In [40]:
data = pd.read_csv("./air.csv")
targets = data.pop("T")

temperature = stream.iter_csv("./air.csv", target="T")

data.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,39.48538,-6.837604
std,77.65717,329.83271,139.789093,41.380206,342.333252,257.433866,321.993552,126.940455,467.210125,456.938184,51.216145,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,0.6,921.0,-200.0,4.0,711.0,50.0,637.0,53.0,1185.0,700.0,34.1,0.6923
50%,1.5,1053.0,-200.0,7.9,895.0,141.0,794.0,96.0,1446.0,942.0,48.6,0.9768
75%,2.6,1221.0,-200.0,13.6,1105.0,284.0,960.0,133.0,1662.0,1255.0,61.9,1.2962
max,11.9,2040.0,1189.0,63.7,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,88.7,2.231


In [45]:
for col in data.columns:
    print(col)

data['Time'][0]

column_types = data.dtypes
print(column_types)

print(targets.dtypes)

Date
Time
CO(GT)
PT08.S1(CO)
NMHC(GT)
C6H6(GT)
PT08.S2(NMHC)
NOx(GT)
PT08.S3(NOx)
NO2(GT)
PT08.S4(NO2)
PT08.S5(O3)
RH
AH
Date              object
Time              object
CO(GT)           float64
PT08.S1(CO)      float64
NMHC(GT)         float64
C6H6(GT)         float64
PT08.S2(NMHC)    float64
NOx(GT)          float64
PT08.S3(NOx)     float64
NO2(GT)          float64
PT08.S4(NO2)     float64
PT08.S5(O3)      float64
RH               float64
AH               float64
dtype: object
float64


In [99]:
from river import compose, linear_model, preprocessing, metrics, tree, evaluate, utils, model_selection
from datetime import datetime
from river.stream import iter_pandas

def get_date_features(x):
    month = int(x['Date'].split("/")[1])
    season = (month % 12)//3

    time_format = "%H:%M:%S"
    datetime_object = datetime.strptime(x['Time'], time_format)

    hour_of_day = datetime_object.hour

    time_cos = np.cos(hour_of_day)
    time_sin = np.sin(hour_of_day)

    return {'month':month, 'season':season, 'time_cos': time_cos, 'time_sin':time_sin}

model = compose.Pipeline(
    ('features', compose.TransformerUnion(
        ('date_features', compose.FuncTransformer(get_date_features)),
    )),
    ('drop_non_features', compose.Discard('Date', 'Time')),
    ('scale', preprocessing.StandardScaler()),
    # tree.HoeffdingAdaptiveTreeRegressor(
    #     grace_period=50,
    #     model_selector_decay=0.3,
    # )
    linear_model.LinearRegression()
)

models = utils.expand_param_grid(model, {
    'LinearRegression': {
        'optimizer': [
            (optim.SGD, {'lr': [.1, .01, .005]}),
            (optim.Adam, {'beta_1': [.01, .001], 'lr': [.1, .01, .001]}),
            (optim.Adam, {'beta_1': [.1], 'lr': [.001]}),
        ]
    }
})

sh = model_selection.SuccessiveHalvingRegressor(
    models,
    metric=metrics.MAE(),
    budget=2000,
    eta=2,
    verbose=True
)

metric = metrics.MAE()

river_stream = stream.iter_pandas(X=data, y=targets)

evaluate.progressive_val_score(
    dataset=river_stream,
    model=sh,
    metric=metrics.MAE()
)

# for x, y in river_stream:
#     # Make a prediction without using the target
#     y_pred = model.predict_one(x)

#     # Update the model using the target
#     model.learn_one(x, y)

#     # Update the metric using the out-of-fold prediction
#     metric.update(y, y_pred)

# evaluate.progressive_val_score(river_stream, model, metric)

#print("x:", x, "Metric:", metric)

[1]	5 removed	5 left	50 iterations	budget used: 500	budget left: 1500	best MAE: 6.94481
[2]	2 removed	3 left	100 iterations	budget used: 1000	budget left: 1000	best MAE: 6.07929
[3]	1 removed	2 left	166 iterations	budget used: 1498	budget left: 502	best MAE: 5.543158
[4]	1 removed	1 left	250 iterations	budget used: 1998	budget left: 2	best MAE: 4.772874
x: {'Date': '04/04/2005', 'Time': '14:00:00', 'CO(GT)': 2.2, 'PT08.S1(CO)': 1071.0, 'NMHC(GT)': -200.0, 'C6H6(GT)': 11.9, 'PT08.S2(NMHC)': 1047.0, 'NOx(GT)': 265.0, 'PT08.S3(NOx)': 654.0, 'NO2(GT)': 168.0, 'PT08.S4(NO2)': 1129.0, 'PT08.S5(O3)': 816.0, 'RH': 13.1, 'AH': 0.5028} Metric: MAE: 0.
