In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

In [5]:
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0,time,co2,temp,humid,count
0,1759147574,1100,26.65026,62.31479,9.0
1,1759147604,1109,25.30709,66.88182,9.0
2,1759147634,1104,24.61814,69.08675,9.0
3,1759147664,1103,24.15885,70.19302,9.0
4,1759147694,1099,23.80903,70.87358,9.0


In [6]:
df["timestamp"] = pd.to_datetime(df["time"], unit="s")
df = df.sort_values("timestamp")

df.head()

Unnamed: 0,time,co2,temp,humid,count,timestamp
0,1759147574,1100,26.65026,62.31479,9.0,2025-09-29 12:06:14
1,1759147604,1109,25.30709,66.88182,9.0,2025-09-29 12:06:44
2,1759147634,1104,24.61814,69.08675,9.0,2025-09-29 12:07:14
3,1759147664,1103,24.15885,70.19302,9.0,2025-09-29 12:07:44
4,1759147694,1099,23.80903,70.87358,9.0,2025-09-29 12:08:14


In [7]:
df["time_of_day_minute"] = df["timestamp"].dt.hour * 60 + df["timestamp"].dt.minute
df["day_of_week"] = df["timestamp"].dt.dayofweek

df.head()

Unnamed: 0,time,co2,temp,humid,count,timestamp,time_of_day_minute,day_of_week
0,1759147574,1100,26.65026,62.31479,9.0,2025-09-29 12:06:14,726,0
1,1759147604,1109,25.30709,66.88182,9.0,2025-09-29 12:06:44,726,0
2,1759147634,1104,24.61814,69.08675,9.0,2025-09-29 12:07:14,727,0
3,1759147664,1103,24.15885,70.19302,9.0,2025-09-29 12:07:44,727,0
4,1759147694,1099,23.80903,70.87358,9.0,2025-09-29 12:08:14,728,0


In [8]:
df["co2_ema_15"] = df["co2"].ewm(span=15, adjust=False).mean()
df["dco2"] = df["co2"].diff().fillna(0)

df.head()

Unnamed: 0,time,co2,temp,humid,count,timestamp,time_of_day_minute,day_of_week,co2_ema_15,dco2
0,1759147574,1100,26.65026,62.31479,9.0,2025-09-29 12:06:14,726,0,1100.0,0.0
1,1759147604,1109,25.30709,66.88182,9.0,2025-09-29 12:06:44,726,0,1101.125,9.0
2,1759147634,1104,24.61814,69.08675,9.0,2025-09-29 12:07:14,727,0,1101.484375,-5.0
3,1759147664,1103,24.15885,70.19302,9.0,2025-09-29 12:07:44,727,0,1101.673828,-1.0
4,1759147694,1099,23.80903,70.87358,9.0,2025-09-29 12:08:14,728,0,1101.3396,-4.0


In [10]:
features = ["co2","temp","humid","co2_ema_15","dco2","time_of_day_minute","day_of_week"]
X = df[features].fillna(0)
y = df["count"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [12]:
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

In [13]:
preds = np.clip(model.predict(X_test), 0, None).round()
mae = mean_absolute_error(y_test, preds)
print("MAE:", mae)

ValueError: Input contains NaN.