In [None]:
import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb


In [None]:
!pip install optuna


Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.5.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_set_path = '/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_synth_train.csv'
df_data = pd.read_csv(train_set_path)

In [None]:
df_data.head()

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,2019-11-07 00:00:07,21.26,57.74,109,531,489,705,659,600,122,0,0,0,0,0
1,2019-11-07 00:00:27,21.19,58.03,107,533,494,707,658,603,122,0,0,0,0,0
2,2019-11-07 00:00:47,21.11,57.53,107,530,491,708,656,596,121,1,0,0,0,0
3,2019-11-07 00:01:07,21.12,57.94,106,532,489,710,656,597,126,1,0,0,0,0
4,2019-11-07 00:01:27,21.06,58.18,103,532,490,705,658,597,130,1,0,0,0,0


In [None]:
df_data.tail()

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
304563,2020-01-17 03:51:43,22.05,55.2,102,530,441,692,651,583,108,0,0,0,0,0
304564,2020-01-17 03:52:03,21.93,55.13,104,529,441,694,651,583,108,0,0,0,0,0
304565,2020-01-17 03:52:23,22.09,55.12,101,529,443,694,651,584,109,0,0,0,0,0
304566,2020-01-17 03:52:43,22.04,55.22,101,529,441,694,651,585,108,0,0,0,0,0
304567,2020-01-17 03:53:03,21.97,55.22,100,528,440,694,651,584,109,0,0,0,0,0


In [None]:
df_data.shape

(304568, 15)

In [None]:
test_set_path = '/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_synth_test.csv'
df_test_data = pd.read_csv(test_set_path)

In [None]:
df_test_data.head()

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue
0,2020-01-17 03:53:23,21.98,55.12,102,530,444,695,652,586,109
1,2020-01-17 03:53:43,22.08,55.07,102,529,444,696,650,585,107
2,2020-01-17 03:54:03,22.14,55.13,102,528,442,694,651,583,108
3,2020-01-17 03:54:23,22.07,55.21,102,529,442,694,651,585,108
4,2020-01-17 03:54:43,22.03,55.07,101,530,443,694,651,583,108


In [None]:
df_test_data.tail()

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue
64076,2020-01-31 23:58:37,22.85,52.94,73,535,467,691,647,572,129
64077,2020-01-31 23:58:57,22.79,52.99,72,535,467,691,646,572,129
64078,2020-01-31 23:59:17,22.74,53.05,72,535,467,691,646,572,129
64079,2020-01-31 23:59:37,22.71,53.42,73,535,467,691,646,572,128
64080,2020-01-31 23:59:57,22.71,53.42,72,535,467,691,646,572,128


In [None]:
target_cols = ['Living room',	'Bedroom',	'Bathroom',	'Kitchen',	'Hallway']
df_data = pl.read_csv(train_set_path, try_parse_dates=True)
df_target = df_data.select(target_cols)

schema_data = df_data.schema
schema_target = df_target.schema

# Features Generation

Polars library is faster than Pandas, which explains the choice.

In [None]:
# def generate_features(df_data):

#     df_data = (
#         df_data
#         .with_columns(
#             pl.col("timestamp").cast(pl.Date).alias("date"),
#         )
#     )

#     df_data = (
#         df_data
#         .with_columns(
#             pl.col("timestamp").dt.second().alias("second"),
#             pl.col("timestamp").dt.minute().alias("minute"),
#             pl.col("timestamp").dt.hour().alias("hour"),
#             #pl.col("timestamp").dt.day().alias("day"),
#             pl.col("timestamp").dt.weekday().alias("weekday"),


#         )

#         # cyclical features encoding https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca
#         .with_columns(
#             # cyclic hour
#             (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
#             (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
#             # cyclic minute
#             (np.pi * pl.col("minute") / 60).sin().alias("sin(minute)"),
#             (np.pi * pl.col("minute") / 60).cos().alias("cos(minute)"),
#             # cyclic second
#             (np.pi * pl.col("second") / 60).sin().alias("sin(second)"),
#             (np.pi * pl.col("second") / 60).cos().alias("cos(second)"),
#         )

#         .with_columns(
#             pl.col(pl.Float64).cast(pl.Float32),
#         )

#         .drop("date", "timestamp", "hour", 'minute', 'second')
#     )

#     return df_data

In [None]:
def generate_features(df_data):

    df_data = (
        df_data
        .with_columns(
            pl.col("timestamp").cast(pl.Date).alias("date"),
        )
    )

    df_data = (
        df_data
        .with_columns(
            pl.col("timestamp").dt.second().alias("second"),
            pl.col("timestamp").dt.minute().alias("minute"),
            pl.col("timestamp").dt.hour().alias("hour"),
            #pl.col("timestamp").dt.day().alias("day"),
            pl.col("timestamp").dt.weekday().alias("weekday"),


        )

        # cyclical features encoding https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca
        .with_columns(
            # cyclic hour
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            # cyclic minute
            (np.pi * pl.col("minute") / 60).sin().alias("sin(minute)"),
            (np.pi * pl.col("minute") / 60).cos().alias("cos(minute)"),
            # cyclic second
            (np.pi * pl.col("second") / 60).sin().alias("sin(second)"),
            (np.pi * pl.col("second") / 60).cos().alias("cos(second)"),
        )

        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )

        .drop("date", "timestamp", "hour", 'minute' 'second')
    )

    # Define sensor columns excluding room-specific and newly created cyclical features
    sensor_columns = [
        "temperature", "humidity", "CO2CosIRValue", "CO2MG811Value",
        "MOX1", "MOX2", "MOX3", "MOX4", "COValue", "weekday", 'cos(minute)'
    ]

    for sensor_column in sensor_columns:
        # Assuming you want to create a lag of 1 period (previous time step)
        df_data = df_data.with_columns(
            pl.col(sensor_column).shift(1).alias(f"{sensor_column}_lag1"),
            pl.col(sensor_column).shift(5).alias(f"{sensor_column}_lag2"),
            pl.col(sensor_column).shift(10).alias(f"{sensor_column}_lag3"),
            pl.col(sensor_column).shift(15).alias(f"{sensor_column}_lag4"),
            pl.col(sensor_column).shift(20).alias(f"{sensor_column}_lag5"),
            pl.col(sensor_column).shift(50).alias(f"{sensor_column}_lag6"),
            pl.col(sensor_column).shift(100).alias(f"{sensor_column}_lag7"),
        )

    return df_data

In [None]:
df_train_features = generate_features(df_data)

df_train_features = df_train_features.to_pandas()
# a little proportion of target values are null

df_train_features['is_weekend'] = df_train_features['weekday'].apply(lambda x: 1 if x >= 5 else 0)


In [None]:
df_train_features

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,...,weekday_lag6,weekday_lag7,cos(minute)_lag1,cos(minute)_lag2,cos(minute)_lag3,cos(minute)_lag4,cos(minute)_lag5,cos(minute)_lag6,cos(minute)_lag7,is_weekend
0,21.260000,57.740002,109,531,489,705,659,600,122,0,...,,,,,,,,,,0
1,21.190001,58.029999,107,533,494,707,658,603,122,0,...,,,1.000000,,,,,,,0
2,21.110001,57.529999,107,530,491,708,656,596,121,1,...,,,1.000000,,,,,,,0
3,21.120001,57.939999,106,532,489,710,656,597,126,1,...,,,1.000000,,,,,,,0
4,21.059999,58.180000,103,532,490,705,658,597,130,1,...,,,0.998630,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304563,22.049999,55.200001,102,530,441,692,651,583,108,0,...,5.0,5.0,-0.891007,-0.866025,-0.809017,-0.743145,-0.707107,-0.258819,0.707107,1
304564,21.930000,55.130001,104,529,441,694,651,583,108,0,...,5.0,5.0,-0.891007,-0.866025,-0.809017,-0.777146,-0.707107,-0.258819,0.707107,1
304565,22.090000,55.119999,101,529,443,694,651,584,109,0,...,5.0,5.0,-0.913545,-0.866025,-0.838671,-0.777146,-0.707107,-0.258819,0.669131,1
304566,22.040001,55.220001,101,529,441,694,651,585,108,0,...,5.0,5.0,-0.913545,-0.891007,-0.838671,-0.777146,-0.743145,-0.309017,0.669131,1


In [None]:
df_data = df_train_features
split_point = int(len(df_data) * 0.8)

# Split the DataFrame into train and test sets
train_data = df_data.iloc[:split_point]
test_data = df_data.iloc[split_point:]

# Define the columns to be scaled
# columns_to_scale = [
#     "temperature", "humidity", "CO2CosIRValue", "CO2MG811Value",
#     "MOX1", "MOX2", "MOX3", "MOX4", "COValue", 'weekday'
# ]

columns_to_scale_basic = ["temperature", "humidity", "CO2CosIRValue", "CO2MG811Value",
    "MOX1", "MOX2", "MOX3", "MOX4", "COValue", 'weekday']

columns_to_scale = columns_to_scale_basic.copy()

for sensor_column in columns_to_scale_basic:
  columns_to_scale.append(f"{sensor_column}_lag1")
  columns_to_scale.append(f"{sensor_column}_lag2")
  columns_to_scale.append(f"{sensor_column}_lag3")
  columns_to_scale.append(f"{sensor_column}_lag4")
  columns_to_scale.append(f"{sensor_column}_lag5")
  columns_to_scale.append(f"{sensor_column}_lag6")
  columns_to_scale.append(f"{sensor_column}_lag7")

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
train_data_scaled = train_data.copy()
train_data_scaled[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# Transform the test data using the same scaler
test_data_scaled = test_data.copy()
test_data_scaled[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Now, train_data_scaled and test_data_scaled have the specified features scaled
# You can proceed with using these datasets for training and testing your models

In [None]:
train_data_scaled

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,...,weekday_lag6,weekday_lag7,cos(minute)_lag1,cos(minute)_lag2,cos(minute)_lag3,cos(minute)_lag4,cos(minute)_lag5,cos(minute)_lag6,cos(minute)_lag7,is_weekend
0,0.125722,1.321917,0.165218,0.176153,0.116793,0.156828,0.017481,0.284453,0.363826,0,...,,,,,,,,,,0
1,0.090896,1.390922,0.096473,0.235222,0.226897,0.214967,-0.010176,0.352374,0.363826,0,...,,,1.000000,,,,,,,0
2,0.051096,1.271947,0.096473,0.146619,0.160835,0.244037,-0.065489,0.193891,0.326043,1,...,,,1.000000,,,,,,,0
3,0.056071,1.369507,0.062101,0.205688,0.116793,0.302177,-0.065489,0.216531,0.514959,1,...,,,1.000000,,,,,,,0
4,0.026220,1.426615,-0.041017,0.205688,0.138814,0.156828,-0.010176,0.216531,0.666091,1,...,,,0.998630,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243649,0.314775,1.417097,0.027728,-0.650805,0.248918,-1.412948,-1.116435,-1.390940,0.288260,0,...,0.50161,0.501674,0.358368,0.406737,0.50000,0.587785,0.629320,0.933580,-0.866025,1
243650,0.294875,1.414718,0.165218,-0.621271,0.226897,-1.442018,-1.116435,-1.413581,0.288260,0,...,0.50161,0.501674,0.358368,0.406737,0.50000,0.544639,0.629320,0.933580,-0.866025,1
243651,0.260050,1.405200,0.165218,-0.650805,0.270939,-1.442018,-1.088778,-1.436221,0.250477,0,...,0.50161,0.501674,0.309017,0.406737,0.45399,0.544639,0.629320,0.933580,-0.891007,1
243652,0.230199,1.417097,0.096473,-0.680339,0.270939,-1.412948,-1.116435,-1.436221,0.250477,0,...,0.50161,0.501674,0.309017,0.358368,0.45399,0.544639,0.587785,0.913545,-0.891007,1


# Classifier Chains

In [None]:
# For Classifier Chains we select the following order:
# Kitchen -> Living room -> Bedroom -> Hallway -> Bathroom
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

################### X/Y TRAIN division  ####################
Y_train = train_data_scaled[target_cols]

## Get the feature columns by dropping the target columns from the DataFrame
X_train = train_data_scaled.drop(columns=target_cols, errors='ignore')

################## X/Y TEST division  ######################

Y_test = test_data_scaled[target_cols]
X_test = test_data_scaled.drop(columns=target_cols, errors='ignore')

In [None]:
Y_train.head()

Unnamed: 0,Kitchen,Living room,Bedroom,Hallway,Bathroom
0,0,0,0,0,0
1,0,0,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0


In [None]:
X_train.head()

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,second,...,weekday_lag6,weekday_lag7,cos(minute)_lag1,cos(minute)_lag2,cos(minute)_lag3,cos(minute)_lag4,cos(minute)_lag5,cos(minute)_lag6,cos(minute)_lag7,is_weekend
0,0.125722,1.321917,0.165218,0.176153,0.116793,0.156828,0.017481,0.284453,0.363826,7,...,,,,,,,,,,0
1,0.090896,1.390922,0.096473,0.235222,0.226897,0.214967,-0.010176,0.352374,0.363826,27,...,,,1.0,,,,,,,0
2,0.051096,1.271947,0.096473,0.146619,0.160835,0.244037,-0.065489,0.193891,0.326043,47,...,,,1.0,,,,,,,0
3,0.056071,1.369507,0.062101,0.205688,0.116793,0.302177,-0.065489,0.216531,0.514959,7,...,,,1.0,,,,,,,0
4,0.02622,1.426615,-0.041017,0.205688,0.138814,0.156828,-0.010176,0.216531,0.666091,27,...,,,0.99863,,,,,,,0


In [None]:
# X_train.columns = X_train.columns.astype(str)
# Y_train.columns = Y_train.columns.astype(str)
# X_test.columns = X_test.columns.astype(str)
# Y_test.columns = Y_test.columns.astype(str)


In [None]:
Y_train_np = np.array(Y_train)

## Simple Logistic Regression no Cross Validation

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Initialize an empty list to store the trained models
classifiers = []

# Initialize the feature set for the chain, starting with the original features
X_chain = X_train.copy()

# Convert Y_train to a numpy array if it's a DataFrame
Y_train_np = Y_train.values if isinstance(Y_train, pd.DataFrame) else Y_train
Y_train_np



array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [None]:
Y_train_np.shape[1]

5

In [None]:
Y_train_np[:, 1]

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
for i in range(Y_train_np.shape[1]):
    # Train the logistic regression model for the current label
    clf = LogisticRegression()
    clf.fit(X_chain, Y_train_np[:, i])

    # Store the trained model
    classifiers.append(clf)

    # Add the predictions as a feature for the next classifier in the chain
    # This is a simplification, in practice you would use cross-validation predictions here

    predictions = clf.predict_proba(X_chain)[:, 1]

    # Create a new column name for the predictions
    # Make sure it's a string to keep consistency
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain
    X_chain[new_column_name] = predictions

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
X_chain

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,weekday,...,cos(hour),sin(minute),cos(minute),sin(second),cos(second),pred_0,pred_1,pred_2,pred_3,pred_4
0,0.125722,1.321917,0.165218,0.176153,0.116793,0.156828,0.017481,0.284453,0.363826,-0.006871,...,1.000000,0.000000,1.000000,0.358368,0.933580,0.060208,0.071551,0.073035,0.064308,0.083805
1,0.090896,1.390922,0.096473,0.235222,0.226897,0.214967,-0.010176,0.352374,0.363826,-0.006871,...,1.000000,0.000000,1.000000,0.987688,0.156434,0.058065,0.077024,0.076174,0.060577,0.070527
2,0.051096,1.271947,0.096473,0.146619,0.160835,0.244037,-0.065489,0.193891,0.326043,-0.006871,...,1.000000,0.000000,1.000000,0.629320,-0.777146,0.044005,0.044207,0.051681,0.038688,0.053155
3,0.056071,1.369507,0.062101,0.205688,0.116793,0.302177,-0.065489,0.216531,0.514959,-0.006871,...,1.000000,0.052336,0.998630,0.358368,0.933580,0.051755,0.054560,0.064306,0.045767,0.060644
4,0.026220,1.426615,-0.041017,0.205688,0.138814,0.156828,-0.010176,0.216531,0.666091,-0.006871,...,1.000000,0.052336,0.998630,0.987688,0.156434,0.059631,0.063186,0.070627,0.054135,0.073917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243649,0.314775,1.417097,0.027728,-0.650805,0.248918,-1.412948,-1.116435,-1.390940,0.288260,0.501545,...,0.965926,0.933580,0.358368,0.707107,-0.707107,0.031097,0.008830,0.021111,0.031762,0.055466
243650,0.294875,1.414718,0.165218,-0.621271,0.226897,-1.442018,-1.116435,-1.413581,0.288260,0.501545,...,0.965926,0.951057,0.309017,0.258819,0.965926,0.036508,0.010379,0.024631,0.038266,0.068065
243651,0.260050,1.405200,0.165218,-0.650805,0.270939,-1.442018,-1.088778,-1.436221,0.250477,0.501545,...,0.965926,0.951057,0.309017,0.965926,0.258819,0.034701,0.008909,0.022325,0.035588,0.068732
243652,0.230199,1.417097,0.096473,-0.680339,0.270939,-1.412948,-1.116435,-1.436221,0.250477,0.501545,...,0.965926,0.951057,0.309017,0.707107,-0.707107,0.030316,0.007795,0.020248,0.030277,0.056479


In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_train[target_cols[0]], X_chain['pred_0'])
AUC2 = roc_auc_score(Y_train[target_cols[1]], X_chain['pred_1'])
AUC3 = roc_auc_score(Y_train[target_cols[2]], X_chain['pred_2'])
AUC4 = roc_auc_score(Y_train[target_cols[3]], X_chain['pred_3'])
AUC5 = roc_auc_score(Y_train[target_cols[4]], X_chain['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.6915479107873295
AUC score for LivingR = 0.79727444937801
AUC score for Bedroom = 0.8367296848805226
AUC score for Hallway = 0.7978368514294711
AUC score for Bathroom= 0.7139260881833897


In [None]:
print(f'Mean AUC score on Training Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Training Data = 0.7674629969317446


In [None]:
X_chain_test = X_test.copy()

for i, clf in enumerate(classifiers):
    # Predict the probability of activity for the current room
    predictions = clf.predict_proba(X_chain_test)[:, 1]

    # Create a new column name for the predictions
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain_test
    X_chain_test[new_column_name] = predictions

    # Convert all column names to strings to avoid the TypeError
    X_chain_test.columns = X_chain_test.columns.astype(str)


In [None]:
X_chain_test

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,weekday,...,cos(hour),sin(minute),cos(minute),sin(second),cos(second),pred_0,pred_1,pred_2,pred_3,pred_4
243654,0.274975,1.445651,0.096473,-0.621271,0.248918,-1.383878,-1.088778,-1.390940,0.174911,0.501545,...,0.965926,0.965926,0.258819,0.965926,0.258819,0.032684,0.008715,0.022330,0.033317,0.058808
243655,0.235174,1.379025,0.027728,-0.650805,0.226897,-1.442018,-1.061122,-1.413581,0.212694,0.501545,...,0.965926,0.965926,0.258819,0.707107,-0.707107,0.031103,0.007666,0.020297,0.030852,0.059068
243656,0.319751,1.379025,0.096473,-0.650805,0.182856,-1.500158,-1.144091,-1.458862,0.212694,0.501545,...,0.965926,0.978148,0.207912,0.258819,0.965926,0.034394,0.009118,0.022948,0.036313,0.064447
243657,0.314775,1.305260,0.027728,-0.650805,0.248918,-1.442018,-1.144091,-1.436221,0.250477,0.501545,...,0.965926,0.978148,0.207912,0.965926,0.258819,0.032228,0.008306,0.021004,0.031829,0.060075
243658,0.289900,1.333815,0.062101,-0.562202,0.204876,-1.412948,-1.061122,-1.436221,0.212694,0.501545,...,0.965926,0.978148,0.207912,0.707107,-0.707107,0.030255,0.006959,0.019238,0.028570,0.058261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304563,0.518754,0.717521,-0.075390,0.146619,-0.940205,-0.221081,-0.203771,-0.100435,-0.165138,0.501545,...,0.707107,0.453990,-0.891007,0.777146,-0.629320,0.041931,0.027564,0.044901,0.034072,0.072387
304564,0.459054,0.700865,-0.006645,0.117085,-0.940205,-0.162942,-0.203771,-0.100435,-0.165138,0.501545,...,0.707107,0.406737,-0.913545,0.156434,0.987688,0.044767,0.028596,0.048200,0.036319,0.078954
304565,0.538655,0.698485,-0.109763,0.117085,-0.896163,-0.162942,-0.203771,-0.077795,-0.127354,0.501545,...,0.707107,0.406737,-0.913545,0.933580,0.358368,0.043594,0.028681,0.045779,0.035408,0.076208
304566,0.513780,0.722280,-0.109763,0.117085,-0.940205,-0.162942,-0.203771,-0.055154,-0.165138,0.501545,...,0.707107,0.406737,-0.913545,0.777146,-0.629320,0.040741,0.028525,0.045242,0.033436,0.068346


In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_test[target_cols[0]], X_chain_test['pred_0'])
AUC2 = roc_auc_score(Y_test[target_cols[1]], X_chain_test['pred_1'])
AUC3 = roc_auc_score(Y_test[target_cols[2]], X_chain_test['pred_2'])
AUC4 = roc_auc_score(Y_test[target_cols[3]], X_chain_test['pred_3'])
AUC5 = roc_auc_score(Y_test[target_cols[4]], X_chain_test['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.76521024637227
AUC score for LivingR = 0.5753315705531772
AUC score for Bedroom = 0.6011211030043493
AUC score for Hallway = 0.681755653450739
AUC score for Bathroom= 0.5661742605827644


In [None]:
print(f'Mean AUC score on Training Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Training Data = 0.63791856679266


## Simple LightGBM no Cross Validation no Optuna





In [None]:
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score
import numpy as np
import optuna

# Initialize an empty list to store the trained models
classifiers = []

# Initialize the feature set for the chain, starting with the original features
X_chain = X_train.copy()

# Convert Y_train to a numpy array if it's a DataFrame
Y_train_np = Y_train.values if isinstance(Y_train, pd.DataFrame) else Y_train
Y_train_np



array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [None]:
for i in range(Y_train_np.shape[1]):
    # Train the lightGBM model for the current label
    clf = lgb.LGBMClassifier(max_depth = 7)
    clf.fit(X_chain, Y_train_np[:, i])

    # Store the trained model
    classifiers.append(clf)

    # Add the predictions as a feature for the next classifier in the chain
    # This is a simplification, in practice you would use cross-validation predictions here

    predictions = clf.predict_proba(X_chain)[:, 1]

    # Create a new column name for the predictions
    # Make sure it's a string to keep consistency
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain
    X_chain[new_column_name] = predictions

[LightGBM] [Info] Number of positive: 22045, number of negative: 221609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2152
[LightGBM] [Info] Number of data points in the train set: 243654, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090477 -> initscore=-2.307829
[LightGBM] [Info] Start training from score -2.307829
[LightGBM] [Info] Number of positive: 10002, number of negative: 233652
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2407
[LightGBM] [Info] Number of data points in the train set: 243654, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.041050 -> initscore=-3.15104

In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_train[target_cols[0]], X_chain['pred_0'])
AUC2 = roc_auc_score(Y_train[target_cols[1]], X_chain['pred_1'])
AUC3 = roc_auc_score(Y_train[target_cols[2]], X_chain['pred_2'])
AUC4 = roc_auc_score(Y_train[target_cols[3]], X_chain['pred_3'])
AUC5 = roc_auc_score(Y_train[target_cols[4]], X_chain['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.9444164519803693
AUC score for LivingR = 0.9223216453982072
AUC score for Bedroom = 0.9486367936562481
AUC score for Hallway = 0.930256296820576
AUC score for Bathroom= 0.9121560777284088


In [None]:
print(f'Mean AUC score on Training Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Training Data = 0.931557453116762


In [None]:
X_chain_test = X_test.copy()

for i, clf in enumerate(classifiers):
    # Predict the probability of activity for the current room
    predictions = clf.predict_proba(X_chain_test)[:, 1]

    # Create a new column name for the predictions
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain_test
    X_chain_test[new_column_name] = predictions

    # Convert all column names to strings to avoid the TypeError
    X_chain_test.columns = X_chain_test.columns.astype(str)




In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_test[target_cols[0]], X_chain_test['pred_0'])
AUC2 = roc_auc_score(Y_test[target_cols[1]], X_chain_test['pred_1'])
AUC3 = roc_auc_score(Y_test[target_cols[2]], X_chain_test['pred_2'])
AUC4 = roc_auc_score(Y_test[target_cols[3]], X_chain_test['pred_3'])
AUC5 = roc_auc_score(Y_test[target_cols[4]], X_chain_test['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.7638501325413404
AUC score for LivingR = 0.6835542756641703
AUC score for Bedroom = 0.6234832803983223
AUC score for Hallway = 0.6567181552787785
AUC score for Bathroom= 0.597235522579228


In [None]:
print(f'Mean AUC score on Testing Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Training Data = 0.6649682732923679


## LightGBM with Optuna


In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit

def lgb_objective_production(trial, X_train, y_train):
    params = {
        'n_estimators': 300,
        'verbose': -1,
        'random_state': 42,
        'objective': 'binary',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('lambda_l1', 1e-4, 10.0),
        'reg_lambda': trial.suggest_float('lambda_l2', 1e-4, 10.0),
        'min_child_samples': trial.suggest_int('min_data_in_leaf', 4, 256),
        'max_depth': trial.suggest_int('max_depth', 5, 16),
        'max_bin': trial.suggest_int('max_bin', 32, 1024),
    }

    model = lgb.LGBMClassifier(**params)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring='roc_auc')
    return np.mean(scores)

def optimize_hyperparameters(X_train, y_train):
    study = optuna.create_study(direction='maximize')
    objective = lambda trial: lgb_objective_production(trial, X_train, y_train)
    study.optimize(objective, n_trials=10)  # Adjust the number of trials as needed
    print("Best trial:")
    print(study.best_trial)
    return study.best_params

# Define the columns order for the target based on your strategy
target_cols = ['Kitchen', 'Living room', 'Bedroom', 'Hallway', 'Bathroom']

# Storage for the best parameters for each target
best_params_per_target = {}

# Optimize hyperparameters for each target
for target in target_cols:
    print(f"Optimizing for {target}")
    best_params = optimize_hyperparameters(X_train, Y_train[target])
    best_params_per_target[target] = best_params

# Now train the final classifier chain with optimized hyperparameters
classifiers = []
X_chain = X_train.copy()

for i, target in enumerate(target_cols):
    params = best_params_per_target[target]
    clf = lgb.LGBMClassifier(**params)
    clf.fit(X_chain, Y_train[target])

    # Store the trained model
    classifiers.append(clf)

    # Add the predictions as a feature for the next classifier in the chain
    predictions = clf.predict_proba(X_chain)[:, 1]
    X_chain[f'pred_{target}'] = predictions

# Now you can evaluate your classifier chain as before


[I 2024-02-03 20:52:46,334] A new study created in memory with name: no-name-9b72fdd7-7efd-415a-a112-77945c823b61


Optimizing for Kitchen


[I 2024-02-03 20:54:30,918] Trial 0 finished with value: 0.7043450873444186 and parameters: {'learning_rate': 0.018599146446765354, 'colsample_bytree': 0.6352776291714152, 'colsample_bynode': 0.6779895699423257, 'lambda_l1': 7.845585404085856, 'lambda_l2': 7.801034699663641, 'min_data_in_leaf': 20, 'max_depth': 9, 'max_bin': 364}. Best is trial 0 with value: 0.7043450873444186.
[I 2024-02-03 20:56:14,591] Trial 1 finished with value: 0.7074246632390002 and parameters: {'learning_rate': 0.05137985834330245, 'colsample_bytree': 0.700172652605222, 'colsample_bynode': 0.6538646272592009, 'lambda_l1': 7.769699251815669, 'lambda_l2': 5.9897137982075614, 'min_data_in_leaf': 150, 'max_depth': 12, 'max_bin': 615}. Best is trial 1 with value: 0.7074246632390002.
[I 2024-02-03 20:57:58,115] Trial 2 finished with value: 0.7167300370904484 and parameters: {'learning_rate': 0.06198412809074967, 'colsample_bytree': 0.8722703653329802, 'colsample_bynode': 0.7369772402241518, 'lambda_l1': 6.54781337170

Best trial:
FrozenTrial(number=2, state=TrialState.COMPLETE, values=[0.7167300370904484], datetime_start=datetime.datetime(2024, 2, 3, 20, 56, 14, 593527), datetime_complete=datetime.datetime(2024, 2, 3, 20, 57, 58, 115421), params={'learning_rate': 0.06198412809074967, 'colsample_bytree': 0.8722703653329802, 'colsample_bynode': 0.7369772402241518, 'lambda_l1': 6.547813371706801, 'lambda_l2': 6.8707764962528906, 'min_data_in_leaf': 39, 'max_depth': 7, 'max_bin': 499}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=False, low=0.005, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.3, step=None), 'colsample_bynode': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'lambda_l1': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'min_data_in_leaf': IntDistribution(high=256, log=False, low=4,

[I 2024-02-03 21:10:35,435] Trial 0 finished with value: 0.6892372678111822 and parameters: {'learning_rate': 0.09408430622725157, 'colsample_bytree': 0.8916520957785858, 'colsample_bynode': 0.578446480666374, 'lambda_l1': 8.065801718205977, 'lambda_l2': 3.9265073019954384, 'min_data_in_leaf': 103, 'max_depth': 14, 'max_bin': 60}. Best is trial 0 with value: 0.6892372678111822.
[I 2024-02-03 21:11:40,097] Trial 1 finished with value: 0.6725226464893689 and parameters: {'learning_rate': 0.016845407236813725, 'colsample_bytree': 0.34850482929538024, 'colsample_bynode': 0.60977384111607, 'lambda_l1': 4.272299166172777, 'lambda_l2': 6.942180879413256, 'min_data_in_leaf': 239, 'max_depth': 11, 'max_bin': 156}. Best is trial 0 with value: 0.6892372678111822.
[I 2024-02-03 21:13:23,403] Trial 2 finished with value: 0.6786865068124157 and parameters: {'learning_rate': 0.09442821622191971, 'colsample_bytree': 0.8190742729714353, 'colsample_bynode': 0.8797219011271985, 'lambda_l1': 5.10637714351

Best trial:
FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.6892372678111822], datetime_start=datetime.datetime(2024, 2, 3, 21, 9, 13, 79317), datetime_complete=datetime.datetime(2024, 2, 3, 21, 10, 35, 435353), params={'learning_rate': 0.09408430622725157, 'colsample_bytree': 0.8916520957785858, 'colsample_bynode': 0.578446480666374, 'lambda_l1': 8.065801718205977, 'lambda_l2': 3.9265073019954384, 'min_data_in_leaf': 103, 'max_depth': 14, 'max_bin': 60}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=False, low=0.005, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.3, step=None), 'colsample_bynode': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'lambda_l1': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'min_data_in_leaf': IntDistribution(high=256, log=False, low=4, s

[I 2024-02-03 21:22:50,419] Trial 0 finished with value: 0.7255808973671216 and parameters: {'learning_rate': 0.05518845506174439, 'colsample_bytree': 0.5569267705649534, 'colsample_bynode': 0.6354168549949175, 'lambda_l1': 1.6571744262557995, 'lambda_l2': 7.921290189763891, 'min_data_in_leaf': 195, 'max_depth': 11, 'max_bin': 627}. Best is trial 0 with value: 0.7255808973671216.
[I 2024-02-03 21:23:46,347] Trial 1 finished with value: 0.7190868756492902 and parameters: {'learning_rate': 0.07520126897999327, 'colsample_bytree': 0.3396260219542454, 'colsample_bynode': 0.7318624375863715, 'lambda_l1': 3.081143379636005, 'lambda_l2': 0.9251343439839868, 'min_data_in_leaf': 161, 'max_depth': 9, 'max_bin': 626}. Best is trial 0 with value: 0.7255808973671216.
[I 2024-02-03 21:25:43,038] Trial 2 finished with value: 0.725099335867738 and parameters: {'learning_rate': 0.01897511081765667, 'colsample_bytree': 0.8298805125422861, 'colsample_bynode': 0.5563827873129787, 'lambda_l1': 6.0375500310

Best trial:
FrozenTrial(number=8, state=TrialState.COMPLETE, values=[0.7290861926160102], datetime_start=datetime.datetime(2024, 2, 3, 21, 31, 58, 946919), datetime_complete=datetime.datetime(2024, 2, 3, 21, 33, 40, 188409), params={'learning_rate': 0.03506093115895017, 'colsample_bytree': 0.8817664314308815, 'colsample_bynode': 0.641803207460583, 'lambda_l1': 1.6934489777933779, 'lambda_l2': 7.269856936057833, 'min_data_in_leaf': 184, 'max_depth': 9, 'max_bin': 349}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=False, low=0.005, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.3, step=None), 'colsample_bynode': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'lambda_l1': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'min_data_in_leaf': IntDistribution(high=256, log=False, low=4,

[I 2024-02-03 21:36:33,705] Trial 0 finished with value: 0.6840916488818862 and parameters: {'learning_rate': 0.0742757809415729, 'colsample_bytree': 0.7356487128984555, 'colsample_bynode': 0.8246739081791972, 'lambda_l1': 1.1734855061208083, 'lambda_l2': 9.09956498984056, 'min_data_in_leaf': 238, 'max_depth': 8, 'max_bin': 228}. Best is trial 0 with value: 0.6840916488818862.
[I 2024-02-03 21:38:15,016] Trial 1 finished with value: 0.6933263450388634 and parameters: {'learning_rate': 0.03281557230048624, 'colsample_bytree': 0.9288037693789355, 'colsample_bynode': 0.5028488912602349, 'lambda_l1': 2.8131981812557894, 'lambda_l2': 2.7459497929349945, 'min_data_in_leaf': 192, 'max_depth': 15, 'max_bin': 302}. Best is trial 1 with value: 0.6933263450388634.
[I 2024-02-03 21:39:34,858] Trial 2 finished with value: 0.6888508583673019 and parameters: {'learning_rate': 0.08832856906084821, 'colsample_bytree': 0.8487366374975538, 'colsample_bynode': 0.804128939576451, 'lambda_l1': 2.11387164122

Best trial:
FrozenTrial(number=3, state=TrialState.COMPLETE, values=[0.700532265005161], datetime_start=datetime.datetime(2024, 2, 3, 21, 39, 34, 860262), datetime_complete=datetime.datetime(2024, 2, 3, 21, 40, 40, 66587), params={'learning_rate': 0.06980049342544237, 'colsample_bytree': 0.40898655648040316, 'colsample_bynode': 0.666483532895658, 'lambda_l1': 9.29572107629556, 'lambda_l2': 1.263103832080631, 'min_data_in_leaf': 75, 'max_depth': 12, 'max_bin': 161}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=False, low=0.005, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.3, step=None), 'colsample_bynode': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'lambda_l1': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'min_data_in_leaf': IntDistribution(high=256, log=False, low=4, st

[I 2024-02-03 21:50:19,485] Trial 0 finished with value: 0.6159428062024036 and parameters: {'learning_rate': 0.04346107316882029, 'colsample_bytree': 0.6742684166658582, 'colsample_bynode': 0.5597204739300955, 'lambda_l1': 6.800460232560322, 'lambda_l2': 2.0500529687372238, 'min_data_in_leaf': 220, 'max_depth': 10, 'max_bin': 471}. Best is trial 0 with value: 0.6159428062024036.
[I 2024-02-03 21:51:30,320] Trial 1 finished with value: 0.6234914177364864 and parameters: {'learning_rate': 0.032205922867640985, 'colsample_bytree': 0.4987011998600954, 'colsample_bynode': 0.8699669814150692, 'lambda_l1': 1.4540618505176355, 'lambda_l2': 6.21152322047457, 'min_data_in_leaf': 159, 'max_depth': 12, 'max_bin': 398}. Best is trial 1 with value: 0.6234914177364864.
[I 2024-02-03 21:52:49,341] Trial 2 finished with value: 0.6169360853189421 and parameters: {'learning_rate': 0.072895789783145, 'colsample_bytree': 0.5547377152894415, 'colsample_bynode': 0.6312594285858004, 'lambda_l1': 5.8929587961

Best trial:
FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.6234914177364864], datetime_start=datetime.datetime(2024, 2, 3, 21, 50, 19, 487503), datetime_complete=datetime.datetime(2024, 2, 3, 21, 51, 30, 320126), params={'learning_rate': 0.032205922867640985, 'colsample_bytree': 0.4987011998600954, 'colsample_bynode': 0.8699669814150692, 'lambda_l1': 1.4540618505176355, 'lambda_l2': 6.21152322047457, 'min_data_in_leaf': 159, 'max_depth': 12, 'max_bin': 398}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=False, low=0.005, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.3, step=None), 'colsample_bynode': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'lambda_l1': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=False, low=0.0001, step=None), 'min_data_in_leaf': IntDistribution(high=256, log=False, low=

In [None]:
X_chain

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,weekday,...,cos(hour),sin(minute),cos(minute),sin(second),cos(second),pred_Kitchen,pred_Living room,pred_Bedroom,pred_Hallway,pred_Bathroom
0,0.125722,1.321917,0.165218,0.176153,0.116793,0.156828,0.017481,0.284453,0.363826,-0.006871,...,1.000000,0.000000,1.000000,0.358368,0.933580,0.037286,0.099591,0.048346,0.182040,0.301921
1,0.090896,1.390922,0.096473,0.235222,0.226897,0.214967,-0.010176,0.352374,0.363826,-0.006871,...,1.000000,0.000000,1.000000,0.987688,0.156434,0.036670,0.113938,0.046502,0.156036,0.324236
2,0.051096,1.271947,0.096473,0.146619,0.160835,0.244037,-0.065489,0.193891,0.326043,-0.006871,...,1.000000,0.000000,1.000000,0.629320,-0.777146,0.035984,0.111912,0.046346,0.159710,0.306720
3,0.056071,1.369507,0.062101,0.205688,0.116793,0.302177,-0.065489,0.216531,0.514959,-0.006871,...,1.000000,0.052336,0.998630,0.358368,0.933580,0.039808,0.175499,0.091963,0.321627,0.499131
4,0.026220,1.426615,-0.041017,0.205688,0.138814,0.156828,-0.010176,0.216531,0.666091,-0.006871,...,1.000000,0.052336,0.998630,0.987688,0.156434,0.039884,0.154752,0.057139,0.218136,0.410411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243649,0.314775,1.417097,0.027728,-0.650805,0.248918,-1.412948,-1.116435,-1.390940,0.288260,0.501545,...,0.965926,0.933580,0.358368,0.707107,-0.707107,0.038239,0.010096,0.018544,0.026741,0.021030
243650,0.294875,1.414718,0.165218,-0.621271,0.226897,-1.442018,-1.116435,-1.413581,0.288260,0.501545,...,0.965926,0.951057,0.309017,0.258819,0.965926,0.039940,0.010157,0.019067,0.027235,0.020364
243651,0.260050,1.405200,0.165218,-0.650805,0.270939,-1.442018,-1.088778,-1.436221,0.250477,0.501545,...,0.965926,0.951057,0.309017,0.965926,0.258819,0.038812,0.010096,0.020545,0.026801,0.020336
243652,0.230199,1.417097,0.096473,-0.680339,0.270939,-1.412948,-1.116435,-1.436221,0.250477,0.501545,...,0.965926,0.951057,0.309017,0.707107,-0.707107,0.039196,0.010096,0.020170,0.026966,0.020457


In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']
AUC_scores = []
for target in target_cols:
  AUC_scores.append(roc_auc_score(Y_train[target], X_chain[f'pred_{target}']))

In [None]:
print(f'AUC score for Kitchen = {AUC_scores[0]}')
print(f'AUC score for LivingR = {AUC_scores[1]}')
print(f'AUC score for Bedroom = {AUC_scores[2]}')
print(f'AUC score for Hallway = {AUC_scores[3]}')
print(f'AUC score for Bathroom= {AUC_scores[4]}')

AUC score for Kitchen = 0.9427004543578718
AUC score for LivingR = 0.9141837456041224
AUC score for Bedroom = 0.9195742638121147
AUC score for Hallway = 0.9164507698037545
AUC score for Bathroom= 0.8929855902974202


In [None]:
print(f'Mean AUC score on Training Data = {np.average(AUC_scores)}')

Mean AUC score on Training Data = 0.9171789647750567


In [None]:
X_chain_test = X_test.copy()
X_chain_test

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,second,...,weekday_lag6,weekday_lag7,cos(minute)_lag1,cos(minute)_lag2,cos(minute)_lag3,cos(minute)_lag4,cos(minute)_lag5,cos(minute)_lag6,cos(minute)_lag7,is_weekend
243654,0.274975,1.445651,0.096473,-0.621271,0.248918,-1.383878,-1.088778,-1.390940,0.174911,25,...,0.50161,0.501674,0.258819,0.358368,0.406737,0.500000,0.587785,0.913545,-0.913545,1
243655,0.235174,1.379025,0.027728,-0.650805,0.226897,-1.442018,-1.061122,-1.413581,0.212694,45,...,0.50161,0.501674,0.258819,0.309017,0.406737,0.500000,0.544639,0.891007,-0.913545,1
243656,0.319751,1.379025,0.096473,-0.650805,0.182856,-1.500158,-1.144091,-1.458862,0.212694,5,...,0.50161,0.501674,0.258819,0.309017,0.406737,0.453990,0.544639,0.891007,-0.913545,1
243657,0.314775,1.305260,0.027728,-0.650805,0.248918,-1.442018,-1.144091,-1.436221,0.250477,25,...,0.50161,0.501674,0.207912,0.309017,0.358368,0.453990,0.544639,0.891007,-0.933580,1
243658,0.289900,1.333815,0.062101,-0.562202,0.204876,-1.412948,-1.061122,-1.436221,0.212694,45,...,0.50161,0.501674,0.207912,0.258819,0.358368,0.453990,0.500000,0.866025,-0.933580,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304563,0.518754,0.717521,-0.075390,0.146619,-0.940205,-0.221081,-0.203771,-0.100435,-0.165138,43,...,0.50161,0.501674,-0.891007,-0.866025,-0.809017,-0.743145,-0.707107,-0.258819,0.707107,1
304564,0.459054,0.700865,-0.006645,0.117085,-0.940205,-0.162942,-0.203771,-0.100435,-0.165138,3,...,0.50161,0.501674,-0.891007,-0.866025,-0.809017,-0.777146,-0.707107,-0.258819,0.707107,1
304565,0.538655,0.698485,-0.109763,0.117085,-0.896163,-0.162942,-0.203771,-0.077795,-0.127354,23,...,0.50161,0.501674,-0.913545,-0.866025,-0.838671,-0.777146,-0.707107,-0.258819,0.669131,1
304566,0.513780,0.722280,-0.109763,0.117085,-0.940205,-0.162942,-0.203771,-0.055154,-0.165138,43,...,0.50161,0.501674,-0.913545,-0.891007,-0.838671,-0.777146,-0.743145,-0.309017,0.669131,1


In [None]:
X_chain_test = X_test.copy()

for i, clf in enumerate(classifiers):
    # Predict the probability of activity for the current room
    predictions = clf.predict_proba(X_chain_test)[:, 1]

    # Create a new column name for the predictions
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain_test
    X_chain_test[new_column_name] = predictions

    # Convert all column names to strings to avoid the TypeError
    X_chain_test.columns = X_chain_test.columns.astype(str)




In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_test[target_cols[0]], X_chain_test['pred_0'])
AUC2 = roc_auc_score(Y_test[target_cols[1]], X_chain_test['pred_1'])
AUC3 = roc_auc_score(Y_test[target_cols[2]], X_chain_test['pred_2'])
AUC4 = roc_auc_score(Y_test[target_cols[3]], X_chain_test['pred_3'])
AUC5 = roc_auc_score(Y_test[target_cols[4]], X_chain_test['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.7963153566272181
AUC score for LivingR = 0.6763176741567446
AUC score for Bedroom = 0.6395634304201536
AUC score for Hallway = 0.7066488844559091
AUC score for Bathroom= 0.6237331767531507


In [None]:
print(f'Mean AUC score on Testing Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Testing Data = 0.6885157044826352


In [None]:
classifiers

[LGBMClassifier(colsample_bynode=0.7369772402241518,
                colsample_bytree=0.8722703653329802, lambda_l1=6.547813371706801,
                lambda_l2=6.8707764962528906, learning_rate=0.06198412809074967,
                max_bin=499, max_depth=7, min_data_in_leaf=39),
 LGBMClassifier(colsample_bynode=0.578446480666374,
                colsample_bytree=0.8916520957785858, lambda_l1=8.065801718205977,
                lambda_l2=3.9265073019954384, learning_rate=0.09408430622725157,
                max_bin=60, max_depth=14, min_data_in_leaf=103),
 LGBMClassifier(colsample_bynode=0.641803207460583,
                colsample_bytree=0.8817664314308815,
                lambda_l1=1.6934489777933779, lambda_l2=7.269856936057833,
                learning_rate=0.03506093115895017, max_bin=349, max_depth=9,
                min_data_in_leaf=184),
 LGBMClassifier(colsample_bynode=0.666483532895658,
                colsample_bytree=0.40898655648040316, lambda_l1=9.29572107629556,
          

## Testing Results from LightGBM Optuna






In [None]:
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score
import numpy as np
import optuna



# Initialize an empty list to store the trained models
LGBMParameters = [lgb.LGBMClassifier(colsample_bynode=0.5679743345459976,
                colsample_bytree=0.6699157715871027, lambda_l1=9.736920901171917,
                lambda_l2=9.340589420463447, learning_rate=0.02192475957865045,
                max_bin=84, max_depth=13, min_data_in_leaf=242),
 lgb.LGBMClassifier(colsample_bynode=0.5218452498818968,
                colsample_bytree=0.8288118102964039, lambda_l1=3.589807097746339,
                lambda_l2=8.61057796503742, learning_rate=0.04131388355266522,
                max_bin=512, max_depth=14, min_data_in_leaf=67),
 lgb.LGBMClassifier(colsample_bynode=0.589868224230489,
                colsample_bytree=0.44550629374414064,
                lambda_l1=2.109285768873487, lambda_l2=6.6149161167086765,
                learning_rate=0.020381173642224748, max_bin=1004, max_depth=14,
                min_data_in_leaf=211),
 lgb.LGBMClassifier(colsample_bynode=0.6097967475299534,
                colsample_bytree=0.48162861247897215,
                lambda_l1=3.996404753838309, lambda_l2=6.827336934610966,
                learning_rate=0.025236007789147835, max_bin=609, max_depth=14,
                min_data_in_leaf=5),
 lgb.LGBMClassifier(colsample_bynode=0.5886978031040463,
                colsample_bytree=0.5381156771169265, lambda_l1=3.000232038891787,
                lambda_l2=3.6422957171828614, learning_rate=0.05494538921788841,
                max_bin=998, max_depth=7, min_data_in_leaf=65)]

classifiers = []
# Initialize the feature set for the chain, starting with the original features
X_chain = X_train.copy()

# Convert Y_train to a numpy array if it's a DataFrame
Y_train_np = Y_train.values if isinstance(Y_train, pd.DataFrame) else Y_train

In [None]:
for i in range(Y_train_np.shape[1]):
    # Train the lightGBM model for the current label
    clf = LGBMParameters[i]
    clf.fit(X_chain, Y_train_np[:, i])

    # Store the trained model
    classifiers.append(clf)

    # Add the predictions as a feature for the next classifier in the chain
    # This is a simplification, in practice you would use cross-validation predictions here

    predictions = clf.predict_proba(X_chain)[:, 1]

    # Create a new column name for the predictions
    # Make sure it's a string to keep consistency
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain
    X_chain[new_column_name] = predictions

[LightGBM] [Info] Number of positive: 22045, number of negative: 221609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.086997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 977
[LightGBM] [Info] Number of data points in the train set: 243654, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090477 -> initscore=-2.307829
[LightGBM] [Info] Start training from score -2.307829
[LightGBM] [Info] Number of positive: 10002, number of negative: 233652
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3427
[LightGBM] [Info] Number of data points in the train set: 243654, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.041050 -> initscore=-3.151048

In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_train[target_cols[0]], X_chain['pred_0'])
AUC2 = roc_auc_score(Y_train[target_cols[1]], X_chain['pred_1'])
AUC3 = roc_auc_score(Y_train[target_cols[2]], X_chain['pred_2'])
AUC4 = roc_auc_score(Y_train[target_cols[3]], X_chain['pred_3'])
AUC5 = roc_auc_score(Y_train[target_cols[4]], X_chain['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.8992389921967441
AUC score for LivingR = 0.8908130168001973
AUC score for Bedroom = 0.9049970602898736
AUC score for Hallway = 0.8944621459577013
AUC score for Bathroom= 0.8915677460960925


In [None]:
print(f'Mean AUC score on Training Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Training Data = 0.8962157922681218


In [None]:
X_chain_test = X_test.copy()

for i, clf in enumerate(classifiers):
    # Predict the probability of activity for the current room
    predictions = clf.predict_proba(X_chain_test)[:, 1]

    # Create a new column name for the predictions
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain_test
    X_chain_test[new_column_name] = predictions

    # Convert all column names to strings to avoid the TypeError
    X_chain_test.columns = X_chain_test.columns.astype(str)




In [None]:
from sklearn.metrics import roc_auc_score
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

AUC1 = roc_auc_score(Y_test[target_cols[0]], X_chain_test['pred_0'])
AUC2 = roc_auc_score(Y_test[target_cols[1]], X_chain_test['pred_1'])
AUC3 = roc_auc_score(Y_test[target_cols[2]], X_chain_test['pred_2'])
AUC4 = roc_auc_score(Y_test[target_cols[3]], X_chain_test['pred_3'])
AUC5 = roc_auc_score(Y_test[target_cols[4]], X_chain_test['pred_4'])

In [None]:
print(f'AUC score for Kitchen = {AUC1}')
print(f'AUC score for LivingR = {AUC2}')
print(f'AUC score for Bedroom = {AUC3}')
print(f'AUC score for Hallway = {AUC4}')
print(f'AUC score for Bathroom= {AUC5}')

AUC score for Kitchen = 0.7842300175584807
AUC score for LivingR = 0.6830681744665812
AUC score for Bedroom = 0.6120852389353435
AUC score for Hallway = 0.7059728845864224
AUC score for Bathroom= 0.6311489304881523


In [None]:
print(f'Mean AUC score on Testing Data = {np.average([AUC1, AUC2, AUC3, AUC4, AUC5])}')

Mean AUC score on Testing Data = 0.683301049206996


# Submission


In [None]:
bestModels = [lgb.LGBMClassifier(colsample_bynode=0.7369772402241518,
                colsample_bytree=0.8722703653329802, lambda_l1=6.547813371706801,
                lambda_l2=6.8707764962528906, learning_rate=0.06198412809074967,
                max_bin=499, max_depth=7, min_data_in_leaf=39),
 lgb.LGBMClassifier(colsample_bynode=0.578446480666374,
                colsample_bytree=0.8916520957785858, lambda_l1=8.065801718205977,
                lambda_l2=3.9265073019954384, learning_rate=0.09408430622725157,
                max_bin=60, max_depth=14, min_data_in_leaf=103),
 lgb.LGBMClassifier(colsample_bynode=0.641803207460583,
                colsample_bytree=0.8817664314308815,
                lambda_l1=1.6934489777933779, lambda_l2=7.269856936057833,
                learning_rate=0.03506093115895017, max_bin=349, max_depth=9,
                min_data_in_leaf=184),
 lgb.LGBMClassifier(colsample_bynode=0.666483532895658,
                colsample_bytree=0.40898655648040316, lambda_l1=9.29572107629556,
                lambda_l2=1.263103832080631, learning_rate=0.06980049342544237,
                max_bin=161, max_depth=12, min_data_in_leaf=75),
 lgb.LGBMClassifier(colsample_bynode=0.8699669814150692,
                colsample_bytree=0.4987011998600954,
                lambda_l1=1.4540618505176355, lambda_l2=6.21152322047457,
                learning_rate=0.032205922867640985, max_bin=398, max_depth=12,
                min_data_in_leaf=159)]


In [None]:
train_set_path = '/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_synth_train.csv'
df_data = pd.read_csv(train_set_path)
df_data

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,2019-11-07 00:00:07,21.26,57.74,109,531,489,705,659,600,122,0,0,0,0,0
1,2019-11-07 00:00:27,21.19,58.03,107,533,494,707,658,603,122,0,0,0,0,0
2,2019-11-07 00:00:47,21.11,57.53,107,530,491,708,656,596,121,1,0,0,0,0
3,2019-11-07 00:01:07,21.12,57.94,106,532,489,710,656,597,126,1,0,0,0,0
4,2019-11-07 00:01:27,21.06,58.18,103,532,490,705,658,597,130,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304563,2020-01-17 03:51:43,22.05,55.20,102,530,441,692,651,583,108,0,0,0,0,0
304564,2020-01-17 03:52:03,21.93,55.13,104,529,441,694,651,583,108,0,0,0,0,0
304565,2020-01-17 03:52:23,22.09,55.12,101,529,443,694,651,584,109,0,0,0,0,0
304566,2020-01-17 03:52:43,22.04,55.22,101,529,441,694,651,585,108,0,0,0,0,0


In [None]:
test_set_path = '/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_synth_test.csv'
df_test_data = pd.read_csv(test_set_path)
df_test_data

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue
0,2020-01-17 03:53:23,21.98,55.12,102,530,444,695,652,586,109
1,2020-01-17 03:53:43,22.08,55.07,102,529,444,696,650,585,107
2,2020-01-17 03:54:03,22.14,55.13,102,528,442,694,651,583,108
3,2020-01-17 03:54:23,22.07,55.21,102,529,442,694,651,585,108
4,2020-01-17 03:54:43,22.03,55.07,101,530,443,694,651,583,108
...,...,...,...,...,...,...,...,...,...,...
64076,2020-01-31 23:58:37,22.85,52.94,073,535,467,691,647,572,129
64077,2020-01-31 23:58:57,22.79,52.99,072,535,467,691,646,572,129
64078,2020-01-31 23:59:17,22.74,53.05,072,535,467,691,646,572,129
64079,2020-01-31 23:59:37,22.71,53.42,073,535,467,691,646,572,128


In [None]:
df = pd.concat([df_data, df_test_data], ignore_index=True)
df

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,2019-11-07 00:00:07,21.26,57.74,109,531,489,705,659,600,122,0.0,0.0,0.0,0.0,0.0
1,2019-11-07 00:00:27,21.19,58.03,107,533,494,707,658,603,122,0.0,0.0,0.0,0.0,0.0
2,2019-11-07 00:00:47,21.11,57.53,107,530,491,708,656,596,121,1.0,0.0,0.0,0.0,0.0
3,2019-11-07 00:01:07,21.12,57.94,106,532,489,710,656,597,126,1.0,0.0,0.0,0.0,0.0
4,2019-11-07 00:01:27,21.06,58.18,103,532,490,705,658,597,130,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368644,2020-01-31 23:58:37,22.85,52.94,073,535,467,691,647,572,129,,,,,
368645,2020-01-31 23:58:57,22.79,52.99,072,535,467,691,646,572,129,,,,,
368646,2020-01-31 23:59:17,22.74,53.05,072,535,467,691,646,572,129,,,,,
368647,2020-01-31 23:59:37,22.71,53.42,073,535,467,691,646,572,128,,,,,


In [None]:
df.to_csv('/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_all.csv')

In [None]:
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']
df = pl.read_csv('/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_all.csv', try_parse_dates=True)
df_target = df.select(target_cols)

schema_data = df.schema
schema_target = df_target.schema

In [None]:
df_features = generate_features(df)

df_features = df_features.to_pandas()
# a little proportion of target values are null

df_features['is_weekend'] = df_features['weekday'].apply(lambda x: 1 if x >= 5 else 0)


In [None]:
df = df_features

In [None]:
df_train = df[:304568]
df_test = df[304568:]
df_test.pop('')

304568    304568
304569    304569
304570    304570
304571    304571
304572    304572
           ...  
368644    368644
368645    368645
368646    368646
368647    368647
368648    368648
Name: , Length: 64081, dtype: int64

In [None]:
df_train.pop('')

0              0
1              1
2              2
3              3
4              4
           ...  
304563    304563
304564    304564
304565    304565
304566    304566
304567    304567
Name: , Length: 304568, dtype: int64

In [None]:
df_test = df_test.drop(columns = target_cols)

In [None]:
df_test.shape

(64081, 96)

In [None]:
columns_to_scale_basic = ["temperature", "humidity", "CO2CosIRValue", "CO2MG811Value",
    "MOX1", "MOX2", "MOX3", "MOX4", "COValue", 'weekday']

columns_to_scale = columns_to_scale_basic.copy()

for sensor_column in columns_to_scale_basic:
  columns_to_scale.append(f"{sensor_column}_lag1")
  columns_to_scale.append(f"{sensor_column}_lag2")
  columns_to_scale.append(f"{sensor_column}_lag3")
  columns_to_scale.append(f"{sensor_column}_lag4")
  columns_to_scale.append(f"{sensor_column}_lag5")
  columns_to_scale.append(f"{sensor_column}_lag6")
  columns_to_scale.append(f"{sensor_column}_lag7")

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
df_train_scaled = df_train.copy()
df_train_scaled[columns_to_scale] = scaler.fit_transform(df_train_scaled[columns_to_scale])

# Transform the test data using the same scaler
df_test_scaled = df_test.copy()
df_test_scaled[columns_to_scale] = scaler.transform(df_test_scaled[columns_to_scale])

# Now, train_data_scaled and test_data_scaled have the specified features scaled
# You can proceed with using these datasets for training and testing your models

In [None]:
# For Classifier Chains we select the following order:
# Kitchen -> Living room -> Bedroom -> Hallway -> Bathroom
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

################### X/Y TRAIN division  ####################
Y_train = df_train_scaled[target_cols]

## Get the feature columns by dropping the target columns from the DataFrame
X_train = df_train_scaled.drop(columns=target_cols, errors='ignore')

################## X/Y TEST division  ######################

X_test = df_test_scaled.drop(columns=target_cols, errors='ignore')

In [None]:
X_train

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,second,...,weekday_lag6,weekday_lag7,cos(minute)_lag1,cos(minute)_lag2,cos(minute)_lag3,cos(minute)_lag4,cos(minute)_lag5,cos(minute)_lag6,cos(minute)_lag7,is_weekend
0,0.104047,1.387942,0.197555,0.096721,0.137521,0.112134,-0.021348,0.257960,0.400707,7,...,,,,,,,,,,0
1,0.069427,1.458345,0.124979,0.154190,0.248181,0.170232,-0.049240,0.325860,0.400707,27,...,,,1.000000,,,,,,,0
2,0.029862,1.336960,0.124979,0.067986,0.181785,0.199281,-0.105024,0.167426,0.361608,47,...,,,1.000000,,,,,,,0
3,0.034808,1.436496,0.088691,0.125455,0.137521,0.257378,-0.105024,0.190060,0.557105,7,...,,,1.000000,,,,,,,0
4,0.005134,1.494761,-0.020174,0.125455,0.159653,0.112134,-0.049240,0.190060,0.713503,27,...,,,0.998630,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304563,0.494753,0.771308,-0.056462,0.067986,-0.924812,-0.265502,-0.244485,-0.126807,-0.146686,43,...,0.50083,0.500881,-0.891007,-0.866025,-0.809017,-0.743145,-0.707107,-0.258819,0.707107,1
304564,0.435405,0.754315,0.016114,0.039252,-0.924812,-0.207404,-0.244485,-0.126807,-0.146686,3,...,0.50083,0.500881,-0.891007,-0.866025,-0.809017,-0.777146,-0.707107,-0.258819,0.707107,1
304565,0.514536,0.751886,-0.092750,0.039252,-0.880548,-0.207404,-0.244485,-0.104174,-0.107586,23,...,0.50083,0.500881,-0.913545,-0.866025,-0.838671,-0.777146,-0.707107,-0.258819,0.669131,1
304566,0.489808,0.776164,-0.092750,0.039252,-0.924812,-0.207404,-0.244485,-0.081541,-0.146686,43,...,0.50083,0.500881,-0.913545,-0.891007,-0.838671,-0.777146,-0.743145,-0.309017,0.669131,1


In [None]:
Y_train_np.shape

(304568, 5)

In [None]:
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

In [None]:
X_chain.shape

(304568, 97)

In [None]:
# Now train the final classifier chain with optimized hyperparameters
classifiers = []
X_chain = X_train.copy()

for i, target in enumerate(target_cols):
    clf = bestModels[i]
    clf.fit(X_chain, Y_train[target])

    # Store the trained model
    classifiers.append(clf)

    # Add the predictions as a feature for the next classifier in the chain
    predictions = clf.predict_proba(X_chain)[:, 1]
    X_chain[f'pred_{target}'] = predictions

# Now you can evaluate your classifier chain as before


[LightGBM] [Info] Number of positive: 26724, number of negative: 277844
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22186
[LightGBM] [Info] Number of data points in the train set: 304568, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087744 -> initscore=-2.341498
[LightGBM] [Info] Start training from score -2.341498
[LightGBM] [Info] Number of positive: 11256, number of negative: 293312
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5202
[LightGBM] [Info] Number of data points in the train set: 304568, number of used features: 97
[LightGBM] [I

In [None]:
target_cols = ['Kitchen','Living room',	'Bedroom',	'Hallway',	'Bathroom']

In [None]:
X_chain_test = X_test.copy()

for i, clf in enumerate(classifiers):
    # Predict the probability of activity for the current room
    predictions = clf.predict_proba(X_chain_test)[:, 1]

    # Create a new column name for the predictions
    new_column_name = f'pred_{target_cols[i]}'

    # Add the predictions as a column to X_chain_test
    X_chain_test[new_column_name] = predictions

    # Convert all column names to strings to avoid the TypeError
    X_chain_test.columns = X_chain_test.columns.astype(str)




In [None]:
X_chain_test[target_cols]

Unnamed: 0,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,second,...,cos(minute)_lag4,cos(minute)_lag5,cos(minute)_lag6,cos(minute)_lag7,is_weekend,pred_Kitchen,pred_Living room,pred_Bedroom,pred_Hallway,pred_Bathroom
304568,0.460133,0.751886,-0.056462,0.067986,-0.858416,-0.178355,-0.216593,-0.058907,-0.107586,23,...,-0.809017,-0.743145,-0.309017,0.629320,1,0.008979,0.007401,0.024671,0.008143,0.033609
304569,0.509590,0.739748,-0.056462,0.039252,-0.858416,-0.149306,-0.272377,-0.081541,-0.185785,43,...,-0.809017,-0.777146,-0.358368,0.629320,1,0.009922,0.007641,0.025117,0.008888,0.033238
304570,0.539263,0.754315,-0.056462,0.010517,-0.902680,-0.207404,-0.244485,-0.126807,-0.146686,3,...,-0.838671,-0.777146,-0.358368,0.629320,1,0.010660,0.008189,0.022398,0.009221,0.035583
304571,0.504644,0.773736,-0.056462,0.039252,-0.902680,-0.207404,-0.244485,-0.081541,-0.146686,23,...,-0.838671,-0.777146,-0.358368,0.587785,1,0.009814,0.007641,0.024242,0.009260,0.032249
304572,0.484862,0.739748,-0.092750,0.067986,-0.880548,-0.207404,-0.244485,-0.126807,-0.146686,43,...,-0.838671,-0.809017,-0.406737,0.587785,1,0.009078,0.007401,0.025406,0.008179,0.032464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368644,0.890405,0.222649,-1.108819,0.211659,-0.349381,-0.294551,-0.356053,-0.375774,0.674404,37,...,-0.933580,-0.891007,-0.544639,0.258819,1,0.025191,0.008203,0.009118,0.007813,0.041657
368645,0.860731,0.234788,-1.145108,0.211659,-0.349381,-0.294551,-0.383945,-0.375774,0.674404,57,...,-0.933580,-0.913545,-0.587785,0.258819,1,0.021166,0.009051,0.008056,0.008703,0.045687
368646,0.836002,0.249354,-1.145108,0.211659,-0.349381,-0.294551,-0.383945,-0.375774,0.674404,17,...,-0.951057,-0.913545,-0.587785,0.258819,1,0.025713,0.007703,0.009492,0.008021,0.044517
368647,0.821165,0.339178,-1.108819,0.211659,-0.349381,-0.294551,-0.383945,-0.375774,0.635304,37,...,-0.951057,-0.913545,-0.587785,0.207912,1,0.027579,0.007916,0.009458,0.008021,0.044517


In [None]:
targets = ['Living room',	'Bedroom',	'Bathroom',	'Kitchen',	'Hallway']
target_pred = []
for i, target in enumerate(targets):
  new_column_name = f'pred_{targets[i]}'
  target_pred.append(new_column_name)
target_pred

['pred_Living room',
 'pred_Bedroom',
 'pred_Bathroom',
 'pred_Kitchen',
 'pred_Hallway']

In [None]:
X_chain_test_pred = X_chain_test[target_pred]
X_chain_test_pred

Unnamed: 0,pred_Living room,pred_Bedroom,pred_Bathroom,pred_Kitchen,pred_Hallway
304568,0.007401,0.024671,0.033609,0.008979,0.008143
304569,0.007641,0.025117,0.033238,0.009922,0.008888
304570,0.008189,0.022398,0.035583,0.010660,0.009221
304571,0.007641,0.024242,0.032249,0.009814,0.009260
304572,0.007401,0.025406,0.032464,0.009078,0.008179
...,...,...,...,...,...
368644,0.008203,0.009118,0.041657,0.025191,0.007813
368645,0.009051,0.008056,0.045687,0.021166,0.008703
368646,0.007703,0.009492,0.044517,0.025713,0.008021
368647,0.007916,0.009458,0.044517,0.027579,0.008021


In [None]:
X_chain_test_pred.rename(columns = {'pred_Living room': 'Living room', 'pred_Bedroom': 'Bedroom',
                                    'pred_Bathroom': 'Bathroom', 'pred_Kitchen': 'Kitchen', 'pred_Hallway':  'Hallway'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chain_test_pred.rename(columns = {'pred_Living room': 'Living room', 'pred_Bedroom': 'Bedroom',


In [None]:
submission_example = '/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_submission_example.csv'
df_test_data = pd.read_csv(submission_example)
df_test_data

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,2020-01-17 03:53:23,21.98,55.12,102,530,444,695,652,586,109,0.775143,0.159306,0.127273,0.318387,0.874428
1,2020-01-17 03:53:43,22.08,55.07,102,529,444,696,650,585,107,0.553822,0.117559,0.502828,0.864512,0.693308
2,2020-01-17 03:54:03,22.14,55.13,102,528,442,694,651,583,108,0.476896,0.219174,0.573121,0.271307,0.122309
3,2020-01-17 03:54:23,22.07,55.21,102,529,442,694,651,585,108,0.435256,0.861889,0.983327,0.823282,0.721541
4,2020-01-17 03:54:43,22.03,55.07,101,530,443,694,651,583,108,0.417547,0.141556,0.152131,0.614859,0.051591
5,2020-01-17 03:55:03,21.96,54.94,101,529,444,694,650,584,108,0.565551,0.343182,0.07805,0.479979,0.690247
6,2020-01-17 03:55:23,21.95,55.02,102,527,442,694,652,583,108,0.122285,0.237852,0.065994,0.001159,0.24754
7,2020-01-17 03:55:43,21.97,55.14,102,530,443,694,651,584,108,0.92678,0.034154,0.004221,0.41528,0.463874
8,2020-01-17 03:56:03,22.03,55.01,102,528,441,695,652,585,108,0.211374,0.235855,0.698105,0.890966,0.904901
9,2020-01-17 03:56:23,21.86,55.08,103,529,443,693,650,584,108,0.133435,0.568274,0.219998,0.0799,0.118083


In [None]:
X_chain_test_pred

Unnamed: 0,level_0,index,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,0,304568,0.007401,0.024671,0.033609,0.008979,0.008143
1,1,304569,0.007641,0.025117,0.033238,0.009922,0.008888
2,2,304570,0.008189,0.022398,0.035583,0.010660,0.009221
3,3,304571,0.007641,0.024242,0.032249,0.009814,0.009260
4,4,304572,0.007401,0.025406,0.032464,0.009078,0.008179
...,...,...,...,...,...,...,...
64076,64076,368644,0.008203,0.009118,0.041657,0.025191,0.007813
64077,64077,368645,0.009051,0.008056,0.045687,0.021166,0.008703
64078,64078,368646,0.007703,0.009492,0.044517,0.025713,0.008021
64079,64079,368647,0.007916,0.009458,0.044517,0.027579,0.008021


In [None]:
X_chain_test_pred = X_chain_test_pred.drop(columns = ['level_0', 'index'])
X_chain_test_pred

Unnamed: 0,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,0.007401,0.024671,0.033609,0.008979,0.008143
1,0.007641,0.025117,0.033238,0.009922,0.008888
2,0.008189,0.022398,0.035583,0.010660,0.009221
3,0.007641,0.024242,0.032249,0.009814,0.009260
4,0.007401,0.025406,0.032464,0.009078,0.008179
...,...,...,...,...,...
64076,0.008203,0.009118,0.041657,0.025191,0.007813
64077,0.009051,0.008056,0.045687,0.021166,0.008703
64078,0.007703,0.009492,0.044517,0.025713,0.008021
64079,0.007916,0.009458,0.044517,0.027579,0.008021


In [None]:
test_set_path = '/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/data_synth_test.csv'
df_test_data = pd.read_csv(test_set_path)
df_test_data

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue
0,2020-01-17 03:53:23,21.98,55.12,102,530,444,695,652,586,109
1,2020-01-17 03:53:43,22.08,55.07,102,529,444,696,650,585,107
2,2020-01-17 03:54:03,22.14,55.13,102,528,442,694,651,583,108
3,2020-01-17 03:54:23,22.07,55.21,102,529,442,694,651,585,108
4,2020-01-17 03:54:43,22.03,55.07,101,530,443,694,651,583,108
...,...,...,...,...,...,...,...,...,...,...
64076,2020-01-31 23:58:37,22.85,52.94,073,535,467,691,647,572,129
64077,2020-01-31 23:58:57,22.79,52.99,072,535,467,691,646,572,129
64078,2020-01-31 23:59:17,22.74,53.05,072,535,467,691,646,572,129
64079,2020-01-31 23:59:37,22.71,53.42,073,535,467,691,646,572,128


In [None]:
final_df = pd.concat([df_test_data, X_chain_test_pred], axis = 1)
final_df

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,2020-01-17 03:53:23,21.98,55.12,102,530,444,695,652,586,109,0.007401,0.024671,0.033609,0.008979,0.008143
1,2020-01-17 03:53:43,22.08,55.07,102,529,444,696,650,585,107,0.007641,0.025117,0.033238,0.009922,0.008888
2,2020-01-17 03:54:03,22.14,55.13,102,528,442,694,651,583,108,0.008189,0.022398,0.035583,0.010660,0.009221
3,2020-01-17 03:54:23,22.07,55.21,102,529,442,694,651,585,108,0.007641,0.024242,0.032249,0.009814,0.009260
4,2020-01-17 03:54:43,22.03,55.07,101,530,443,694,651,583,108,0.007401,0.025406,0.032464,0.009078,0.008179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64076,2020-01-31 23:58:37,22.85,52.94,073,535,467,691,647,572,129,0.008203,0.009118,0.041657,0.025191,0.007813
64077,2020-01-31 23:58:57,22.79,52.99,072,535,467,691,646,572,129,0.009051,0.008056,0.045687,0.021166,0.008703
64078,2020-01-31 23:59:17,22.74,53.05,072,535,467,691,646,572,129,0.007703,0.009492,0.044517,0.025713,0.008021
64079,2020-01-31 23:59:37,22.71,53.42,073,535,467,691,646,572,128,0.007916,0.009458,0.044517,0.027579,0.008021


In [None]:
final_df.reset_index(drop=True, inplace=True)

In [None]:
final_df.to_csv('/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/PixelPioneers.csv', index = False)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Aindo Hackathon Feb 3-4 2024/PixelPioneers.csv')
df

Unnamed: 0,timestamp,temperature,humidity,CO2CosIRValue,CO2MG811Value,MOX1,MOX2,MOX3,MOX4,COValue,Living room,Bedroom,Bathroom,Kitchen,Hallway
0,2020-01-17 03:53:23,21.98,55.12,102,530,444,695,652,586,109,0.007401,0.024671,0.033609,0.008979,0.008143
1,2020-01-17 03:53:43,22.08,55.07,102,529,444,696,650,585,107,0.007641,0.025117,0.033238,0.009922,0.008888
2,2020-01-17 03:54:03,22.14,55.13,102,528,442,694,651,583,108,0.008189,0.022398,0.035583,0.010660,0.009221
3,2020-01-17 03:54:23,22.07,55.21,102,529,442,694,651,585,108,0.007641,0.024242,0.032249,0.009814,0.009260
4,2020-01-17 03:54:43,22.03,55.07,101,530,443,694,651,583,108,0.007401,0.025406,0.032464,0.009078,0.008179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64076,2020-01-31 23:58:37,22.85,52.94,073,535,467,691,647,572,129,0.008203,0.009118,0.041657,0.025191,0.007813
64077,2020-01-31 23:58:57,22.79,52.99,072,535,467,691,646,572,129,0.009051,0.008056,0.045687,0.021166,0.008703
64078,2020-01-31 23:59:17,22.74,53.05,072,535,467,691,646,572,129,0.007703,0.009492,0.044517,0.025713,0.008021
64079,2020-01-31 23:59:37,22.71,53.42,073,535,467,691,646,572,128,0.007916,0.009458,0.044517,0.027579,0.008021


In [None]:
X_chain_test = X_test.copy()

for i, clf in enumerate(classifiers):
    # Predict the probability of activity for the current room
    predictions = clf.predict_proba(X_chain_test)[:, 1]

    # Create a new column name for the predictions
    new_column_name = f'pred_{i}'

    # Add the predictions as a column to X_chain_test
    X_chain_test[new_column_name] = predictions

    # Convert all column names to strings to avoid the TypeError
    X_chain_test.columns = X_chain_test.columns.astype(str)


## dsaf


In [None]:
def lgb_objective_production(trial):
    params = {
        'n_estimators': 300,
        'verbose': -1,
        'random_state': 42,
        'objective': 'binary',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('lambda_l1', 1e-4, 10.0),
        'reg_lambda': trial.suggest_float('lambda_l2', 1e-4, 10.0),
        'min_child_samples': trial.suggest_int('min_data_in_leaf', 4, 256),
        'max_depth': trial.suggest_int('max_depth', 5, 16),
        'max_bin': trial.suggest_int('max_bin', 32, 1024),
    }

    model = lgb.LGBMClassifier(**params)
    X, y = train_data.drop(columns=target_cols), train_data["Living room"]
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Use cross_val_score directly without early stopping rounds
    scores = cross_val_score(model, X, y, cv=tscv, scoring='roc_auc')

    return np.mean(scores)

def optimize_hyperparameters():
    study = optuna.create_study(direction='maximize')
    study.optimize(lgb_objective_production, n_trials=50)  # Adjust the number of trials as needed
    print("Best trial:")
    print(study.best_trial)

In [None]:
optimize_hyperparameters()

# Model

In [None]:
# Let's build separate models for produciton and consumption of electricity
model_parameters = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'colsample_bytree': 0.9,
    'colsample_bynode': 0.6,
    'lambda_l1': 3.5,
    'lambda_l2': 1.5,
    'max_depth': 12
}

# standard technique to use mean predictions from several models with different random_state
model_consumption = VotingRegressor([
    (
        f'consumption_lgb_{i}',
         lgb.LGBMRegressor(**model_parameters, random_state=i)
    ) for i in range(10)
])
model_production = VotingRegressor([
    (
        f'production_lgb_{i}',
         lgb.LGBMRegressor(**model_parameters, random_state=i)
    ) for i in range(10)
])



mask = df_train_features['is_consumption'] == 1
model_consumption.fit(
    X=df_train_features[mask].drop(columns=["target"]),
    y=df_train_features[mask]["target"]
)

mask = df_train_features['is_consumption'] == 0
model_production.fit(
    X=df_train_features[mask].drop(columns=["target"]),
    y=df_train_features[mask]["target"]
)

# Submit API

In [None]:
for (
    test,
    revealed_targets,
    client,
    historical_weather,
    forecast_weather,
    electricity_prices,
    gas_prices,
    sample_prediction
) in iter_test:

    test = test.rename(columns={"prediction_datetime": "datetime"})

    df_test = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    df_client = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_gas_prices = pl.from_pandas(gas_prices[gas_prices_cols], schema_overrides=schema_gas)
    df_electricity_prices = pl.from_pandas(electricity_prices[electricity_prices_cols], schema_overrides=schema_electricity)
    df_new_forecast_weather = pl.from_pandas(forecast_weather[forecast_weather_cols], schema_overrides=schema_forecast)
    df_new_historical_weather = pl.from_pandas(historical_weather[historical_weather_cols], schema_overrides=schema_historical)
    df_new_target = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)

    df_forecast_weather = pl.concat([df_forecast_weather, df_new_forecast_weather]).unique(['forecast_datetime', 'latitude', 'longitude', 'hours_ahead'])
    df_historical_weather = pl.concat([df_historical_weather, df_new_historical_weather]).unique(['datetime', 'latitude', 'longitude'])
    df_target = pl.concat([df_target, df_new_target]).unique(['datetime', 'county', 'is_business', 'product_type', 'is_consumption'])

    df_test_features = generate_features(
        df_test,
        df_client,
        df_gas_prices,
        df_electricity_prices,
        df_forecast_weather,
        df_historical_weather,
        df_weather_station_to_county_mapping,
        df_target
    )
    df_test_features = to_pandas(df_test_features)

    mask = df_test_features['is_consumption'] == 1
    # clip method makes values < 0 equal 0 because our target is nonnegative and models can produce negative values
    sample_prediction.loc[mask.values, "target"] = model_consumption.predict(df_test_features[mask]).clip(0)

    mask = df_test_features['is_consumption'] == 0
    sample_prediction.loc[mask.values, "target"] = model_production.predict(df_test_features[mask]).clip(0)

    # send predictions
    env.predict(sample_prediction)