In [1]:
from data_processing.src.feature_extractor import DataGenerator
from data_processing.src.data_processor import DataProcessor
from paths import PATHS
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import os

# Modelling

This notebook exists in script format as well, ``modelling/train_model.py``

# Loading Data

Processed Disruption Data: This is the disruption data joined with RSSI data to get location information.

RSSI combined data: This is the processed RSSI data, averaged over each day and each position of the train.

In [None]:
# Load the processed disruption data (disruptions joined with RSSI data to get
# location information) from the processed-data CSV.
# `infer_datetime_format` was dropped: it is deprecated since pandas 2.0, where
# passing it has no effect other than emitting a warning.
# NOTE(review): `parse_dates=True` only attempts to parse the index; no
# `index_col` is given here, so date columns probably stay as strings — confirm
# this is intended.
disruption_df = pd.read_csv(
    DataProcessor.gen_proc_file_name("disruption.csv"),
    parse_dates=True,
)

# Ideally this should come from a database, as loading the entirety of this
# data is super slow.
print("Following step takes a lot of time, around 30-35 min, go get a coffee!")
# The training loop further down iterates over range(1, 14), i.e. 13 horizons,
# so the message reports 13 models (it previously claimed 14).
print("13 models into the future are trained")
print("Collecting RSSI historical Data")
rssi_comb_df = DataProcessor.combine_events(save=False)
print("Data collection Done!")

## Data preparation for training and evaluation

The disruptions from `disruption_df` are used to look up the instances in `rssi_comb_df` where the signal values were observed during the `h` days in the past. These values are saved and used to train the disruption models.


(date, PositionNoLeap) -> DataGenerator -> historical values of signals 

### Data generation for training
For a position the signal values in neighbourhood of 20m are combined to remove effects of noise in the measurement. 

**Samples for disruption:**

If the event happens `today` and we want to train a model that predicts disruption 7 days into future then we would train the model from a window of [`today`-7-h, `today`-7] with training label of `disruption`.

**Normal samples:**

Since we also need to train the model with normal behaviour of the signal, we also sample random windows from the data and label them as `NO disruption`.

During training we sample equal proportion of disruptive and non disruptive samples.

### Feature Engineering

We investigated A2_RSSI (averaged over a day and a positional window of ±20 m). We then calculated the mean and standard deviation of this value over the chosen historical window. This choice is based on the assumption that, at a given location, the RSSI value follows a normal distribution with a fixed mean and standard deviation.

We calculated similar values for the quality of signal 1 and signal 2, which is the proportion of valid telegrams received.


We also used the position as one of the features, because each position can have different characteristics.

In [None]:
# Signal columns handed to the DataGenerator for feature extraction.
features = ["A2_RSSI"]

# Fixed seed so that the train/test partition is identical on every
# Restart & Run All; change it to draw a different split.
SPLIT_SEED = 42

# Hold out 20% of the disruption events for evaluation. The split used to be
# executed twice in a row (the first result was immediately overwritten); it is
# now done exactly once.
train_disruptions_df, test_disruptions_df = train_test_split(
    disruption_df, test_size=0.2, random_state=SPLIT_SEED
)

# Model approach

We explored linear ridge classification for predicting disruption for its simplicity and GradientBoosting models for their fast inference and superior performance. In our experiments GradientBoosting performed better than a linear model.
 
 
The features with the most predictive power were the mean and standard deviation of A2_RSSI; the other features did not improve the validation performance.
 
We train 13 models for predicting disruption into the future, one model for each day into the future, from 1 to 13 days ahead.

In [None]:
# Train one LightGBM classifier per prediction horizon `d` (days into the
# future) and record its accuracy on freshly generated train and test samples.
models = []  # fitted classifiers, in order of increasing horizon
scores = []  # one [test_accuracy, train_accuracy] pair per horizon
for d in range(1, 14):
    # Generate training samples from the training disruptions.
    train_data = DataGenerator(train_disruptions_df, rssi_comb_df, features)
    train_x, train_y = train_data.generate_samples(
        num_samples=300, prediction_days=d, history=20
    )
    # Generate evaluation samples from the held-out disruptions.
    test_data = DataGenerator(test_disruptions_df, rssi_comb_df, features)
    test_x, test_y = test_data.generate_samples(
        num_samples=50, prediction_days=d, history=20
    )

    model = lgb.LGBMClassifier()
    model.fit(X=train_x, y=train_y)

    # Evaluate each dataset once and reuse the values for both the log lines
    # and the `scores` list (previously every score was computed twice).
    train_score = model.score(train_x, train_y)
    test_score = model.score(test_x, test_y)
    print(f"Train accuracy score for {d} days into future  {train_score}")
    print(f"Test accuracy score for {d} days into future {test_score}")
    scores.append([test_score, train_score])

    # Optionally, persist the trained booster to disk:
    # model.booster_.save_model(
    #     os.path.join(PATHS.data, "model", f"lgb_model_d{d}.txt")
    # )
    models.append(model)

print(scores)