<h1>Anomaly detection in time series using logistic regression</h1>

In [1]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('../data/resurfacing_marking_data.csv')

# Dataset and DataLoader

In [27]:
class TimeSeriesDataset:
    def __init__(self, path, seq_length, step_size, split='train', train_size=0.7, val_size=0.15, transform=None):
        self.data = pd.read_csv(path)
        self.seq_length = seq_length
        self.step_size = step_size
        self.transform = transform

        # split train, validation and test sampling 
        n = len(self.data)
        train_end = int(n * train_size)
        val_end = train_end + int(n * val_size)

        if split == 'train':
            self.data = self.data[:train_end]
        elif split == 'val':
            self.data = self.data[train_end:val_end]
        elif split == 'test':
            self.data = self.data[val_end:]
        else:
            raise ValueError('Split must be "train", "val" or "test"')
        

        self.data['time'] = pd.to_datetime(self.data['time'])

        # component minute
        self.data['minite_sin'] = np.sin(2 * np.pi * self.data['time'].dt.minute / 60)
        self.data['minite_cos'] = np.cos(2 * np.pi * self.data['time'].dt.minute / 60)

        # component hour
        self.data['hour_sin'] = np.sin(2 * np.pi * self.data['time'].dt.hour / 24)
        self.data['hour_cos'] = np.cos(2 * np.pi * self.data['time'].dt.hour / 24)

        # component day_of_week
        self.data['dow_sin'] = np.sin(2 * np.pi * self.data['time'].dt.day_of_week / 7)
        self.data['dow_cos'] = np.cos(2 * np.pi * self.data['time'].dt.day_of_week / 7)
    

    def __len__(self):
        return len(self.data) - self.seq_length - self.step_size + 1
    

    def __getitem__(self, idx):
        X = self.data.iloc[idx: idx + self.seq_length][[col for col in self.data.columns if col != 'time']].values
        y = self.data.iloc[idx + self.seq_length + self.step_size - 1]['flood']

        if self.transform:
            X = self.transform(X)

        return np.array(X, dtype=np.float32), np.array(y, dtype=np.int8)

In [28]:
path = '../data/resurfacing_marking_data.csv'
seq_length = 60
step_size = 30


train_dataset = TimeSeriesDataset(path, seq_length, step_size, split='train')
val_dataset = TimeSeriesDataset(path, seq_length, step_size, split='val')
test_dataset = TimeSeriesDataset(path, seq_length, step_size, split='test')

In [78]:
class DataLoader:
    def __init__(self, dataset, batch_size=32, shuffle=False, drop_last=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.suffle = shuffle
        self.indices = np.arange(len(dataset))
        self.drop_last = drop_last
    
    
    def __iter__(self):
        if self.suffle:
            np.random.shuffle(self.indices)
        self.current_idx = 0
        return self
    
    def __next__(self):
        if self.current_idx >= len(self.dataset):
            raise StopIteration
        
        end_idx = self.current_idx + self.batch_size
        if end_idx > len(self.dataset):
            if self.drop_last:
                raise StopIteration
            else:
                end_idx = len(self.dataset)

        batch_indices = self.indices[self.current_idx:self.current_idx + self.batch_size]
        batch = [self.dataset[idx] for idx in batch_indices]
        self.current_idx += self.batch_size
        X, y = zip(*batch)

        return np.array(X), np.array(y)

In [79]:
batch_size = 1440

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

# Train model

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [82]:
model = LogisticRegression(max_iter=1000)

def train_and_evaluate(train_loader, val_loader, model):
    for X_batch, y_batch in train_loader:
        X = X_batch.reshape(batch_size, -1)
        y = y_batch.ravel()
        model.fit(X, y)
    
    y_pred, y_true = [], []
    for X_val, y_val in val_loader:
        X = X_val.reshape(batch_size, -1)
        y = y_val.ravel()
        y_pred.extend(model.predict(X))
        y_true.extend(y)
    
    # Вывод результатов
    print("Validation accuracy:", accuracy_score(y_true, y_pred))
    print("Classification report:\n", classification_report(y_true, y_pred))
    

train_and_evaluate(train_loader, val_loader, model)

Validation accuracy: 0.9932291666666667
Classification report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      5722
           1       0.00      0.00      0.00        38

    accuracy                           0.99      5760
   macro avg       0.50      0.50      0.50      5760
weighted avg       0.99      0.99      0.99      5760



In [85]:
train_dataset.data.flood.value_counts()

flood
0    27800
1      200
Name: count, dtype: int64