In [1]:
cd ../..

/home/msi/projects/diplomka


In [2]:
import numpy as np 

In [3]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import pathlib
from datetime import datetime
import glob
import matplotlib.pyplot as plt

In [4]:
# Identifier of the detector being benchmarked; used as the name of the
# anomaly-flag column and in the stats column / file names further down.
model_name = 'isolation_forest'

In [5]:
# Root directory of the benchmark results (relative to the current working directory).
benchmark_dir = pathlib.Path('results/benchmark/')

# CSV files to process. glob returns matches in arbitrary order, so sort them
# to keep the processing order and the stats table rows reproducible across runs.
dataset = sorted(glob.glob("results/benchmark/RTT/*.csv"))

In [6]:
def load_df(path):
    """Read a CSV file into a DataFrame indexed by a sorted, unique timestamp.

    The timestamp is parsed from the 'timestamp' column when the file has one,
    otherwise from the 'index' column. Rows are ordered by timestamp and, for
    repeated timestamps, only the first row is kept.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'timestamp'.
    """
    frame = pd.read_csv(path)

    # Fall back to the 'index' column for files without a 'timestamp' column.
    source_column = 'timestamp' if 'timestamp' in frame.columns else 'index'
    frame['timestamp'] = pd.to_datetime(frame[source_column])

    frame = frame.set_index('timestamp').sort_index()
    is_repeat = frame.index.duplicated(keep='first')
    return frame[~is_repeat]

In [7]:
def process(df):
    """Flag anomalous 'rtt' samples with an Isolation Forest.

    A fresh model is fitted on the single 'rtt' column of ``df`` and every row
    is labelled in the same pass. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'rtt' column.

    Returns
    -------
    pandas.Series
        Boolean series named ``model_name`` on the index of ``df``; True marks
        rows the model labelled as outliers.
    """
    model = IsolationForest(
        n_estimators=100,
        max_samples='auto',
        contamination=0.01,
        max_features=1.0,
        bootstrap=False,
        n_jobs=-1,
        random_state=42,  # fixed seed so repeated runs give the same flags
        verbose=0,
    )
    labels = model.fit_predict(df[['rtt']])
    # fit_predict labels outliers as -1 and inliers as 1.
    # Build a standalone Series instead of writing a new column into the
    # caller's frame, so calling this function has no hidden side effect.
    return pd.Series(labels == -1, index=df.index, name=model_name)

In [8]:
# Timing accumulators filled by the benchmark loop, one entry per input file:
# elapsed seconds of the processing step ...
total_times = []
# ... and the same duration divided by the number of rows in the file.
per_sample_times = []

In [10]:
import numpy as np

In [11]:
# Paths of input files for which processing raised RuntimeError in the benchmark loop.
unprocessed = []

In [12]:
# Benchmark loop: run the detector on every input file, record how long it
# took, and store the resulting flag column next to the file's other columns.

# Start from empty accumulators so that re-running this cell does not append
# a second set of entries (which would no longer line up with `dataset`).
total_times.clear()
per_sample_times.clear()
unprocessed.clear()

for path in dataset:
    df = load_df(path)

    start = datetime.now()
    try:
        current = process(df)
    except RuntimeError:
        unprocessed.append(path)
        # All-NaN placeholder on the same index; built without reading any
        # data column so the fallback cannot fail on a missing column.
        current = pd.Series(np.nan, index=df.index, name=model_name)
    end = datetime.now()

    duration = end - start
    total_times.append(duration.total_seconds())
    # Guard against an empty file to avoid a ZeroDivisionError.
    per_sample_times.append(duration.total_seconds() / len(df) if len(df) else np.nan)

    # Location of the file relative to the benchmark root (e.g. RTT/<name>.csv).
    file_path = pathlib.Path(path).relative_to(benchmark_dir)
    file_dir = benchmark_dir / file_path.parent
    file_dir.mkdir(parents=True, exist_ok=True)

    out_path = file_dir / file_path.name

    current = current.rename_axis('timestamp')
    if out_path.exists():
        existing = load_df(out_path)
        # Drop a column left by a previous run of this model so the merged
        # file never ends up with duplicate `model_name` columns.
        existing = existing.drop(columns=[model_name], errors='ignore')
        current = pd.concat([existing, current], axis=1)

    current.to_csv(out_path)




In [13]:
# Bare file names (directories stripped) of the input files, in `dataset` order.
names = [pathlib.Path(csv_path).name for csv_path in dataset]

In [15]:
# Timing table: one row per input file, indexed by file name.
stats_df = pd.DataFrame(
    {
        'time': total_times,
        f'per_sample_{model_name}': per_sample_times,
    },
    index=names,
)

In [16]:
# Write the timing table into the benchmark directory.
stats_path = benchmark_dir / f'stats_{model_name}.csv'
stats_df.to_csv(stats_path)

In [14]:
# Show the files whose processing raised RuntimeError; an empty list means none did.
unprocessed

[]