In [1]:
import cProfile

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from timefiller import TimeSeriesImputer

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Read the PEMS-BAY table from its HDF5 file, then preview three rows drawn at random.
df = pd.read_hdf("../datasets/pems_bay/pems_bay.h5")
df.sample(n=3)

sensor_id,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
2017-01-18 06:00:00,64.099998,64.400002,63.900002,59.400002,61.799999,62.099998,65.400002,66.199997,65.400002,66.400002,...,63.900002,59.200001,65.199997,60.700001,66.099998,42.299999,68.800003,63.200001,66.199997,64.400002
2017-03-11 18:40:00,68.599998,63.900002,49.900002,68.599998,35.599998,68.099998,56.5,66.400002,66.300003,63.799999,...,70.099998,68.599998,66.800003,62.299999,67.599998,59.599998,68.699997,64.400002,66.800003,65.0
2017-06-04 09:50:00,70.0,64.900002,63.900002,59.299999,70.199997,66.400002,64.699997,67.900002,61.799999,64.699997,...,64.300003,67.5,63.5,60.700001,68.400002,59.0,62.400002,64.699997,69.800003,66.800003


In [3]:
# Summarise the frame: index range and frequency, number of columns, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52128 entries, 2017-01-01 00:00:00 to 2017-06-30 23:55:00
Freq: 5T
Columns: 325 entries, 400001 to 414694
dtypes: float32(325)
memory usage: 65.0 MB


In [4]:
def mask_at_random(df, ratio, random_state=None):
    """Return a copy of ``df`` in which a random fraction of the cells is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to mask. It is never modified; the masking is done on a copy
        of its values.
    ratio : float
        Fraction of all cells to replace with NaN, between 0 and 1 inclusive.
        Exactly ``int(df.size * ratio)`` distinct cells are masked (the
        product is truncated towards zero).
    random_state : int or None, optional
        Seed used to draw the masked cells. When ``None`` (the default) the
        global NumPy random state is used, so ``np.random.seed`` still
        controls the result exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("Le ratio doit être compris entre 0 et 1.")

    values = df.to_numpy(copy=True)
    # Integer and boolean arrays cannot hold NaN: assigning it either raises
    # (integers) or is silently coerced to True (booleans). Upcast those to
    # float; floating-point input keeps its original precision.
    if values.dtype.kind in "iub":
        values = values.astype(float)

    n_cells = values.size
    n_masked = int(n_cells * ratio)

    # Draw distinct flat positions so that exactly n_masked cells are hit.
    if random_state is None:
        flat_positions = np.random.choice(n_cells, n_masked, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        flat_positions = rng.choice(n_cells, n_masked, replace=False)

    rows, cols = np.unravel_index(flat_positions, values.shape)
    values[rows, cols] = np.nan
    return pd.DataFrame(values, index=df.index, columns=df.columns)

In [5]:
# Seed the global NumPy RNG so the same cells are masked on every
# Restart & Run All, which keeps the profiling input below comparable between runs.
np.random.seed(42)

# Mask a small random fraction of the cells, then show the share of
# missing values per column, smallest first.
df_to_impute = mask_at_random(df, ratio=0.0005)
df_to_impute.isnull().mean().sort_values()

sensor_id
407331    0.000441
403414    0.000460
414284    0.000480
400911    0.000480
404370    0.000499
            ...   
400149    0.000940
401996    0.000959
401958    0.001036
407373    0.001036
407150    0.001036
Length: 325, dtype: float64

In [6]:
# Total number of missing cells in the masked frame (summed over rows, then over columns).
df_to_impute.isnull().sum().sum()

12368

In [7]:
# Profile one complete imputation run; building the imputer happens inside
# the profiled region as well, and the imputed result is discarded.
with cProfile.Profile() as profil:
    TimeSeriesImputer(
        lags=(1, 2, 3, 4, 5, 6, 12),
        estimator=LinearRegression(positive=True),
    )(df_to_impute)

# Report the collected statistics ordered by cumulative time.
profil.print_stats(sort="cumtime")

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 25] [3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 337, 336, 336, 337, 338]


  0%|          | 0/17 [00:00<?, ?it/s]

	optimask: 0.050s - (52087, 3)
	optimask: 0.059s - (52087, 4)
	optimask: 0.067s - (52087, 5)
	optimask: 0.052s - (52087, 6)
	optimask: 0.053s - (52087, 7)
	optimask: 0.054s - (52087, 8)
	optimask: 0.046s - (52087, 3)
	optimask: 0.049s - (52087, 4)
	optimask: 0.048s - (52087, 5)
	optimask: 0.049s - (52087, 6)
	optimask: 0.051s - (52087, 7)
	optimask: 0.053s - (52087, 8)
	optimask: 0.665s - (52087, 337)
	optimask: 0.607s - (52087, 336)
	optimask: 0.638s - (52087, 336)
	optimask: 0.650s - (52087, 337)
	optimask: 0.626s - (52087, 338)
         78277 function calls (76550 primitive calls) in 11.809 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   11.809   11.809 _time_series_imputer.py:40(__call__)
        1    0.021    0.021   11.714   11.714 _multivariate_imputer.py:107(__call__)
        1    0.116    0.116   11.693   11.693 _multivariate_imputer.py:86(_impute)
       17    4.197    0.247    6.75