In [None]:
from typing import Any

import pandas as pd  
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import pmdarima as pm

In [None]:
def read_data(path: str, nskip: int = 0, sep: str = ';') -> pd.DataFrame:
    """
    Load a configurator signal export into a DataFrame.

    path: Path to the csv file with signals exported from the configurator
    nskip: Number of lines at the top of the file to skip
    sep: Field separator used in the csv file
    """
    return pd.read_csv(path, sep=sep, skiprows=nskip)
def preprocess_data(df: pd.DataFrame, accident_date: str = '') -> tuple[pd.DataFrame, pd.Series]:
    """
    Turn a raw configurator export into a numeric signal table and a timestamp series.

    The frame is indexed as follows: column 0 holds timestamps formatted as
    ``%d.%m.%Y %H:%M:%S`` and every second column starting at column 1 holds
    signal values written with a decimal comma. The CSV header row plus the two
    rows below it are joined with spaces to form each signal's name.

    df: Frame read from the configurator file.
    accident_date: Date of the accident (or another point of interest) at which
        the data should be cut. Cutting is not implemented yet; leave it empty
        ('' or None) to keep the whole data set.

    Returns ``(X, y)``: ``X`` has one numeric column per signal and ``y`` is the
    matching series of timestamps named ``'date'``. Both use a fresh 0..n-1 index.

    Raises NotImplementedError when a non-empty ``accident_date`` is given.
    """
    # Treat both '' and None as "no cut requested" (the documented way to opt out).
    if accident_date:
        raise NotImplementedError('Cutting the data at accident_date is not implemented yet')

    df = df.ffill()
    # The CSV header line is part of the signal description, so put it back as row 0.
    header_row = pd.DataFrame([df.columns], columns=df.columns)
    df = pd.concat([header_row, df], axis=0).reset_index(drop=True)

    # Rows whose first cell does not parse as a timestamp are not samples.
    parsed_dates = pd.to_datetime(df.iloc[:, 0], errors='coerce', format='%d.%m.%Y %H:%M:%S')
    is_sample = parsed_dates.notna().to_numpy()
    date_column = parsed_dates[is_sample].reset_index(drop=True).rename('date')

    # Signal name = the first three rows of every signal column, space separated.
    name_parts = df.iloc[0:3, 1::2].to_numpy().astype(str)
    cols = [
        name_parts[0, i] + ' ' + name_parts[1, i] + ' ' + name_parts[2, i]
        for i in range(name_parts.shape[1])
    ]

    # Select values by the same mask as the dates so the two can never drift apart,
    # even if an unparseable timestamp sits in the middle of the file.
    signal_values = df.iloc[is_sample, 1::2].reset_index(drop=True)
    # Cast to str first: cells that are already numeric would otherwise become NaN
    # when passed through the .str accessor.
    signal_values = signal_values.apply(
        lambda column: pd.to_numeric(column.astype(str).str.replace(',', '.'), errors='coerce')
    )
    signal_values.columns = cols

    return signal_values, date_column

def plot_lineplot(x: pd.Series, y: pd.Series, name: str, ax: Any, figsize = (1200, 1200)) -> None:
    """
    Draw ``y`` as a line on ``ax`` and title the axes with ``name``.

    x: Series of time stamps (or a 1..n / 0..n-1 sequence) with len(x) == len(y).
        Only ``x.index`` is used for the horizontal axis, not the stored values.
    y: Value at the i-th moment of time
    name: Title put on the axes
    ax: Axes object to draw on
    figsize: Not used; the figure is created by the caller. Kept so existing
        calls that pass it keep working.
    """
    sns.lineplot(x=x.index, y=y, ax=ax)
    # Clear the labels on the axes we were handed. plt.xlabel/plt.ylabel act on
    # pyplot's "current" axes, which is not necessarily ``ax``.
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.set_title(name)

In [None]:
# Load the raw export (the first 8 lines of the file are skipped) and split it
# into the numeric signal frame and the timestamp series.
df = read_data('../data/raw/month_csv_acceleration.csv', nskip=8, sep=';')
df, date = preprocess_data(df)
# plot_lineplot(date, df[df.columns[0]], df.columns[0], interactive=True)
# Drop every row holding a NaN or an infinite value. Signals and dates are
# concatenated first so that both lose exactly the same rows.
tmp = pd.concat([df, date], axis=1)
tmp = tmp.replace([np.inf, -np.inf], np.nan)
tmp = tmp.dropna().reset_index(drop=True)
df = tmp.drop('date', axis=1)
date = tmp['date']
# Overlay all signals on a single axes.
fig, ax = plt.subplots(1, 1)
for i in range(df.shape[1]):
    plot_lineplot(date, (df[df.columns[i]]), ax=ax, name='')
df

In [None]:
# (rows, columns) of the cleaned signal frame.
df.shape

In [None]:
# Per-column flag: True where a column still contains at least one NaN.
df.isna().any()

In [None]:
from scipy.optimize import curve_fit
def poly(x, a, b):
    """
    Exponential decay model ``(1 / a) * exp(-b * x)`` used as the curve_fit target.

    Despite its name this is not a polynomial.

    x: Scalar or array of sample positions
    a: Reciprocal of the amplitude at x == 0
    b: Decay rate
    """
    amplitude = 1 / a
    decay = np.exp(-b * x)
    return amplitude * decay

# date, 1 / df[df.columns[i]]
# Fit `poly` to the reciprocal of (first signal + 1) against the sample number.
x = np.arange(0, date.shape[0])
y = 1 / (df[df.columns[0]] + 1)
# NOTE(review): y2 is not read anywhere else in the visible notebook.
y2 = df[df.columns[0]]

# popt holds the fitted (a, b); pcov is their estimated covariance.
popt, pcov = curve_fit(poly, x, y)

In [None]:
# Compare the transformed signal with the fitted curve.
plt.plot(x, y)
plt.plot(poly(x, *popt))

In [None]:
# Show the cleaned signal frame.
df

In [None]:
# Compute the correlation matrix once and reuse it for the data and the mask.
corr = df.corr()
# Hide the upper triangle (diagonal included) and every pair with |r| < 0.5.
hidden = np.triu(np.ones_like(corr, dtype=bool)) | (np.abs(corr) < .5)
sns.heatmap(corr,
            mask=hidden,
            annot=True)

Декомпозиция

In [None]:
# Reciprocal of the first signal, re-indexed by sample number 0..len(df)-1.
pd.Series((1 / df[df.columns[0]]).values, index=np.arange(0, len(df)))

In [None]:
# q = pd.Series((1 / df[df.columns[0]]).values, index=np.arange(0, len(df)))
# plt.rc("figure", figsize=(16,8))
# res = seasonal_decompose(q, model='additive', period = int(len(df) / 2))
# res.plot().suptitle('Additive Decompose')
# plt.xticks(rotation=90)
# plt.show()

In [None]:
# plt.plot(res.seasonal)

In [None]:
# trace = go.Scatter(x=np.arange(len(date)), y=res.seasonal, mode='lines', name='Seasonality')
# layout = go.Layout(title='Seasonality', xaxis=dict(title='Date'), yaxis=dict(), width=1200, height=1200)
# figure = go.Figure(data=[trace], layout=layout)
# del trace, layout
# figure.show()

6045 - 881

In [None]:
# # 6045 - 881
# trace = go.Scatter(x=date, y=res.seasonal, mode='lines', name='Seasonality')
# layout = go.Layout(title='Seasonality', xaxis=dict(title='Date'), yaxis=dict(), width=1200, height=1200)
# figure = go.Figure(data=[trace], layout=layout)
# del trace, layout
# figure.show()

In [None]:
class Component:
    """Plain container bundling a data array with a direction and an index label."""

    def __init__(self,
                 data: np.ndarray,
                 direction: str,
                 idx: 'str' ) -> None:
        # data: values of one signal; direction / idx: labels attached by the caller.
        self.data, self.direction, self.idx = data, direction, idx

    def __repr__(self):
        # Render as "data: ..., direction: ..., idx: ...".
        fields = (('data', self.data), ('direction', self.direction), ('idx', self.idx))
        return ', '.join(f'{label}: {value}' for label, value in fields)

In [None]:
def split(names: list[str]) -> dict[str, list[tuple[str, str]]]:
    """
    Group signal names by acronym.

    Every name is split on whitespace. The second token without its last
    character is the acronym, that last character is the direction, and the
    third token is the index.

    names: Signal names, each with at least three whitespace-separated tokens.

    Returns a dict mapping each acronym to its ``(direction, idx)`` pairs, in
    the order the names were given.

    Raises IndexError if a name has fewer than three tokens.
    """
    name_groups: dict[str, list[tuple[str, str]]] = {}
    for tokens in [name.split() for name in names]:
        signal, idx = tokens[1], tokens[2]
        acronym, direction = signal[:-1], signal[-1]
        name_groups.setdefault(acronym, []).append((direction, idx))
    return name_groups

In [None]:
# Acronym -> [(direction, idx), ...] built from the signal column names.
dataset = split(df.columns.to_list())

In [None]:
# Show the acronym groups.
dataset

Разбиваем по компонентам.

In [None]:
# Bucket every signal column by the last letter of its acronym.
component_collection: dict[str, list[Component]] = dict()
for k in dataset:
    for c in df.columns:
        # Parse the column's own name (same token layout as in `split`) instead of
        # reusing the first (direction, idx) of the group, so that every Component
        # carries the labels of its own column. The acronym is compared exactly:
        # a substring test would also pick up columns of other acronyms.
        tokens = c.split()
        if len(tokens) < 3 or tokens[1][:-1] != k:
            continue
        component_collection.setdefault(k[-1], []).append(
            Component(df[c].to_numpy(), tokens[1][-1], tokens[2])
        )
component_collection

Если имена сигналов совпадают во всём, кроме последней буквы, то направление и индекс определяются неверно: для всех столбцов группы берутся значения первого элемента. Это нужно исправить. После этого можно будет обучать модели.

In [None]:
# Show the acronym groups again for comparison with the column names.
dataset

In [None]:
# Name of the last column; pop() acts on a temporary list, so df is not modified.
df.columns.tolist().pop()

In [None]:
# Bucket every signal column by the last letter of its acronym, printing the
# position and name of each column as it is assigned.
component_collection: dict[str, list[Component]] = dict()
for k in dataset:
    for i, c in enumerate(df.columns):
        # Parse the column's own name (same token layout as in `split`) instead of
        # reusing the first (direction, idx) of the group, so that every Component
        # carries the labels of its own column. The acronym is compared exactly:
        # a substring test would also pick up columns of other acronyms.
        tokens = c.split()
        if len(tokens) < 3 or tokens[1][:-1] != k:
            continue
        component_collection.setdefault(k[-1], []).append(
            Component(df[c].to_numpy(), tokens[1][-1], tokens[2])
        )
        print(i, c)
component_collection

In [None]:
# Stack each group's signals side by side: one row per sample, one column per
# component. np.column_stack builds the array in one go and keeps it 2-D even
# for a group with a single component, so column indexing such as d_arr[:, 0]
# is always valid.
d_arr = np.column_stack([elem.data for elem in component_collection['Д']])
n_arr = np.column_stack([elem.data for elem in component_collection['Н']])

d_arr, n_arr

In [None]:
# Target series: the first column of d_arr; X is simply the sample number.
# Note that this rebinds `y`, which an earlier cell used for the curve fit.
X = np.arange(0, d_arr[:, 0].shape[0], 1)
y = d_arr[:, 0]

Split data

In [None]:
# Chronological 80/20 split: no shuffling, the test part is the tail of the series.
pos =  int(0.8 * X.shape[0])
X_train, X_test, y_train, y_test =  X[:pos], X[pos:], y[:pos], y[pos:]
# Check that the two parts together cover the whole series
assert ((X_train.shape[0] + X_test.shape[0]) == X.shape[0])
assert ((y_train.shape[0] + y_test.shape[0]) == y.shape[0])

In [None]:
# plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})


# # Original Series
# fig, axes = plt.subplots(1, 3, sharex=True)
# # axes[0, 0].plot(y_train); axes[0, 0].set_title('Original Series')
# plot_acf(y_train, ax=axes[0], lags=np.arange(0, 20, 1))

# # 1st Differencing
# # axes[1, 0].plot(pd.Series(y_train).diff()); axes[1, 0].set_title('1st Order Differencing')
# plot_acf(pd.Series(y_train).diff().dropna(), ax=axes[1], lags=np.arange(0, 20, 1))

# # 2nd Differencing
# # axes[2, 0].plot(pd.Series(X_train).diff().diff()); axes[2, 0].set_title('2nd Order Differencing')
# plot_acf(pd.Series(y_train).diff().diff().dropna(), ax=axes[2], lags=np.arange(0, 20, 1))

# plt.show()
# # model = ARIMA()

In [None]:
# plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})


# # Original Series
# fig, axes = plt.subplots(1, 3, sharex=True)
# # axes[0, 0].plot(y_train); axes[0, 0].set_title('Original Series')
# plot_pacf(y_train, ax=axes[0], lags=np.arange(0, 20, 1))

# # 1st Differencing
# # axes[1, 0].plot(pd.Series(y_train).diff()); axes[1, 0].set_title('1st Order Differencing')
# plot_pacf(pd.Series(y_train).diff().dropna(), ax=axes[1], lags=np.arange(0, 20, 1))

# # 2nd Differencing
# # axes[2, 0].plot(pd.Series(X_train).diff().diff()); axes[2, 0].set_title('2nd Order Differencing')
# plot_pacf(pd.Series(y_train).diff().diff().dropna(), ax=axes[2], lags=np.arange(0, 20, 1))

# plt.show()
# # model = ARIMA()

In [None]:
# Stationarity tests on the full series, both with constant + trend ('ct'):
# ADF with a fixed lag count of 50 (automatic lag selection disabled) and KPSS with 50 lags.
print(adfuller(y, maxlag=50, regression='ct', autolag=None))
print(kpss(y, regression='ct', nlags=50))

In [None]:
# Size, dtype and non-null count of the training series.
pd.DataFrame(y_train).info()

P должно быть не больше 4
D = 2 (или 1)
Q = 3 (либо 2)

In [None]:
print('Start fit ARIMA')
model = ARIMA(y_train, order=(2, 1, 2))  # ARIMA with p=2, d=1, q=2
r = model.fit()
print(r.summary())

print('Start forecast')
# Never forecast further than the hold-out part reaches; otherwise the
# "actual" line below would have fewer points than its x positions.
horizon = min(1000, len(y_test))
forecast_index = np.arange(pos, pos + horizon, 1)
fc = r.forecast(horizon)

# np.asarray drops any index the forecast may carry so the values are placed
# positionally on forecast_index.
fc_series = pd.Series(np.asarray(fc), index=forecast_index)

print('Start plotting')
plt.figure(figsize=(12,5), dpi=100)
plt.plot(forecast_index, y_test[:horizon], label='actual')
plt.plot(fc_series, label='forecast')

plt.title('Forecast vs Actuals')
plt.legend(loc='lower left', fontsize=8)
plt.show()

### FIT AUTO.ARIMA

In [None]:
# Additive decomposition of the last 1000 training points; the period is set to
# half the length of that window.
seasonal_decompose(y_train[-1000:], model='add',  period = int(len(y_train[-1000:])/ 2)).plot()

In [None]:
# Search for a seasonal ARIMA order on the last 1000 training points.
# stepwise=False with random=True requests a random search limited to n_fits=50
# candidate models; random_state fixes the seed.
# NOTE(review): m=500 is the seasonal period passed to the search — confirm it
# matches the real seasonality of the signal.
model = pm.auto_arima(y_train[-1000:], start_p=1, start_q=1, max_p=5, max_q=5,
                      start_P=0, 
                      trace=True,
                      n_jobs=8,  # depends on cpu
                      error_action='ignore',
                      suppress_warnings=True,
                      seasonal=True,
                      m=500,
                      stepwise=False, random=True, random_state=42,
                      n_fits=50)

model.summary()

In [None]:
# Forecast the 100 steps that follow the fitted window.
pred = model.predict(n_periods=100)

In [None]:
# Last 100 training points (x = 0..99), their mean, and the 100-step forecast
# drawn right after them (x = 100..199).
plt.plot(y_train[-100:], label='training')
plt.axhline(y_train[-100:].mean(), c='r', label='training mean')
# `pred` is the model output, so it is labelled as the forecast, not as actual data.
plt.plot(np.arange(100, 200, 1), pred, label='forecast')
plt.legend()

In [None]:
# Show the timestamp series.
date

In [None]:
# Walk-forward validation: refit on the latest 50 observations, predict one step
# ahead, then reveal the true value before moving on.
history = list(y_train[-50:])
predictions = []

# Start at 0 so that y_test[0] also enters the history. The forecast made in
# iteration i is produced before y_test[i] is seen, so predictions[i] pairs
# with y_test[i].
for i in range(len(y_test[:10000])):
    model = ARIMA(history[-50:], order=(0, 2, 2)) # holt's smoothing
    model = model.fit()

    yhat = model.forecast()[0]
    predictions.append(yhat)

    obs = y_test[i]
    history.append(obs)

# f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
# sns.lineplot(y_test, ax=ax)
# sns.lineplot(predictions, ax=ax)

In [None]:
# Overlay the held-out values and the one-step predictions on the same axes.
sns.lineplot(y_test[:10000])
sns.lineplot(predictions)

In [None]:
# Same two series side by side: predictions on the left, held-out values on the right.
fig, ax = plt.subplots(nrows=1, ncols=2)
sns.lineplot(predictions, ax=ax[0])
sns.lineplot(y_test[:10000], ax=ax[1])

In [None]:
def exponential_moving_average(data, window):
    """
    Smooth ``data`` with an exponentially weighted moving window.

    The ``window`` weights grow exponentially from exp(-1) to exp(0) and are
    normalised to sum to 1; the most recent sample in each window gets the
    largest weight.

    data: 1-D sequence of numbers with more than ``window`` elements
    window: Number of samples in the averaging window

    Returns an array of ``len(data)`` smoothed values. The first ``window``
    entries, which would otherwise be computed from an incomplete window, are
    all set to the value at index ``window``.

    Raises IndexError if ``len(data) <= window``.
    """
    weights = np.exp(np.linspace(-1., 0., window))
    weights /= weights.sum()
    # np.convolve reverses its second argument while sliding it over the data,
    # so hand the kernel over newest-first: the largest weight must multiply
    # the most recent sample, not the oldest one.
    ema = np.convolve(data, weights[::-1], mode='full')[:len(data)]
    # Overwrite the warm-up region with the first value past it.
    ema[:window] = ema[window]
    return ema

In [None]:
# NOTE(review): the smoothing call below is commented out, so `window` is unused
# and `ema` is just the first 720000 raw samples of y.
window = 10
# ema = exponential_moving_average(y, window)[:100000]
ema = y[:720000]
# Least-squares straight line (degree 1) through the series, evaluated at every sample.
z = np.polyfit(np.arange(0, ema.shape[0]), ema, 1)
trend = np.poly1d(z)(np.arange(0, ema.shape[0]))
plt.plot(ema)
plt.plot(trend)

Fit Holt Winters (aka ARIMA(0, 3, 3))

In [None]:
print('Start fit ARIMA')
model = ARIMA(y_train, order=(3, 1, 2))  # ARIMA with p=3, d=1, q=2
r = model.fit()
print(r.summary())

print('Start forecast')
# Never forecast further than the hold-out part reaches; otherwise the
# "actual" line below would have fewer points than its x positions.
horizon = min(100000, len(y_test))
forecast_index = np.arange(pos, pos + horizon, 1)
fc = r.forecast(horizon)

# np.asarray drops any index the forecast may carry so the values are placed
# positionally on forecast_index.
fc_series = pd.Series(np.asarray(fc), index=forecast_index)

print('Start plotting')
plt.figure(figsize=(12,5), dpi=100)
plt.plot(forecast_index, y_test[:horizon], label='actual')
plt.plot(fc_series, label='forecast')

plt.title('Forecast vs Actuals')
plt.legend(loc='lower left', fontsize=8)
plt.show()