In [None]:
import os
from dotenv import load_dotenv

from typing import Final

import mlflow
import mlflow.entities
import mlflow.data.pandas_dataset
from mlflow.data.sources import (
    LocalArtifactDatasetSource
)

import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler
)

from DataLoader import (
    loader,
    config
)

from Processer import preprocesser

import pickle
import PIL

import scripts.compress_datatypes as S

In [None]:
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): the credentials and server address read via os.getenv in the
# following cells are presumably defined in that file — confirm.
load_dotenv()

In [None]:
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set. Without this check the ``None``
            returned by ``os.getenv`` would be assigned into ``os.environ``, which
            fails with an opaque ``TypeError`` that does not name the variable.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Environment variable {name!r} is not set (check the .env file)")
    return value


# Forward the project-specific credential variables to the names MLflow reads.
os.environ["MLFLOW_TRACKING_USERNAME"] = _require_env("TRACKING_USER")
os.environ["MLFLOW_TRACKING_PASSWORD"] = _require_env("TRACKING_PSWD")
# NOTE(review): large HTTP timeout, presumably in seconds, to allow big artifact transfers — confirm.
os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "9000"

In [None]:
EXPERIMENT_NAME: str = "COMBINE FEATURES, BUILD ITS"

# The tracking-server location comes from the environment ('ADRESS' is the variable's
# actual spelling). Validate both parts up front: an unset variable would otherwise be
# formatted into the URI as the literal text "None" and only fail later, far from the cause.
tracking_host = os.getenv('ADRESS')
tracking_port = os.getenv('PORT')
if not tracking_host or not tracking_port:
    raise RuntimeError("Environment variables 'ADRESS' and 'PORT' must both be set (check the .env file)")
mlflow.set_tracking_uri(f"http://{tracking_host}:{tracking_port}")

# Make EXPERIMENT_NAME the active experiment and keep its descriptor for later use.
CURRENT_EXPERIMENT: mlflow.entities.Experiment = mlflow.set_experiment(EXPERIMENT_NAME)

LOAD DATASET FROM MLFLOW SERVER

In [None]:
# Local folder that receives the download.
DST_PATH: str = "../data/processed/mixed"
# Artifact URI of the feature table; the path embeds the experiment id and the id of the
# run (36d231c830034e888ad1e04dd67741d5) under which the CSV is stored.
SOURSE_URI: str = "mlflow-artifacts:/516249343314470222/36d231c830034e888ad1e04dd67741d5/artifacts/artifacts/data/feature_dataset_full.csv"

# Fetch the artifact from the tracking server into DST_PATH.
mlflow.artifacts.download_artifacts(dst_path=DST_PATH, artifact_uri=SOURSE_URI)

LOAD DATASET AS PD DATAFRAME

In [None]:
# The CSV is read from a sub-folder that carries the same name as the file itself;
# the first CSV column becomes the frame's index.
feature_dataset = pd.read_csv(
    os.path.join(DST_PATH, "feature_dataset_full.csv/feature_dataset_full.csv"),
    index_col=0,
)

PLAN:
1. LOAD HISTORICAL FEATURES (RMSA, RMSA10, ETC...)
2. SMOOTH SIGNALS (OPTIONAL)
3. 

In [None]:
RUN_NAME = "PCA_EXPAND_HISTORICAL"

# One box per feature column, drawn on a logarithmic y-axis with vertical tick labels.
fig, ax = plt.subplots()
sns.boxplot(data=feature_dataset, ax=ax)
ax.set_yscale("log")
ax.tick_params(axis="x", labelrotation=90)

# Start a run named RUN_NAME and store the chart as one of its artifacts.
with mlflow.start_run(run_name=RUN_NAME) as run:
    mlflow.log_figure(figure=fig, artifact_file="artifacts/charts/boxes.png")

In [None]:
# Standardise the feature columns before PCA. The intermediate result gets a real name:
# assigning to `_` would shadow IPython's "last output" variable for the rest of the session.
scaler = StandardScaler()
scaled_stat_features = scaler.fit_transform(feature_dataset)

# NOTE: despite its name, `pca4` holds a 2-component projection (n_components=2).
pca = PCA(n_components=2)
pca4 = pca.fit_transform(scaled_stat_features)


In [None]:
# Scatter of the two principal components with a fitted regression line on top.
# x and y are passed as arrays, so no `data=` argument is needed.
fig, ax = plt.subplots()
sns.regplot(x=pca4[:, 0], y=pca4[:, 1], ax=ax)
sns.scatterplot(x=pca4[:, 0], y=pca4[:, 1], hue=feature_dataset.index.to_list(), legend=False, ax=ax)

# Re-open the run created under RUN_NAME and attach the chart to it. The lookup is checked
# explicitly so that a missing run is reported as such instead of as a bare `KeyError: 0`.
matching_runs = mlflow.search_runs(filter_string=f"run_name='{RUN_NAME}'")
if matching_runs.empty:
    raise RuntimeError(f"No MLflow run named {RUN_NAME!r} was found; execute the cell that starts this run first")
with mlflow.start_run(run_id=matching_runs["run_id"].iloc[0]) as run:
    mlflow.log_figure(fig, "artifacts/charts/pca2_reg.png")

CAN SEE 2 GROUPS => WANT TO FIND THE CHANGE POINT  
BUT HOW TO CONNECT CHANGE POINT TO RUL?

In [None]:
# HISTORICAL DATA
# ----------
# Locations of the raw historical CSV files, one file per signal.
# NOTE(review): the Russian file names presumably abbreviate the signal names
# (СКЗУ / СКЗП ~ RMS acceleration / RMS displacement, РОСТ ~ growth,
# АМПЛИТУДА ~ amplitude, РАЗМАХ ~ span, ПИК_ФАКТОР ~ peak factor) — confirm with the data owner.
RMSA_PATH: str = "../data/raw/historical/СКЗУ.csv"
RMSA10_PATH: str = "../data/raw/historical/СКЗУ10.csv"
RMSA_GROWTH: str = "../data/raw/historical/СКЗУ_РОСТ.csv"
RMSA_AMP: str = "../data/raw/historical/СКЗУ_АМПЛИТУДА.csv"
RMSD_SPAN: str = "../data/raw/historical/СКЗП_РАЗМАХ.csv"
PK_FACTOR: str = "../data/raw/historical/ПИК_ФАКТОР.csv"

In [None]:
dataset_paths = [RMSA_PATH, RMSA10_PATH, RMSA_GROWTH, RMSA_AMP, RMSD_SPAN, PK_FACTOR]

# Accumulator for the merged historical signals; it starts empty so that the first
# loaded frame is taken as-is.
tmp = pd.DataFrame()
for path in dataset_paths:
    series_hist = pd.read_csv(path, skiprows=config.COUNT_SKIP, sep=';')
    series_hist = loader.fill_empty(loader.transform_header(series_hist))
    # NOTE(review): presumably aggregates the signal into hourly buckets keeping the
    # maximum of each bucket — confirm against Processer.preprocesser.compress.
    series_hist = preprocesser.compress(series_hist, floor='h', method="max")

    print(series_hist.shape)

    if tmp.empty:
        tmp = series_hist
    else:
        # Index-on-index inner join: only rows whose index label exists in both frames
        # survive. Joining on the index directly avoids the helper `key_0` column that
        # `on=<array>` creates and that then has to be dropped again.
        tmp = tmp.join(series_hist, how="inner")

In [None]:
# Parse the index labels as timestamps and round each one down to the start of its hour.
# NOTE(review): presumably done so the labels line up with the hourly historical data — confirm.
date_series = pd.Series(pd.to_datetime(feature_dataset.index))
feature_dataset.index = date_series.dt.floor('h')

In [None]:
# Attach the historical signals to the feature table, keeping only timestamps present in
# both. Passing the index through `on=` makes pandas add a helper column named "key_0",
# which is removed again right after the join.
joined_features = feature_dataset.join(tmp, on=feature_dataset.index, how="inner")
feature_dataset = joined_features.drop(columns="key_0")

In [None]:
# Persist the combined feature table (index included) as CSV.
feature_dataset.to_csv(path_or_buf="../data/processed/mixed/features.csv")

In [None]:
# Convert every column label to a string.
# NOTE(review): presumably required by the MLflow dataset/table logging below — confirm.
feature_dataset.columns = feature_dataset.columns.astype(str)

# Wrap the frame as an MLflow dataset so it can be recorded as a run input.
dataset = mlflow.data.pandas_dataset.from_pandas(
    feature_dataset,
    source="",
    name="HIST WITH STAT FEATURES",
)

# Look up the run created under RUN_NAME and log the dataset, the CSV file and a table to it.
runs_with_name = mlflow.search_runs(filter_string=f"run_name='{RUN_NAME}'")
target_run_id = runs_with_name["run_id"][0]
with mlflow.start_run(run_id=target_run_id) as run:
    mlflow.log_input(dataset, context="COMBINE STAT WITH HIST FEATURES")
    mlflow.log_artifact("../data/processed/mixed/features.csv", "artifacts/data")
    mlflow.log_table(feature_dataset, "artifacts/data_json/features.json")

In [None]:
# One stacked line plot per feature column. The number of panels follows the frame instead
# of a hard-coded 13, so adding or removing a feature cannot raise IndexError or leave
# empty axes behind. `squeeze=False` + `ravel()` keeps `ax` a 1-D array even for one column.
feature_columns = list(feature_dataset.columns)
fig, ax = plt.subplots(len(feature_columns), figsize=(24, 24), squeeze=False)
ax = ax.ravel()
for i, col in enumerate(feature_columns):
    sns.lineplot(feature_dataset[col], ax=ax[i])
    # Hide the x tick labels to keep the stacked panels compact.
    ax[i].set_xticklabels([])
    # Only the last 10 characters of the label are shown; `str()` keeps this working
    # when the column labels are not strings.
    ax[i].set_ylabel(str(col)[-10:])

In [None]:
# Standardise the combined features and fit a PCA that keeps every component.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_dataset)
pca = PCA().fit(scaled_features)

# Running total of the variance share explained by the leading components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
# NOTE(review): the curve is plotted against its zero-based position, so x = k corresponds
# to k + 1 components — confirm the highlighted band (5..6) marks the intended spot.
ax.plot(cumulative_variance)
ax.axhline(0.95, c="green")  # 95 % reference level
ax.axvspan(5, 6, alpha=.5, color="red")
ax.set(xlabel='number of components', ylabel='cumulative explained variance')
ax.set_title("Number of components vs explained variance")

In [None]:
# Project the standardised features onto six principal components.
pca = PCA(n_components=6)
pca6 = pca.fit_transform(scaled_features)

# Lower-triangle grid of pairwise component plots, each drawn with a regression fit.
components_frame = pd.DataFrame(pca6)
sns.pairplot(components_frame, corner=True, kind="reg")

In [None]:
# Standardise the combined feature table. The intermediate result gets a real name:
# assigning to `_` would shadow IPython's "last output" variable for the rest of the session.
scaler = StandardScaler()
scaled_combined_features = scaler.fit_transform(feature_dataset)

# Two-component projection for a flat scatter view of the data.
pca = PCA(n_components=2)
pca2 = pca.fit_transform(scaled_combined_features)

sns.scatterplot(x=pca2[:, 0], y=pca2[:, 1])

In [None]:
# Create the "dategroup" column: 1 for rows before the cut-off timestamp, 2 for the rest.
# NOTE(review): pandas reads the dotted string month-first (1 Dec 2021) — confirm that is intended.
is_before_cutoff = feature_dataset.index < pd.Timestamp("12.01.2021")
feature_dataset["dategroup"] = np.where(is_before_cutoff, 1, 2)

In [None]:
# Boundaries of four consecutive, non-overlapping date ranges.
# NOTE(review): pandas reads these dotted strings month-first
# (1 Dec 2021, 1 Jan 2022, 1 Feb 2022) — confirm that is intended.
first_cut = pd.Timestamp("12.01.2021")
second_cut = pd.Timestamp("01.01.2022")
third_cut = pd.Timestamp("02.01.2022")

# Slice the table by its timestamp index; lower bounds are inclusive, upper bounds exclusive.
timestamps = feature_dataset.index
range1 = feature_dataset[timestamps < first_cut]
range2 = feature_dataset[(timestamps >= first_cut) & (timestamps < second_cut)]
range3 = feature_dataset[(timestamps >= second_cut) & (timestamps < third_cut)]
range4 = feature_dataset[timestamps >= third_cut]

In [None]:
# Label every row with the number (1-4) of the date range it belongs to.
# The write goes through a single `.loc` call on the frame itself. The chained form
# `feature_dataset["dategroup"].loc[...] = ...` assigns into an intermediate Series that
# pandas may treat as a copy (and always does under copy-on-write), in which case the
# frame is silently left unchanged.
for group_id, date_range in enumerate((range1, range2, range3, range4), start=1):
    feature_dataset.loc[date_range.index, "dategroup"] = group_id

In [None]:
# Take the group labels out of the feature table (the column is removed in place) and keep
# them as a separate Series, so that only feature columns remain in `feature_dataset`.
dategroup = feature_dataset.pop("dategroup")

In [None]:
# Standardise the feature table. The intermediate result gets a real name:
# assigning to `_` would shadow IPython's "last output" variable for the rest of the session.
scaler = StandardScaler()
scaled_combined_features = scaler.fit_transform(feature_dataset)

# Two-component projection, coloured by the date group of each row.
pca = PCA(n_components=2)
pca2 = pca.fit_transform(scaled_combined_features)

sns.scatterplot(x=pca2[:, 0], y=pca2[:, 1], hue=dategroup)