##### Debug mode

In [1]:
# Load IPython's autoreload extension and reload all imported modules before each
# cell runs, so edits to the project's .py files apply without a kernel restart.
%load_ext autoreload
%autoreload 2

##### For relative imports

In [2]:
import os
import sys
# Put the parent directory on the import path so the project modules can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

##### Notebook

In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import mpld3
from interval import Interval
from observation import Observation
from constants import *
from datetime import timedelta
from tools import sequence_to_interval
plt.style.use('ggplot')
mpld3.enable_notebook()

# Folder holding the raw measurement files, relative to the notebook.
PATH = "../../../Data/GMPP_IRSDI/"

# Sites A..H combined with tranches 1 and 2, plus tranches 3 and 4 of sites B and F.
reactor_sites = [site + tranche for site in "ABCDEFGH" for tranche in "12"]
reactor_sites = reactor_sites + ["B3", "B4", "F3", "F4"]

# Signal suffixes to load; uncomment a group to load it as well.
suffixes = [
    "DEB1-1", "DEB1-2", "DEB1-3",  # ,"DEB1-4", # Leak flow rate at seal 1 (wide range)
    # "DEB2-1","DEB2-2","DEB2-3","DEB2-4", # Leak flow rate at seal 1 (narrow range)
    # "DEB3-1","DEB3-2","DEB3-3","DEB3-4","DEB3-5", # Injection flow rate at the seal
    # "PUI-",  # Mean thermal power
    # "PRE-",  # Pressure
    # "TEM1-", # Temperature of the seal injection line (in degrees Celsius) ### To be compared with DEB3
    # "TEM2-", # Temperature of the seal 1 leak
    # "TEM3-1","TEM3-2","TEM3-3","TEM3-4", # Water temperature at seal 1 - 051PO ### To be compared with DEB1 DEB2
    # "VIT-1","VIT-2","VIT-3","VIT-4" # Rotation speed
]

# One Observation per reactor site, keyed by the site name.
obs_dict = dict()

for reactor_site in reactor_sites:
    print("Loading observation " + reactor_site)
    obs_dict[reactor_site] = Observation(PATH, reactor_site, suffixes, verbose=0)



Loading observation A1
Loading observation A2
Loading observation B1
Loading observation B2
Loading observation C1
Loading observation C2
Loading observation D1
Loading observation D2
Loading observation E1
Loading observation E2
Loading observation F1
Loading observation F2
Loading observation G1
Loading observation G2
Loading observation H1
Loading observation H2
Loading observation B3
Loading observation B4
Loading observation F3
Loading observation F4


### Normal mode

In [4]:
# Re-execute this cell every time you get a
# TypeError: super(type, obj): obj must be an instance or subtype of type
from scale import *
from feature import *
from tools import *
from score import *

### Defining Dataframe wrapper and highlight function

In [None]:
class MyDataFrame(object):
    """Wrap a DataFrame and remember every version it has gone through.

    Attributes:
        df: the current DataFrame.
        list_df: every DataFrame seen so far, oldest first (the initial one included).
        n_updates: number of times ``update`` has been called.
    """

    def __init__(self, df):
        self.df = df
        self.list_df = [df]
        self.n_updates = 0

    def update(self, df):
        """Replace the current DataFrame with ``df`` and record it in the history."""
        self.df = df
        self.list_df.append(df)
        self.n_updates += 1

    def last_indices_removed(self):
        """Return the index labels that the most recent update dropped.

        Before any update there is no previous version to compare with, so an
        empty index is returned (the original code raised IndexError here).
        """
        if len(self.list_df) < 2:
            return self.df.index[:0]
        previous, current = self.list_df[-2], self.list_df[-1]
        return previous.index.difference(current.index)

    def __str__(self):
        # Short text for display: the row count, capped at ">999".
        n_rows = len(self.df)
        return str(n_rows) if n_rows <= 999 else ">999"

# Style should be an attribute of the dataframe
    
def highlight_not_null(df_colors=['green'], null_color='red',unknown_color='white'):
    """Build a style function that colours cells by what they contain.

    The returned function takes an iterable of cell values and returns one
    ``'background-color: ...'`` string per value:

    * null values get ``null_color``;
    * ``MyDataFrame`` values get ``df_colors[n_updates]``, so ``df_colors`` needs
      at least ``n_updates + 1`` entries;
    * anything else gets ``unknown_color``.
    """
    def style_function(s):
        def pick_color(value):
            # The null test comes first, exactly as in the original if/elif chain.
            if pd.isnull(value):
                return null_color
            if type(value) is MyDataFrame:
                return df_colors[value.n_updates]
            return unknown_color

        return ['background-color: ' + pick_color(value) for value in s]
    return style_function

### Initializing features_df

In [None]:
# Grid of computed features: one row per feature kind, one column per time scale.
scales_tag = ["10m","3h","6h","1d","7d","30d","180d"]
# "debCorr" is included because the feature list and the score computation both use that row.
features_tag = ["trend","step","spike","oscillation","correlation","debCorr"]
# dtype=object: the cells are filled with MyDataFrame wrappers (or stay null), not floats,
# so a float64 grid would need an implicit dtype change on the first assignment.
features_df = pd.DataFrame(index=features_tag,columns=scales_tag,dtype=object)
features_df.style.apply(highlight_not_null(['yellow']))

<hr style="height:3px">

### Define slice-resample functions

This is to handle the fact that we resample the dataframe, and then split it according to the periods of interest.

We remove the low-regime periods, as well as the power-up and power-down phases of each cycle.

In [None]:
def score_list_df(list_df, feature):
    """Score each DataFrame of ``list_df`` with ``feature`` and stack the results.

    Every frame is scored through ``feature.score(df, [deb1[0]])``, i.e. on the first
    entry of the notebook-level ``deb1`` only (presumably supplied by the star import
    from ``constants`` -- TODO confirm). The per-frame results are concatenated
    row-wise.
    """
    scored_frames = [feature.score(df, [deb1[0]]) for df in list_df]
    return pd.concat(scored_frames, axis=0)
    
def list_subsampled(obs, scale):
    """Resample the observation's full DataFrame, then split it into pieces.

    ``scale.scale`` is applied to ``obs.full_concatenated_df`` and the result is handed
    to ``obs.low_regime_intervals.split_between`` with ``time=timedelta(days=3)``.
    NOTE(review): the 3 days are presumably a margin kept around each low-regime
    interval -- confirm against ``split_between``.
    """
    resampled_df = scale.scale(obs.full_concatenated_df)
    low_regime = obs.low_regime_intervals
    return low_regime.split_between(resampled_df, time=timedelta(days=3))

def scale_feature(scale, feature, observation=None):
    """Resample, split and score one observation for a given scale and feature.

    Args:
        scale: object whose ``scale`` method resamples the observation's DataFrame.
        feature: object whose ``score`` method scores each resulting piece.
        observation: the observation to process. When omitted, the notebook-level
            ``obs`` is used, which is what the original two-argument form always did.
            NOTE(review): no visible cell defines ``obs``; it is presumably one entry
            of ``obs_dict`` -- passing ``observation`` explicitly avoids relying on it.

    Returns:
        The concatenated scores returned by ``score_list_df``.
    """
    if observation is None:
        # Backward-compatible default: fall back on the global the original code read.
        observation = obs
    return score_list_df(list_subsampled(observation, scale), feature)

### Features description

In [None]:
# Features to compute. Each entry is
#   (row tag in features_df, column tag in features_df, scale object, feature object),
# which is the order in which the computing loop unpacks it.
# Commented-out entries are features that are currently switched off.
features_list = [
#     ("trend","1d",HourScale(1), Trend(24)),  # 1 days
#     ("trend","7d",HourScale(6), Trend(28)),  # 7 days
#     ("trend","30d",DayScale(1), Trend(30)),  # 30 days
#     ("trend","180d",DayScale(6), Trend(30)),  # 180 days
#     ("step","6h",MinutesScale(), Step(36)), # 6 hours
#     ("step","1d",HourScale(1), Step(24)),   # 1 days
#     ("step","7d",HourScale(6), Step(28)),   # 7 days
      ("debCorr","1d",HourScale(1),MinCorrelationDeb(24)), #1 day
#     ("correlation","6h",MinutesScale(), Correlation(36)), # 6 hours
#     ("correlation","1d",HourScale(1), Correlation(24)),   # 1 days
#     ("correlation","7d",HourScale(6), Correlation(28)),   # 7 days
#     ("correlation","30d",DayScale(1), Correlation(30)),  # 30 days
#     ("spike","10m",MinutesScale(), Spike()),
#     ("oscillation","3h",MinutesScale(),Oscillation(3,18)),
#     ("oscillation","6h",MinutesScale(),Oscillation(3,36))
]

### Compute features

In [None]:
# Compute every scheduled feature and store it, wrapped, in its (feature, scale) cell.
for feature_spec in features_list:
    rule_tag, scale_tag, scale, rule = feature_spec
    scored_feature = scale_feature(scale, rule)
    features_df.loc[rule_tag, scale_tag] = MyDataFrame(scored_feature)

Visualization of the features dataframe

In [None]:
# Render the grid: null cells in red, cells holding a not-yet-updated MyDataFrame in yellow.
features_df.style.apply(highlight_not_null(['yellow']))

To add a row (a new feature), it's as simple as:

In [None]:
# Create the row only when it is missing. Running this cell after the features have
# been computed must not overwrite the "debCorr" results with empty cells.
if "debCorr" not in features_df.index:
    features_df.loc["debCorr"] = [None for scale in scales_tag]

And to save a features dataframe:

In [None]:
import os
import pickle

# Save under the current user's home directory rather than one hard-coded account path.
# On a machine whose home directory is /home/mehlman this is the same file as before.
features_pickle_path = os.path.join(os.path.expanduser('~'), 'Data', 'features_df.pkl')
# NOTE: the cells hold MyDataFrame objects, a class defined in this notebook, so that
# class must be defined again before the file can be unpickled.
with open(features_pickle_path, 'wb') as output:
    pickle.dump(features_df, output, pickle.HIGHEST_PROTOCOL)

<hr style="height:3px">

### Describe scores

In [None]:
# Grid of scores: one row per score kind, one column per time scale.
scales_tag = ["10m","3h","6h","1d","7d","30d","180d"]
scores_tag = ["trend_up","trend_down","step","spike","oscillation","debCorr"]
# dtype=object: the cells are filled with MyDataFrame wrappers (or stay null), not floats,
# so a float64 grid would need an implicit dtype change on the first assignment.
scores_kept_df = pd.DataFrame(index=scores_tag,columns=scales_tag,dtype=object)

### Compute scores

In [None]:
# How each score is derived from a feature:
#   (row in scores_kept_df, row in features_df, transform applied to the feature DataFrame)
score_recipes = [
    ("trend_up", "trend", lambda df: df),
    ("trend_down", "trend", lambda df: -df),
    ("step", "step", lambda df: df.abs()),
    ("spike", "spike", lambda df: df),
    ("oscillation", "oscillation", lambda df: df),
    ("debCorr", "debCorr", lambda df: 1 - df.abs()),
]

for scale_tag in scales_tag:
    for score_tag, feature_tag, transform in score_recipes:
        feature_cell = features_df.loc[feature_tag, scale_tag]
        if pd.isnull(feature_cell):
            # Feature not computed at this scale: leave the score empty.
            scores_kept_df.loc[score_tag, scale_tag] = np.nan
        else:
            scores_kept_df.loc[score_tag, scale_tag] = MyDataFrame(transform(feature_cell.df))

Visualization of the scores dataframe

In [None]:
# One background colour per update count: a cell holding a MyDataFrame is painted with
# colors_scores_kept[n_updates], so the colour shows how many times the score was updated.
colors_scores_kept = ["#3afffb","#1ecebc","#25ed75","#26ad5c","#26ad5c","#26ad5c"]
scores_kept_df.style.apply(highlight_not_null(colors_scores_kept))

### Schedule analysis

In [None]:
# Analyses to run. Each entry is
#   (row tag in scores_kept_df, column tag in scores_kept_df, ScoreAnalysis object).
# NOTE(review): the two ScoreAnalysis arguments look like a score threshold and a time
# window -- confirm against the ScoreAnalysis definition.
analysis_list = [
    ("trend_up","1d",ScoreAnalysis(2,timedelta(days=1))),  # 1 days
    ("trend_up","7d",ScoreAnalysis(2,timedelta(days=7))),  # 7 days
    ("trend_up","30d",ScoreAnalysis(2,timedelta(days=30))),  # 30 days
    ("trend_up","180d",ScoreAnalysis(2,timedelta(days=180))),  # 180 days
    ("trend_down","1d",ScoreAnalysis(2,timedelta(days=1))),  # 1 days
    ("trend_down","7d",ScoreAnalysis(2,timedelta(days=7))),  # 7 days
    ("trend_down","30d",ScoreAnalysis(2,timedelta(days=30))),  # 30 days
    ("trend_down","180d",ScoreAnalysis(2,timedelta(days=180))),  # 180 days
    ("step","6h",ScoreAnalysis(50,timedelta(hours=6))), # 6 hours
    ("step","1d",ScoreAnalysis(50,timedelta(days=1))),   # 1 days
    ("step","7d",ScoreAnalysis(50,timedelta(days=7))),   # 7 days
    ("spike","10m",ScoreAnalysis(100,timedelta(hours=3))),
    ("oscillation","3h",ScoreAnalysis(350,timedelta(hours=5))),
    ("debCorr","1d",ScoreAnalysis(0.8,timedelta(days=1)))
]

### Compute analysis

Here, we **update** the objects in `scores_kept_df` so that we keep track of the evolution of everything

In [None]:
# Run each scheduled analysis on its score and record the result as a new version.
for rule_tag, scale_tag, analysis in analysis_list:
    score_cell = scores_kept_df.loc[rule_tag,scale_tag]
    # A score whose feature was never computed is stored as NaN and has no `.df`;
    # skip it instead of failing with an AttributeError.
    if pd.isnull(score_cell):
        continue
    df_score = score_cell.df
    score_cell.update(analysis.analyse_and_sort(df_score))

Visualization of the modified scores dataframe

In [None]:
# Render the scores grid again; each MyDataFrame cell is coloured by its update count.
scores_kept_df.style.apply(highlight_not_null(colors_scores_kept))

###  Combine all results

#### Remove overlapping

In [None]:
def remove_overlapping(obj_list,widths):
    """Update each score wrapper of ``obj_list`` with the output of ``combine``.

    The current DataFrames of the wrappers are passed to ``combine`` together with an
    empty second list and ``widths``; the i-th result becomes the new version of the
    i-th wrapper. NOTE(review): ``combine`` comes from a star import -- check there how
    ``widths`` is matched to the DataFrames.
    """
    current_scores = [obj.df for obj in obj_list]
    combined_scores = combine(current_scores, [], widths)
    for obj, new_score in zip(obj_list, combined_scores):
        obj.update(new_score)

# Trends: dropna() keeps only the scales whose score exists, in column order.
# NOTE(review): widths always has four entries, so it presumably lines up with the kept
# scores only when exactly the 1d/7d/30d/180d scores exist -- confirm how combine pairs them.
widths = [timedelta(days=1),timedelta(days=7),timedelta(days=30),timedelta(days=180)]
remove_overlapping(scores_kept_df.loc["trend_up"].dropna().values,widths)
remove_overlapping(scores_kept_df.loc["trend_down"].dropna().values,widths)

# Steps: `widths` is rebound here, so this call must stay after the two trend calls above.
widths = [timedelta(hours=6),timedelta(days=1),timedelta(days=7),timedelta(days=30)]
remove_overlapping(scores_kept_df.loc["step"].dropna().values,widths)

In [None]:
# Render the scores grid after overlap removal; colours follow each cell's update count.
scores_kept_df.style.apply(highlight_not_null(colors_scores_kept))

#### Remove inclusions

Step up $\subset $ Increasing Trend

Step down $\subset $ Decreasing Trend

Spike $\cap$ Oscillation $\neq \varnothing$

In [None]:
def update_df(scores_to_update,scores_exempt,widths,multiply = None):
    """Update the wrappers of ``scores_to_update`` with the output of ``combine``.

    ``combine`` receives the current DataFrames of ``scores_to_update``, those of
    ``scores_exempt``, ``widths`` and ``multiply``. Its i-th result becomes the new
    version of the i-th wrapper in ``scores_to_update``; the wrappers in
    ``scores_exempt`` are read but not updated here.
    """
    dfs_to_update = [obj.df for obj in scores_to_update]
    dfs_exempt = [obj.df for obj in scores_exempt]
    df_updates = combine(dfs_to_update, dfs_exempt, widths, multiply)
    for score, updated_df in zip(scores_to_update, df_updates):
        score.update(updated_df)

#### Remove

In [None]:
# 1-day and 7-day trends (both directions) are updated against the step scores.
# NOTE(review): widths has 4 + 3 entries, presumably one per score with the updated
# scores first and the exempt scores after -- confirm against combine.
scores_to_update = [scores_kept_df.loc["trend_up","1d"],scores_kept_df.loc["trend_down","1d"],
                   scores_kept_df.loc["trend_up","7d"],scores_kept_df.loc["trend_down","7d"]] 
scores_exempt = [scores_kept_df.loc["step","6h"],scores_kept_df.loc["step","1d"],scores_kept_df.loc["step","7d"]]
widths = [timedelta(days=0),timedelta(days=0),timedelta(days=0),timedelta(days=0),timedelta(hours=6),timedelta(days=1),timedelta(days=7)]

update_df(scores_to_update,scores_exempt,widths)

# Spikes are updated against the 3-hour oscillation score; the three names are rebound,
# so this second call must stay after the first one.
scores_to_update = [scores_kept_df.loc["spike","10m"]]
scores_exempt = [scores_kept_df.loc["oscillation","3h"]]
widths = [timedelta(days=0),timedelta(hours=3)]

update_df(scores_to_update,scores_exempt,widths)

In [None]:
# Render the scores grid after inclusion removal; colours follow each cell's update count.
scores_kept_df.style.apply(highlight_not_null(colors_scores_kept))

### Save and load

To save and load this final result.

In [None]:
# with open('/home/mehlman/Data/scores_kept_df.pkl', 'wb') as output:
#     pickle.dump(scores_kept_df, output, pickle.HIGHEST_PROTOCOL)

# with open('/home/mehlman/Data/scores_kept_df.pkl', 'rb') as input:
#     scores_kept_df_2 = pickle.load(input)

<hr style="height:3px">

### Observation

We can simply inspect our results.

In [None]:
from interval import Interval

# Index labels kept for the 1-day debCorr score, turned into intervals by Interval.
debcorr_scores = scores_kept_df.loc["debCorr", "1d"].df
anomaly_indices = debcorr_scores.index.tolist()
anomaly_windows = Interval(anomaly_indices, timedelta(hours=12))
indices = anomaly_windows.intervals

In [None]:
# Plot the deb1 columns over one of the intervals (number 50 is hard-coded).
begin,end = indices[50]
obs.full_concatenated_df[begin:end][deb1].plot()

We can look at the last indices removed to check that we have **separated the trends from the steps**.

In [None]:
# Index labels present before the latest update of the 7-day upward-trend score but
# absent after it, turned into intervals by Interval.
last_indices_removed = scores_kept_df.loc["trend_up","7d"].last_indices_removed().tolist()
indices = Interval(last_indices_removed,timedelta(days=3)).intervals

In [None]:
# Plot the first deb1 column over the first of those intervals.
begin,end = indices[0]
obs.full_concatenated_df[begin:end][deb1[0]].plot()

In [None]:
from interval import Interval

# Quick check of Interval.intervals_in on the 10-minute spike score: use the first
# element of rows 5 and 10 of `intervals` as bounds and display what the call returns.
test = Interval(scores_kept_df.loc["spike","10m"].df.index.tolist(), timedelta(days=1))
begin, end = test.intervals[5,0], test.intervals[10,0]
test.intervals_in([begin, end])

In [None]:
from datetime import datetime

def plot_portion(df, width, period,color='red', alpha = 0.3, lw = 4.0):
    """Overlay on the current plot the parts of the first deb1 signal flagged by ``df``.

    An ``Interval`` is built from the index of ``df`` and ``width``; for every
    (begin, end) pair returned by its ``intervals_in`` for ``period``, the matching
    slice of ``obs.full_concatenated_df[deb1[0]]`` is plotted with the given colour,
    transparency and line width. ``obs`` and ``deb1`` are notebook-level names.
    """
    period_begin, period_end = period
    flagged = Interval(df.index.tolist(), width)
    for window_begin, window_end in flagged.intervals_in([period_begin, period_end]):
        signal = obs.full_concatenated_df[deb1[0]]
        signal[window_begin:window_end].plot(color=color, alpha=alpha, lw=lw)

def plot_anomalies_for_period(periods):
    """Plot the first deb1 signal over a period and overlay the detected anomalies.

    Args:
        periods: a (begin, end) pair delimiting the period to plot. The original body
            ignored this argument and read the notebook-level ``period`` instead; the
            argument is now the only source of the period.

    The signal is drawn in grey, then each available score adds an overlay through
    ``plot_portion``. Scores that were never computed (null cells of
    ``scores_kept_df``) are skipped instead of raising an AttributeError.
    """
    period = periods
    obs.full_concatenated_df[deb1[0]][period[0]:period[1]].plot(color='grey')

    # (score row, scale column, overlay width, extra plot_portion keyword arguments)
    overlays = [
        ("spike", "10m", timedelta(minutes=10), dict(color='yellow', alpha=0.6)),
        ("oscillation", "3h", timedelta(hours=2), dict(color='k', alpha=0.8)),
        ("trend_up", "1d", timedelta(hours=12), dict(color='#006600')),
        ("trend_up", "7d", timedelta(days=3), dict(color='#00ff00', alpha=0.5)),
        ("trend_up", "30d", timedelta(days=15), dict(color='#80ff80', alpha=0.5)),
        ("trend_down", "1d", timedelta(days=1), dict(color='#660000')),
        ("trend_down", "7d", timedelta(days=3), dict(color='#ff0000', alpha=0.5)),
        ("trend_down", "30d", timedelta(days=15), dict(color='#ff8080', alpha=0.5)),
        ("trend_down", "180d", timedelta(days=90), dict(color='#ff8080', alpha=0.2)),
        ("step", "6h", timedelta(hours=2), dict(color='#000066')),
        ("step", "1d", timedelta(hours=12), dict(color='#0000ff', alpha=0.5)),
        ("step", "7d", timedelta(days=3), dict(color='#8080ff', alpha=0.5)),
    ]
    for score_tag, scale_tag, width, style in overlays:
        score_cell = scores_kept_df.loc[score_tag, scale_tag]
        if pd.isnull(score_cell):
            continue
        plot_portion(score_cell.df, width, period, **style)

In [None]:
# Period to inspect, as a (begin, end) pair of datetimes.
period = (datetime(2008, 9, 5),datetime(2009,2, 5))  
plot_anomalies_for_period(period)

In [None]:
# Second period to inspect, as a (begin, end) pair of datetimes.
period = (datetime(2011, 2, 1),datetime(2012, 1, 1)) 
plot_anomalies_for_period(period)

In [None]:
rolling_size = 10
# Last 100 values of the first three deb1 columns.
a1, a2, a3 = [obs.full_concatenated_df[deb1[k]].tail(100).values for k in range(3)]
# Smallest entry of the 3x3 correlation-coefficient matrix of the three series.
np.corrcoef([a1, a2, a3]).min()