In [1]:
import copy
import datetime
import gc
import logging
import os
from math import exp, floor, sqrt
from time import time, sleep
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import seaborn as sns
import tensorflow as tf
import tqdm.notebook as tq
from minepy import MINE
from numpy import mean
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import (accuracy_score, auc, f1_score, precision_score,
                             recall_score, roc_auc_score, silhouette_score)
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     ShuffleSplit, StratifiedShuffleSplit,
                                     cross_val_score)
from tqdm.notebook import tqdm
from tslearn.barycenters import (dtw_barycenter_averaging,
                                 dtw_barycenter_averaging_subgradient,
                                 euclidean_barycenter, softdtw_barycenter)
from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.metrics import dtw, soft_dtw
from tslearn.preprocessing import (TimeSeriesResampler,
                                   TimeSeriesScalerMeanVariance)
from xgboost import XGBClassifier

# Apply seaborn's default styling to subsequent plots.
sns.set()

# Only let TensorFlow log records at ERROR level or above through.
tf.get_logger().setLevel(logging.ERROR)

# Query system memory statistics (the return value is discarded here).
psutil.virtual_memory()

# Raise pandas' display limits for rows, columns and line width.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [2]:
# Load the smart-meter readings; the path is relative to the notebook's
# working directory.
smart_meter = pd.read_csv("smart_meter.csv")

In [3]:
smart_meter.shape

(107484000, 5)

In [7]:
smart_meter

Unnamed: 0,MeterID,Day,Time,Electricity,FDI
0,1236,2009-07-15,1,0.134070,True
1,1236,2009-07-15,2,0.129562,True
2,1236,2009-07-15,3,0.113616,True
3,1236,2009-07-15,4,0.186020,True
4,1236,2009-07-15,5,0.075942,True
...,...,...,...,...,...
107483995,6954,2010-12-31,44,0.618000,False
107483996,6954,2010-12-31,45,0.371000,False
107483997,6954,2010-12-31,46,0.344000,False
107483998,6954,2010-12-31,47,0.319000,False


In [14]:
# Keep only the consumption column (a pandas Series at this point).
electricity = smart_meter['Electricity']

In [15]:
electricity

0            0.134070
1            0.129562
2            0.113616
3            0.186020
4            0.075942
               ...   
107483995    0.618000
107483996    0.371000
107483997    0.344000
107483998    0.319000
107483999    0.362000
Name: Electricity, Length: 107484000, dtype: float64

In [17]:
# Lay the consumption column out as rows of 48 consecutive readings.
# Derived directly from `smart_meter` so that the cell is idempotent: rebinding
# `electricity` from itself breaks on a second run, because the ndarray
# produced here has no `.to_numpy()` method.
electricity = smart_meter['Electricity'].to_numpy().reshape(-1, 48)

In [117]:
# NOTE(review): `dba_barycenter_result` is only assigned in a later cell
# (execution count 125), so this cell raises NameError on a fresh
# top-to-bottom run.
np.array(dba_barycenter_result).shape

(530, 48, 1)

In [124]:
def calc_stuff(x):
    """Return the subgradient-DBA barycenter of the series in ``x``.

    Thin wrapper around ``dtw_barycenter_averaging_subgradient`` that fixes
    ``max_iter=20`` and ``tol=1e-3`` and hands back its result unchanged.
    """
    return dtw_barycenter_averaging_subgradient(x, max_iter=20, tol=1e-3)

def procedure(i, n_days=530):
    """Compute the barycenter of every ``n_days``-th row of ``electricity``.

    Reads the notebook-level ``electricity`` array (rows of 48 readings) and
    selects rows ``i, i + n_days, i + 2 * n_days, ...``.  Presumably this is
    day ``i`` of each meter; that layout is inferred from the
    ``(-1, 530, 48, 5)`` reshape used elsewhere in the notebook - confirm.

    Parameters
    ----------
    i : int
        Offset of the first selected row.
    n_days : int, default 530
        Stride between selected rows.

    Returns
    -------
    Whatever ``calc_stuff`` returns for the ``(n_selected, 48, 1)`` stack.

    Notes
    -----
    The number of selected rows follows from the length of ``electricity``
    instead of a hard-coded 4225, so the function neither raises IndexError on
    a shorter array nor silently ignores extra rows of a longer one.  For the
    array used in this notebook (4225 * 530 rows) the selection is unchanged.
    """
    # A strided slice replaces the per-row Python append loop.
    day_rows = np.ascontiguousarray(electricity[i::n_days])
    return calc_stuff(day_rows.reshape(-1, 48, 1))

def main(PoolExecutor):
    """Map ``procedure`` over the inputs 0..529 on a pool and time the run.

    ``PoolExecutor`` is an executor class (ProcessPoolExecutor or
    ThreadPoolExecutor) and is instantiated with its default arguments.
    The executor class and the elapsed wall-clock seconds are printed, and
    the results are returned as a list in input order.
    """
    day_indices = range(530)

    started_at = time()
    with PoolExecutor() as pool:
        # Executor.map yields results in input order; collect them all before
        # the pool is shut down.
        output1 = list(pool.map(procedure, day_indices))
    elapsed = time() - started_at

    print(f'PoolExecutor: {PoolExecutor}')
    print(f'time : {elapsed}')
    return output1

In [125]:
# Run the pool driver defined in the preceding cell on worker processes
# (about 92 s in the recorded run).
dba_barycenter_result = main(ProcessPoolExecutor)



PoolExecutor: <class 'concurrent.futures.process.ProcessPoolExecutor'>
time : 92.36177825927734


In [126]:
def calc_stuff(x):
    """Return the soft-DTW barycenter of the series in ``x``.

    Thin wrapper around ``softdtw_barycenter`` that fixes ``max_iter=10`` and
    ``gamma=1.`` and hands back its result unchanged.
    """
    return softdtw_barycenter(x, max_iter=10, gamma=1.)

def procedure(i, n_days=530):
    """Compute the barycenter of every ``n_days``-th row of ``electricity``.

    Reads the notebook-level ``electricity`` array (rows of 48 readings) and
    selects rows ``i, i + n_days, i + 2 * n_days, ...``.  Presumably this is
    day ``i`` of each meter; that layout is inferred from the
    ``(-1, 530, 48, 5)`` reshape used elsewhere in the notebook - confirm.

    Parameters
    ----------
    i : int
        Offset of the first selected row.
    n_days : int, default 530
        Stride between selected rows.

    Returns
    -------
    Whatever ``calc_stuff`` returns for the ``(n_selected, 48, 1)`` stack.

    Notes
    -----
    The number of selected rows follows from the length of ``electricity``
    instead of a hard-coded 4225, so the function neither raises IndexError on
    a shorter array nor silently ignores extra rows of a longer one.  For the
    array used in this notebook (4225 * 530 rows) the selection is unchanged.
    """
    # A strided slice replaces the per-row Python append loop.
    day_rows = np.ascontiguousarray(electricity[i::n_days])
    return calc_stuff(day_rows.reshape(-1, 48, 1))

def main(PoolExecutor):
    """Map ``procedure`` over the inputs 0..529 on a pool and time the run.

    ``PoolExecutor`` is an executor class (ProcessPoolExecutor or
    ThreadPoolExecutor) and is instantiated with its default arguments.
    The executor class and the elapsed wall-clock seconds are printed, and
    the results are returned as a list in input order.
    """
    day_indices = range(530)

    started_at = time()
    with PoolExecutor() as pool:
        # Executor.map yields results in input order; collect them all before
        # the pool is shut down.
        output1 = list(pool.map(procedure, day_indices))
    elapsed = time() - started_at

    print(f'PoolExecutor: {PoolExecutor}')
    print(f'time : {elapsed}')
    return output1

In [127]:
# Run the pool driver defined in the preceding cell on worker processes
# (about 1035 s in the recorded run).
sdtw_barycenter_result = main(ProcessPoolExecutor)

PoolExecutor: <class 'concurrent.futures.process.ProcessPoolExecutor'>
time : 1035.4732313156128


In [145]:
# Turn each list of barycenters into a DataFrame with one row per barycenter.
# Every barycenter is a sequence of time steps; only the first feature of each
# step is kept, exactly as the previous `t[0]` comprehension did.
# The frames are built in a single step with the plain constructor:
# `DataFrame.from_dict` is meant for dict input, and `dba` / `sdtw` are no
# longer bound to a temporary list before being rebound to a DataFrame.
dba = pd.DataFrame(
    [np.asarray(barycenter)[:, 0] for barycenter in dba_barycenter_result]
)
sdtw = pd.DataFrame(
    [np.asarray(barycenter)[:, 0] for barycenter in sdtw_barycenter_result]
)

In [178]:
dba.shape, sdtw.shape

((530, 48), (530, 48))

In [180]:
dba.iloc[9]

0     0.475216
1     0.219563
2     0.146000
3     0.069826
4     0.296343
5     0.160963
6     0.198342
7     0.190085
8     0.191796
9     0.191633
10    0.191628
11    0.191610
12    0.191617
13    0.191619
14    0.191620
15    0.191620
16    0.191619
17    0.191619
18    0.191616
19    0.191619
20    0.191617
21    0.191620
22    0.191606
23    0.191629
24    0.191599
25    0.191636
26    0.191289
27    0.191943
28    0.191906
29    0.190445
30    0.190879
31    0.196124
32    0.177819
33    0.209692
34    0.099041
35    0.337689
36    0.993191
37    0.311075
38    0.138546
39    0.475397
40    2.173675
41    0.788471
42    0.344014
43    0.397127
44    0.134173
45    0.502345
46    1.338514
47    0.412997
Name: 9, dtype: float64

In [162]:
# Convert the whole table to an array and regroup it so the trailing axes are
# 530 x 48 x 5 (the 5 being the table's columns); the leading axis is inferred.
all_meter = np.array(smart_meter).reshape(-1, 530, 48, 5) # (4225, 530, 48, 5)

In [183]:
def calc_stuff(x, y):
    """Return ``(euclidean, dtw, soft_dtw)`` for the two series ``x`` and ``y``.

    The Euclidean term is the norm of the element-wise difference; the other
    two come from tslearn's ``dtw`` and ``soft_dtw``.  The recorded outputs
    show that the soft-DTW value can be negative.
    """
    return np.linalg.norm(x - y), dtw(x, y), soft_dtw(x, y)

def procedure(j, n_days=530, samples_per_day=48):
    """Score each day of one meter against the matching row of ``dba``.

    Parameters
    ----------
    j : numpy.ndarray
        Records of a single meter.  It is flattened to rows of 5 fields in the
        order MeterID, Day, Time, Electricity, FDI.
    n_days : int, default 530
        Number of consecutive day windows to score.
    samples_per_day : int, default 48
        Number of records per day window.

    Returns
    -------
    list of dict
        One dict per day with the keys ``MeterID``, ``Day``, ``FDI``,
        ``meter_euclid``, ``meter_dtw`` and ``meter_sdtw``.

    Notes
    -----
    The records are indexed positionally instead of through a temporary
    DataFrame, and everything that does not depend on the day is looked up
    once before the loop.  The values produced are the same as before.
    """
    records = j.reshape(-1, 5)

    # MeterID and FDI are taken from the first record only.
    meter_id = records[0, 0]
    meter_fdi = records[0, 4]
    day_column = records[:, 1]
    electricity_column = records[:, 3]

    df_list = []
    for i in range(n_days):
        first = i * samples_per_day
        x = electricity_column[first:first + samples_per_day]
        y = dba.iloc[i]

        meter_euclid, meter_dtw, meter_sdtw = calc_stuff(x, y)
        df_list.append({
            'MeterID': meter_id,
            # The day label is read from the first record of the window.
            'Day': day_column[first],
            'FDI': meter_fdi,
            "meter_euclid": meter_euclid,
            "meter_dtw": meter_dtw,
            "meter_sdtw": meter_sdtw,
        })
    return df_list

def main(PoolExecutor):
    """Map ``procedure`` over every entry of ``all_meter`` and time the run.

    ``PoolExecutor`` is an executor class (ProcessPoolExecutor or
    ThreadPoolExecutor) and is instantiated with its default arguments.
    Prints ``start`` first, then the executor class and the elapsed
    wall-clock seconds, and returns the per-entry results in input order.
    """
    print('start')
    # Carried-over note from the original cell: "data_group[0] has 422 entries".
    per_meter_inputs = all_meter

    started_at = time()
    with PoolExecutor() as pool:
        # Executor.map yields results in input order; collect them all before
        # the pool is shut down.
        output1 = list(pool.map(procedure, per_meter_inputs))
    elapsed = time() - started_at

    print(f'PoolExecutor: {PoolExecutor}')
    print(f'time : {elapsed}')
    return output1

In [184]:
# Run the distance driver from the preceding cell on worker processes
# (about 249 s in the recorded run).
dba_distance = main(ProcessPoolExecutor)

PoolExecutor: <class 'concurrent.futures.process.ProcessPoolExecutor'>
time : 249.44612503051758


In [189]:
# Array form of the nested result; in the cells visible here it is only used
# to inspect the shape.
dba_np = np.array(dba_distance)

In [202]:
dba_np.shape

(4225, 530)

In [205]:
# Flatten the nested per-meter result into a single list of day records,
# keeping the original order.
dba_list = []
for meter_records in dba_distance:
    dba_list.extend(meter_records)

In [206]:
# One row per record dict collected in `dba_list`.
dba_df = pd.DataFrame(dba_list)

In [207]:
dba_df

Unnamed: 0,MeterID,Day,FDI,meter_euclid,meter_dtw,meter_sdtw
0,1236,2009-07-15,True,2.931790,2.697003,-69.102430
1,1236,2009-07-16,True,2.899283,1.964247,-70.214551
2,1236,2009-07-17,True,3.123288,1.724819,-67.936155
3,1236,2009-07-18,True,3.423259,2.645615,-64.757519
4,1236,2009-07-19,True,3.230180,1.772408,-66.748386
...,...,...,...,...,...,...
2239245,6954,2010-12-27,False,4.860049,2.071172,-60.018025
2239246,6954,2010-12-28,False,5.312420,2.635191,-50.409350
2239247,6954,2010-12-29,False,5.099870,2.176597,-58.832767
2239248,6954,2010-12-30,False,3.532870,1.933502,-67.212282


In [208]:
# Let pandas write the file itself.  This avoids holding the whole CSV as one
# in-memory string, and the handle is closed even if the write fails (the
# previous open()/write()/close() sequence left it open on error).
dba_df.to_csv('all_area_dba_barycenter.csv', index=False)

In [209]:
def calc_stuff(x, y):
    """Return ``(euclidean, dtw, soft_dtw)`` for the two series ``x`` and ``y``.

    The Euclidean term is the norm of the element-wise difference; the other
    two come from tslearn's ``dtw`` and ``soft_dtw``.  The recorded outputs
    show that the soft-DTW value can be negative.
    """
    return np.linalg.norm(x - y), dtw(x, y), soft_dtw(x, y)

def procedure(j, n_days=530, samples_per_day=48):
    """Score each day of one meter against the matching row of ``sdtw``.

    Parameters
    ----------
    j : numpy.ndarray
        Records of a single meter.  It is flattened to rows of 5 fields in the
        order MeterID, Day, Time, Electricity, FDI.
    n_days : int, default 530
        Number of consecutive day windows to score.
    samples_per_day : int, default 48
        Number of records per day window.

    Returns
    -------
    list of dict
        One dict per day with the keys ``MeterID``, ``Day``, ``FDI``,
        ``meter_euclid``, ``meter_dtw`` and ``meter_sdtw``.

    Notes
    -----
    The records are indexed positionally instead of through a temporary
    DataFrame, and everything that does not depend on the day is looked up
    once before the loop.  The values produced are the same as before.
    """
    records = j.reshape(-1, 5)

    # MeterID and FDI are taken from the first record only.
    meter_id = records[0, 0]
    meter_fdi = records[0, 4]
    day_column = records[:, 1]
    electricity_column = records[:, 3]

    df_list = []
    for i in range(n_days):
        first = i * samples_per_day
        x = electricity_column[first:first + samples_per_day]
        y = sdtw.iloc[i]

        meter_euclid, meter_dtw, meter_sdtw = calc_stuff(x, y)
        df_list.append({
            'MeterID': meter_id,
            # The day label is read from the first record of the window.
            'Day': day_column[first],
            'FDI': meter_fdi,
            "meter_euclid": meter_euclid,
            "meter_dtw": meter_dtw,
            "meter_sdtw": meter_sdtw,
        })
    return df_list

def main(PoolExecutor):
    """Map ``procedure`` over every entry of ``all_meter`` and time the run.

    ``PoolExecutor`` is an executor class (ProcessPoolExecutor or
    ThreadPoolExecutor) and is instantiated with its default arguments.
    The executor class and the elapsed wall-clock seconds are printed, and
    the per-entry results are returned in input order.
    """
    # Carried-over note from the original cell: "data_group[0] has 422 entries".
    per_meter_inputs = all_meter

    started_at = time()
    with PoolExecutor() as pool:
        # Executor.map yields results in input order; collect them all before
        # the pool is shut down.
        output1 = list(pool.map(procedure, per_meter_inputs))
    elapsed = time() - started_at

    print(f'PoolExecutor: {PoolExecutor}')
    print(f'time : {elapsed}')
    return output1

In [210]:
# Run the distance driver from the preceding cell on worker processes
# (about 269 s in the recorded run).
sdtw_distance = main(ProcessPoolExecutor)

PoolExecutor: <class 'concurrent.futures.process.ProcessPoolExecutor'>
time : 269.32499289512634


In [211]:
# Flatten the nested per-meter result into a single list of day records,
# keeping the original order.
sdtw_list = []
for meter_records in sdtw_distance:
    sdtw_list.extend(meter_records)

In [212]:
# One row per record dict collected in `sdtw_list`.
sdtw_df = pd.DataFrame(sdtw_list)

In [213]:
# Let pandas write the file itself.  This avoids holding the whole CSV as one
# in-memory string, and the handle is closed even if the write fails (the
# previous open()/write()/close() sequence left it open on error).
sdtw_df.to_csv('all_area_sdtw_barycenter.csv', index=False)

In [214]:
# Display the frame.  Indexing it with an empty-string column label raises
# KeyError, and the recorded output below is the whole frame.
sdtw_df

Unnamed: 0,MeterID,Day,FDI,meter_euclid,meter_dtw,meter_sdtw
0,1236,2009-07-15,True,2.290820,1.979150,-72.931267
1,1236,2009-07-16,True,2.187721,1.673913,-73.963475
2,1236,2009-07-17,True,2.468129,1.544353,-73.150540
3,1236,2009-07-18,True,2.768943,2.252936,-69.324980
4,1236,2009-07-19,True,2.310693,1.676262,-71.154314
...,...,...,...,...,...,...
2239245,6954,2010-12-27,False,3.616652,1.991596,-67.444601
2239246,6954,2010-12-28,False,4.751490,2.526374,-60.961150
2239247,6954,2010-12-29,False,4.350194,2.217898,-63.964463
2239248,6954,2010-12-30,False,2.242897,1.203493,-73.982068


In [173]:
# Ad-hoc inspection: the first entry of `all_meter` as a labelled frame with
# one row per record.
a = pd.DataFrame(all_meter[0].reshape(-1, 5), columns=["MeterID","Day","Time","Electricity","FDI"])

In [176]:
a["Electricity"].to_numpy().shape

(25440,)

In [None]:
def calc_stuff(x, y):
    """Return ``(euclidean, dtw, soft_dtw)`` for the two series ``x`` and ``y``.

    The Euclidean term is the norm of the element-wise difference; the other
    two come from tslearn's ``dtw`` and ``soft_dtw``.
    """
    return np.linalg.norm(x - y), dtw(x, y), soft_dtw(x, y)

def procedure(j, n_days=530, samples_per_day=48):
    """Score each day of one meter against the matching window of ``center``.

    ``center`` is a notebook-level sequence that is not defined in the cells
    visible here; it is sliced with the same day window as the meter's data.

    Parameters
    ----------
    j : numpy.ndarray
        Records of a single meter.  It is flattened to rows of 5 fields in the
        order MeterID, Day, Time, Electricity, FDI.
    n_days : int, default 530
        Number of consecutive day windows to score.
    samples_per_day : int, default 48
        Number of records per day window.

    Returns
    -------
    list of dict
        One dict per day with the keys ``MeterID``, ``Day``, ``FDI``,
        ``meter_euclid``, ``meter_dtw`` and ``meter_sdtw``.

    Notes
    -----
    The records are indexed positionally instead of through a temporary
    DataFrame, and everything that does not depend on the day is looked up
    once before the loop.  The values produced are the same as before.
    """
    records = j.reshape(-1, 5)

    # MeterID and FDI are taken from the first record only.
    meter_id = records[0, 0]
    meter_fdi = records[0, 4]
    day_column = records[:, 1]
    electricity_column = records[:, 3]

    df_list = []
    for i in range(n_days):
        first = i * samples_per_day
        last = first + samples_per_day
        x = electricity_column[first:last]
        y = center[first:last]

        meter_euclid, meter_dtw, meter_sdtw = calc_stuff(x, y)
        df_list.append({
            'MeterID': meter_id,
            # The day label is read from the first record of the window.
            'Day': day_column[first],
            'FDI': meter_fdi,
            "meter_euclid": meter_euclid,
            "meter_dtw": meter_dtw,
            "meter_sdtw": meter_sdtw,
        })
    return df_list

def main(PoolExecutor, area_num):
    """Map ``procedure`` over ``data_group[area_num]`` and time the run.

    ``PoolExecutor`` is an executor class (ProcessPoolExecutor or
    ThreadPoolExecutor) and is instantiated with its default arguments;
    ``area_num`` selects the entry of the notebook-level ``data_group``.
    The executor class and the elapsed wall-clock seconds are printed, and
    the per-entry results are returned in input order.
    """
    # Original note: data_group[0] has 422 entries.
    area_inputs = data_group[area_num]

    started_at = time()
    with PoolExecutor() as pool:
        # Executor.map yields results in input order; collect them all before
        # the pool is shut down.
        output1 = list(pool.map(procedure, area_inputs))
    elapsed = time() - started_at

    print(f'PoolExecutor: {PoolExecutor}')
    print(f'time : {elapsed}')
    return output1