In [1]:
!pip install tsfresh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tsfresh
  Downloading tsfresh-0.19.0-py2.py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 6.3 MB/s 
[?25hCollecting stumpy>=1.7.2
  Downloading stumpy-1.11.1-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 57.5 MB/s 
Collecting matrixprofile<2.0.0,>=1.1.10
  Downloading matrixprofile-1.1.10-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 51.7 MB/s 
[?25hCollecting statsmodels>=0.13
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 61.3 MB/s 
Collecting protobuf==3.11.2
  Downloading protobuf-3.11.2-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 61.3 MB/s 
Installing collected packages: protobuf, stumpy, statsmodels, matrixprofile, tsfr

In [2]:
from google.colab import drive
from psutil import virtual_memory

drive.mount('/content/gdrive/', force_remount=True)
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Mounted at /content/gdrive/
Your runtime has 54.8 gigabytes of available RAM



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from functools import reduce
import os
import itertools

# from tsfresh import extract_features, extract_relevant_features
# from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
# from tsfresh.utilities.dataframe_functions import impute

plt.style.use('seaborn')

In [4]:
os.chdir("gdrive/MyDrive/Dissertation/ukdale")

In [5]:
house_list = [1, 2, 5]
dfs = []
for house in house_list:
    data = pd.read_csv(f'house_{house}/channel_1.dat', sep = ' ',names = ['Date','aggregate'])
    data['Date'] = pd.to_datetime(data['Date'],unit = 's')
    data = data.set_index(['Date'])
    if house == 1:
        data = data['2013-04':'2014-04']
    date = data.index.map(lambda x:x.floor('Min'))
    data = data.reset_index()
    data['Date'] = date
    print(f'House {house} of shape: {data.shape}')
    dfs.append(data)

dfs[1].head()

House 1 of shape: (5467707, 2)
House 2 of shape: (2780373, 2)
House 5 of shape: (1763101, 2)


Unnamed: 0,Date,aggregate
0,2013-02-17 16:17:00,340
1,2013-02-17 16:17:00,341
2,2013-02-17 16:17:00,347
3,2013-02-17 16:17:00,350
4,2013-02-17 16:17:00,342


In [6]:
bins = []
shape = []
for house in house_list: 
    df = pd.read_csv(f'processed_data/house{house}_labeled.csv', parse_dates=['Date'])
    shape.append(df.shape[0])
    df = df.reset_index()
    df.rename(columns={'index': 'id'}, inplace=True)
    if house == 2:
        df['id'] = df['id'] + shape[0]
    elif house == 5:
        df['id'] = df['id'] + shape[0] + shape[1]
    print(f'House {house} of shape: {df.shape}')
    bins.append(df)

bins[1].head()

House 1 of shape: (568303, 9)
House 2 of shape: (172771, 9)
House 5 of shape: (190043, 9)


Unnamed: 0,id,Date,aggregate,fridge,washing_machine,dishwasher,1,2,3
0,568303,2013-05-20 21:28:00,1010,0,0,0,0,all_off,all_off
1,568304,2013-05-20 21:29:00,2499,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"
2,568305,2013-05-20 21:30:00,2234,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"
3,568306,2013-05-20 21:31:00,2528,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"
4,568307,2013-05-20 21:32:00,2503,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"


In [7]:
bins[1].head()

Unnamed: 0,id,Date,aggregate,fridge,washing_machine,dishwasher,1,2,3
0,568303,2013-05-20 21:28:00,1010,0,0,0,0,all_off,all_off
1,568304,2013-05-20 21:29:00,2499,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"
2,568305,2013-05-20 21:30:00,2234,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"
3,568306,2013-05-20 21:31:00,2528,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"
4,568307,2013-05-20 21:32:00,2503,1,1,0,1,"washing_machine, fridge","washing_machine, fridge"


In [8]:
houses = []
labels = []
idx = 0
for df, bin in zip(dfs,bins):
    h = pd.merge(df,bin[['Date','id']],on='Date', how='left')
    h = h.dropna()
    print(f'House {house_list[idx]} of shape: {h.shape}. Null Values = {h.isna().sum().sum()}')
    labels.append(bin[['1','2','3']])
    houses.append(h)
    idx += 1
    
houses[1].head()

House 1 of shape: (5467707, 3). Null Values = 0
House 2 of shape: (1675226, 3). Null Values = 0
House 5 of shape: (1763101, 3). Null Values = 0


Unnamed: 0,Date,aggregate,id
1105147,2013-05-20 21:28:00,252,568303.0
1105148,2013-05-20 21:28:00,254,568303.0
1105149,2013-05-20 21:28:00,252,568303.0
1105150,2013-05-20 21:28:00,252,568303.0
1105151,2013-05-20 21:29:00,250,568304.0


In [9]:
houses[2].tail()

Unnamed: 0,Date,aggregate,id
1763096,2014-11-13 17:51:00,606,931115
1763097,2014-11-13 17:52:00,608,931116
1763098,2014-11-13 17:52:00,619,931116
1763099,2014-11-13 17:52:00,612,931116
1763100,2014-11-13 17:52:00,616,931116


In [10]:
for idx, label in enumerate(labels):
    print(f'Length of house {house_list[idx]} labels : {label.shape[0]}')

Length of house 1 labels : 568303
Length of house 2 labels : 172771
Length of house 5 labels : 190043


In [11]:
for idx, house in enumerate(houses):
    length = len(np.unique(house['id']))
    print(f'No. of ids in house {house_list[idx]} : {length}')

No. of ids in house 1 : 568303
No. of ids in house 2 : 172771
No. of ids in house 5 : 190043


In [12]:
all_houses = pd.concat(houses)
all_houses = all_houses.reset_index(drop=True)
all_labels = pd.concat(labels)
all_labels = all_labels.reset_index(drop=True)

print(all_houses.shape)
print(all_labels.shape)

(8906034, 3)
(931117, 3)


In [13]:
all_houses.isna().sum()

Date         0
aggregate    0
id           0
dtype: int64

## Extracting Comprehensive Time Series Features

In [None]:
start = 0
end = 155187
batches = []
for i in range(0,6):
    batches.append(all_houses[(all_houses['id'] >= start) & (all_houses['id'] < end)])
    start += 155187
    end += 155187

sum([b.shape[0] for b in batches])

8906034

In [None]:
import time
features = []
for batch in batches:
    X = extract_features(batch, column_id="id", column_sort="Date", default_fc_parameters=ComprehensiveFCParameters(), impute_function=impute)
    print('Shape: {}'.format(X.shape))
    features.append(X)
    time.sleep(60)

Feature Extraction: 100%|██████████| 20/20 [32:57<00:00, 98.86s/it]


Shape: (155187, 789)


Feature Extraction: 100%|██████████| 20/20 [33:47<00:00, 101.37s/it]


Shape: (155187, 789)


Feature Extraction: 100%|██████████| 20/20 [32:56<00:00, 98.83s/it]


Shape: (155187, 789)


Feature Extraction: 100%|██████████| 20/20 [33:23<00:00, 100.15s/it]


Shape: (155187, 789)


Feature Extraction: 100%|██████████| 20/20 [34:08<00:00, 102.41s/it]


Shape: (155187, 789)


Feature Extraction: 100%|██████████| 20/20 [35:25<00:00, 106.28s/it]


Shape: (155182, 789)


In [None]:
for batch in features:
    print(batch.isna().sum().sum())

0
0
0
0
0
0


In [None]:
all_features = pd.concat(features)
print(all_features.shape)

(931117, 789)


In [16]:
all_data = pd.concat([all_features,all_labels], axis = 1)
print(all_data.shape)

(931117, 792)


In [17]:
all_data

Unnamed: 0,aggregate__variance_larger_than_standard_deviation,aggregate__has_duplicate_max,aggregate__has_duplicate_min,aggregate__has_duplicate,aggregate__sum_values,aggregate__abs_energy,aggregate__mean_abs_change,aggregate__mean_change,aggregate__mean_second_derivative_central,aggregate__median,...,"aggregate__matrix_profile__feature_""min""__threshold_0.98","aggregate__matrix_profile__feature_""max""__threshold_0.98","aggregate__matrix_profile__feature_""mean""__threshold_0.98","aggregate__matrix_profile__feature_""median""__threshold_0.98","aggregate__matrix_profile__feature_""25""__threshold_0.98","aggregate__matrix_profile__feature_""75""__threshold_0.98",aggregate__mean_n_absolute_max__number_of_maxima_7,1,2,3
0.0,0.0,1.0,1.0,1.0,1501.0,250335.0,0.125000,-0.125000,0.000000,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,167.000000,0,all_off,all_off
1.0,1.0,0.0,0.0,1.0,1674.0,280238.0,0.888889,0.000000,-0.062500,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,167.714286,0,all_off,all_off
2.0,0.0,1.0,0.0,1.0,1682.0,282916.0,0.888889,0.000000,-0.062500,168.0,...,0.0,0.0,0.0,0.0,0.0,0.0,168.428571,0,all_off,all_off
3.0,0.0,1.0,1.0,1.0,1677.0,281235.0,0.111111,-0.111111,0.000000,168.0,...,0.0,0.0,0.0,0.0,0.0,0.0,168.000000,0,all_off,all_off
4.0,1.0,0.0,0.0,1.0,1675.0,280573.0,1.111111,0.222222,0.000000,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,167.857143,0,all_off,all_off
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
931112.0,1.0,0.0,0.0,0.0,6957.0,6174227.0,150.857143,38.000000,-18.666667,922.5,...,0.0,0.0,0.0,0.0,0.0,0.0,903.000000,0,washing_machine,washing_machine
931113.0,1.0,0.0,0.0,1.0,6997.0,5099797.0,85.222222,-1.000000,-21.062500,631.0,...,0.0,0.0,0.0,0.0,0.0,0.0,733.142857,0,washing_machine,washing_machine
931114.0,1.0,0.0,0.0,1.0,6420.0,4167806.0,64.777778,0.333333,0.812500,619.0,...,0.0,0.0,0.0,0.0,0.0,0.0,658.142857,0,washing_machine,washing_machine
931115.0,1.0,0.0,0.0,1.0,6187.0,3829803.0,15.777778,-0.222222,0.000000,617.5,...,0.0,0.0,0.0,0.0,0.0,0.0,625.000000,0,washing_machine,washing_machine


In [18]:
all_data.to_csv('processed_data/125_tsfresh_labeled.csv')