In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import datetime
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.cluster import KMeans
from datetime import timedelta, date
%matplotlib inline

### Method to get all channel file paths from the house directory

In [2]:
def get_channel_files(house_path):
    """
    Get channel files from the house directory.

    A directory entry is kept when its name contains 'channel_', does not
    contain 'button', and is not exactly "channel_1.dat"
    (presumably the aggregate/mains channel -- TODO confirm).

    Input:

    house_path = Path to house folder/directory, with or without a trailing '/'

    Output:

    filepath_array = List of file paths (house_path + file name), sorted by
                     file name so the order is the same on every run

    """
    # endswith() is safe on an empty string, unlike indexing with [-1];
    # an empty path is handed to os.listdir() unchanged.
    if house_path and not house_path.endswith('/'):
        house_path = house_path + '/'
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    filepath_array = [
        house_path + name
        for name in sorted(os.listdir(house_path))
        if 'channel_' in name and 'button' not in name and name != "channel_1.dat"
    ]
    return filepath_array

### Method to read a channel file and add it to a dataframe

In [3]:
def read_channel_file(filepath):
    """
    This method reads channel file (.dat) using file path and returns a dataframe.

    The file is expected to hold two whitespace-separated columns and NO header
    row, so every line (including the first) is loaded as data.

    Input:

    filepath = Path of the input channel (.dat) file

    Output:

    channel_df = Channel dataframe with columns 'Timestamp' and 'Reading'.
                 'Timestamp' goes through pandas' date parsing (parse_dates);
                 values pandas cannot parse as dates are left unconverted.

    """
    channel_df = pd.read_csv(
        filepath,
        sep=r'\s+',
        names=['Timestamp', 'Reading'],
        parse_dates=['Timestamp'],
        # header=None: with explicit `names`, header=0 would treat the first
        # line as a header to be replaced and silently drop the first sample.
        header=None,
    )
    return channel_df

### Method to resample the channel usage at a given time interval

In [4]:
def resampling(input_df, time):
    """
    Resample a channel usage dataframe onto a regular time grid.

    The 'Timestamp' column becomes the index and is converted to datetimes
    with unit "s"; readings are averaged per interval and any missing value
    in the result is replaced by 0.

    Input:

    input_df = Channel usage dataframe (columns 'Timestamp' and 'Reading')
    time = time interval for resampling (passed to DataFrame.resample)

    Output:

    final_data = Resampled dataframe with 'Timestamp' back as a column

    """
    # set_index returns a new frame, so the caller's dataframe is not modified.
    indexed = input_df.set_index('Timestamp')
    indexed.index = pd.to_datetime(indexed.index, unit="s")
    return indexed.resample(time).mean().reset_index().fillna(0)

In [5]:
def apply_kmeans(column, random_state=0):
    """
    This method takes channel readings column as input and applies K-Means clustering algorithm
    with 2 clusters - On/Off.

    Input:
    column = 1-d array of readings
    random_state = seed for the K-Means initialisation; fixed by default so
                   repeated runs of the notebook give the same clustering

    Output:
    x = the readings as a 1-d numpy array (not reshaped)
    km = fitted kmeans object

    """
    x = np.array(column)
    # n_init is set explicitly so the number of restarts does not depend on
    # the installed scikit-learn version's default.
    km = KMeans(n_clusters=2, n_init=10, random_state=random_state)
    # KMeans expects a 2-d (n_samples, n_features) input; x itself is returned 1-d.
    km.fit(x.reshape(-1, 1))
    return x, km

In [6]:
def get_clusters(x, km, timeindex):
    """
    This method returns clusters resulted from the K-Means algorithm.

    The cluster whose readings have the lower mean is treated as Off and the
    other one as On. If only one label occurs (e.g. all readings identical),
    every instance is reported as Off and the On cluster is empty.

    Input:
    x = Readings array
    km = K-Means algo object (only km.labels_ is read)
    timeindex = sequence of timestamps, positionally aligned with x

    Output:
    cluster_1 = Readings that belong to the Off cluster
    cluster_2 = Readings that belong to the On cluster
    times = List with one entry per instance: '0' when the appliance is Off,
            str(timestamp) when it is On

    """
    labels = list(km.labels_)
    stamps = list(timeindex)

    # Mirror the original split: label 0 vs. every other label.
    group_0 = [x[i] for i, label in enumerate(labels) if label == 0]
    group_1 = [x[i] for i, label in enumerate(labels) if label != 0]

    # Degenerate clustering: a single populated cluster cannot be split into
    # On/Off, so report everything as Off instead of raising IndexError.
    if not group_0 or not group_1:
        return group_0 or group_1, [], ['0'] * len(labels)

    mean_0 = float(sum(group_0)) / len(group_0)
    mean_1 = float(sum(group_1)) / len(group_1)

    # Ties resolve to "label 0 is Off" so a result is always returned.
    if mean_0 <= mean_1:
        times = [str(stamp) if label != 0 else '0'
                 for label, stamp in zip(labels, stamps)]
        return group_0, group_1, times

    times = [str(stamp) if label == 0 else '0'
             for label, stamp in zip(labels, stamps)]
    return group_1, group_0, times

### Method to resample and generate channel's on/off status data

In [7]:
def get_channel_on_off_data(filepath_list, output_files_location, resampling_time=None):
    """
    This method iterates over each channel in the house,
    resamples the input channel usage data,
    categorizes each instance of resampled data into On/Off states,
    creates an array from it and saves it into a .npy file

    Input:

    filepath_list = List of paths of the channel (.dat) files from any house.
    output_files_location = Location of output .npy files; created if missing.
    resampling_time = String denoting the time interval for resampling (e.g. "30min").
                      When omitted, the notebook-level variable
                      `resampling_time_in_min` is used, as before.

    Returns:
    Creates one .npy file per channel on the specified path, named after the
    channel file (text before the first '.') with a '.npy' extension.
    returns None

    """
    if resampling_time is None:
        # Backward compatibility with callers that rely on the global setting.
        resampling_time = resampling_time_in_min

    # np.save does not create missing directories.
    if output_files_location:
        os.makedirs(output_files_location, exist_ok=True)

    for channel_path in filepath_list:
        channel_df = read_channel_file(channel_path)
        # resampling() already replaces missing values with 0.
        resampled_data = resampling(channel_df, resampling_time)
        x, km = apply_kmeans(resampled_data['Reading'])
        _, _, times = get_clusters(x, km, resampled_data.Timestamp)
        filename = os.path.basename(channel_path).split('.')[0] + '.npy'
        # os.path.join works whether or not the location ends with a separator.
        np.save(os.path.join(output_files_location, filename), times)

### Declare file paths and resampling time

In [8]:
# Notebook configuration: which house to process, where its raw channel
# files live, where the On/Off arrays are written, and the resampling interval.
house = 2

print("House : {}".format(house))

path_to_house = "../../../../Dataset/ukdale/House_{}/".format(house)
output_file_path = "./Channel_On_Off_data/House_{}/".format(house)
resampling_time_in_min = "30min"

# Echo the effective settings so they are recorded with the run.
print("path_to_house : {}".format(path_to_house))
print("output_file_path : {}".format(output_file_path))
print("resampling_time_in_min : {}".format(resampling_time_in_min))

House : 2
path_to_house : ../../../../Dataset/ukdale/House_2/
output_file_path : ./Channel_On_Off_data/House_2/
resampling_time_in_min : 30min


### Get the list of channel files for a house

In [9]:
# Collect the channel file paths found under the configured house directory.
filepath_list = get_channel_files(path_to_house)

### Resample the data, then save each channel's On/Off data into .npy files

In [10]:
# Process every channel in filepath_list; one .npy file per channel is saved under output_file_path.
get_channel_on_off_data(filepath_list, output_file_path)