In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime
from datetime import timedelta, date
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

ModuleNotFoundError: No module named 'xgboost'

In [None]:
def get_channel_files(house_path):
    """
    Collect the paths of the per-appliance channel files of one house.

    Input:

    house_path = Path to the house folder/directory (a trailing '/' is optional)

    Output:

    filepath_array = List of file paths for every directory entry whose name
                     contains 'channel_', except "channel_1.dat". The order is
                     whatever os.listdir returns.

    """
    # Normalise so that plain concatenation below yields valid paths.
    base_dir = house_path if house_path[-1] == '/' else house_path + '/'
    return [
        base_dir + entry
        for entry in os.listdir(base_dir)
        if 'channel_' in entry and entry != "channel_1.dat"
    ]

def read_channel_file(filepath):
    """
    Load one whitespace-separated channel (.dat) file into a dataframe.

    Input:

    filepath = Path of the input channel (.dat) file

    Output:

    channel_df = Dataframe with the columns 'Timestamp' and 'Reading'

    """
    # NOTE(review): header=0 together with explicit names makes pandas consume the
    # first line of the file as a header row — confirm the .dat files really start
    # with a header, otherwise the first reading is dropped.
    column_names = ['Timestamp', 'Reading']
    return pd.read_csv(
        filepath,
        sep='\\s+',
        names=column_names,
        parse_dates=['Timestamp'],
        header=0,
    )

def resampling(input_df, time):
    """
    Average a usage dataframe over fixed time buckets.

    Input:

    input_df = Dataframe with a 'Timestamp' column (converted with unit "s")
    time = pandas resampling rule, e.g. "2min"

    Output:

    final_data = Dataframe with 'Timestamp' back as a column, one row per bucket
                 holding the bucket mean; buckets without data are filled with 0

    """
    indexed = input_df.set_index('Timestamp')
    indexed.index = pd.to_datetime(indexed.index, unit="s")
    bucket_means = indexed.resample(time).mean()
    return bucket_means.reset_index().fillna(0)

def apply_kmeans(column, random_state=0):
    """
    Fit a 2-cluster K-Means model (On/Off) on a column of channel readings.

    Input:
    column = 1-d array-like of readings
    random_state = Seed passed to KMeans so repeated runs produce the same
                   clustering (default 0)

    Output:
    x = the readings as a 1-d numpy array (NOT reshaped; only the copy passed
        to KMeans is reshaped to a single feature column)
    km = fitted KMeans object

    """
    x = np.array(column)
    # n_init is given explicitly so the number of restarts does not depend on
    # the installed scikit-learn version's default.
    km = KMeans(n_clusters=2, n_init=10, random_state=random_state)
    km.fit(x.reshape(-1, 1))
    return x, km

def get_clusters(x, km, timeindex):
    """
    Split readings into an Off and an On cluster using fitted K-Means labels.

    The cluster with the larger mean reading is treated as On. If every sample
    received the same label (e.g. a constant signal), On and Off cannot be told
    apart, so all samples are reported as Off instead of failing.

    Input:
    x = Readings array (indexable by position)
    km = Fitted K-Means object; only km.labels_ is used
    timeindex = Timestamps, indexable with the same positions as x

    Output:
    cluster_1 = Readings of the Off cluster
    cluster_2 = Readings of the On cluster
    times = One entry per sample: str(timestamp) when the device is On, '0' when Off

    """
    labels = km.labels_
    positions = range(len(labels))

    label_zero = [x[i] for i in positions if labels[i] == 0]
    label_other = [x[i] for i in positions if labels[i] != 0]

    # Degenerate clustering: a single populated cluster carries no On/Off signal.
    if not label_zero or not label_other:
        return label_zero + label_other, [], ['0'] * len(labels)

    # Decide by cluster mean rather than by the first element of each cluster.
    zero_is_on = (sum(label_zero) / len(label_zero)) > (sum(label_other) / len(label_other))

    times = [
        str(timeindex[i]) if (labels[i] == 0) == zero_is_on else '0'
        for i in positions
    ]
    if zero_is_on:
        return label_other, label_zero, times
    return label_zero, label_other, times
    

def get_counts(channel_id, status_dir=None):
    """
    Print how many samples of a saved channel status array are On and Off.

    Input:
    channel_id = Channel name without extension, e.g. "channel_2"
    status_dir = Directory holding the "<channel_id>.npy" files. When omitted,
                 the notebook-level `channel_status_data_dir` is used.

    Output:
    Prints the On count followed by the Off count; returns nothing.

    """
    if status_dir is None:
        status_dir = channel_status_data_dir
    channel_data = np.load(os.path.join(status_dir, str(channel_id) + ".npy"))

    # Status arrays may hold 0/1 integers or '0'/timestamp strings; handle both.
    if channel_data.dtype.kind in ('U', 'S'):
        off_mask = channel_data.astype(str) == '0'
    else:
        off_mask = channel_data == 0
    off_count = int(np.count_nonzero(off_mask))

    print(len(channel_data) - off_count)
    print(off_count)

    
def get_channel_on_off_data(filepath_list, output_files_location, resampling_time=None):
    """
    For every channel file: resample the usage data, cluster the resampled
    readings into On/Off with K-Means and collect the On/Off sequence.

    Input:

    filepath_list = List of paths of the channel (.dat) files from any house.
    output_files_location = Unused; kept so existing calls keep working.
    resampling_time = Resampling rule (e.g. "2min"). When omitted, the
                      notebook-level `resampling_time_in_min` is used.

    Returns:
    Dict mapping channel name (file name without extension) to a list with one
    entry per resampled sample: str(timestamp) when the appliance is On,
    '0' when it is Off. Nothing is written to disk here.

    """
    if resampling_time is None:
        # Fall back to the global the original implementation read implicitly.
        resampling_time = resampling_time_in_min

    channel_status_dict = dict()
    for file in filepath_list:
        file_name = file.replace("\\", "/").split("/")[-1]
        # Test only the file name so a directory containing 'button' in its
        # path does not exclude every channel.
        if 'button' in file_name:
            continue
        # resampling() already replaces empty buckets with 0.
        resampled_data = resampling(read_channel_file(file), resampling_time)
        x, km = apply_kmeans(resampled_data['Reading'])
        _, _, times = get_clusters(x, km, resampled_data.Timestamp)
        channel_status_dict[os.path.splitext(file_name)[0]] = times
    return channel_status_dict

def get_labels(filepath):
    """
    Read the label file and build the channel -> appliance mappings.

    The channel ids are taken from the first column of the file (prefixed with
    "channel_") instead of being assumed to be 1..n in file order, so files
    with gaps or a different ordering are mapped correctly.

    Input:

    filepath = Label data file path (whitespace separated: id, appliance).

    Output:

    labels_df = Dataframe with the columns 'Channel_id' ("channel_<id>") and 'Appliance'
    labels_dict = Dict mapping 'Channel_id' to 'Appliance'

    """
    labels_df = pd.read_csv(filepath, sep='\\s+', names=['Channel_id', 'Appliance'])
    labels_df["Channel_id"] = "channel_" + labels_df["Channel_id"].astype(str)
    labels_dict = dict(zip(labels_df["Channel_id"], labels_df["Appliance"]))
    return labels_df, labels_dict

def fit_channel_to_mains_timeframe(channel_data, datetime_range):
    """
    Align one channel's On timestamps with the mains time range.

    Input:

    channel_data = Iterable of values marking when the channel is On
                   (timestamp strings; other entries such as '0' are allowed)
    datetime_range = Iterable of timestamps to evaluate, in output order

    Output:

    channel_statuses = List with one entry per element of datetime_range:
                       1 if that element occurs in channel_data, else 0

    """
    # A set turns each membership test into O(1); with a list the lookup was
    # linear, making the whole pass quadratic in the number of samples.
    try:
        on_lookup = set(channel_data)
    except TypeError:
        # Unhashable entries: fall back to the original container semantics.
        on_lookup = channel_data
    return [1 if stamp in on_lookup else 0 for stamp in datetime_range]

def read_mains_data(mains_filepath, house):
    """
    Load the whitespace-separated mains file of a house.

    Input:

    mains_filepath = Mains data file path.
    house = House number; houses 3 and 4 are read with a single reading
            column, every other house with three reading columns.

    Output:

    Mains dataframe with 'Timestamp' plus 'Reading_1' (houses 3/4) or
    'Reading_1'..'Reading_3' (other houses)

    """
    print("Reading Mains Data")
    column_names = ['Timestamp', 'Reading_1']
    if not (house == 3 or house == 4):
        column_names = column_names + ['Reading_2', 'Reading_3']
    # NOTE(review): header=0 with explicit names discards the first line of the
    # file as a header row — confirm the mains files actually have one.
    return pd.read_csv(
        mains_filepath,
        sep='\\s+',
        names=column_names,
        parse_dates=['Timestamp'],
        header=0,
    )


def get_appliances_from_channels(list_of_channels, labels_df):
    """
    Translate channel ids into appliance names.

    Input:
    list_of_channels = List of channel ids to look up.
    labels_df = Dataframe with the columns 'Channel_id' and 'Appliance'.

    Output:
    List of appliance names, one per entry of list_of_channels, in the same
    order. An unknown channel id raises KeyError.
    """
    # Build the indexed view once instead of once per channel.
    by_channel = labels_df.set_index('Channel_id')
    return [by_channel.at[channel, 'Appliance'] for channel in list_of_channels]

def get_channel_list(house_path):
    """
    List the appliance channel names found in a house directory.

    Input:
    house_path = path to the house directory (a trailing '/' is optional).

    Output:
    channel_list = names (last four characters, i.e. the extension, removed) of
                   every entry containing 'channel_' but not 'button', except
                   "channel_1.dat". Order follows os.listdir.
    """
    directory = house_path if house_path[-1] == '/' else house_path + '/'

    def _is_appliance_channel(entry):
        return 'channel_' in entry and 'button' not in entry and entry != "channel_1.dat"

    return [entry[:-4] for entry in os.listdir(directory) if _is_appliance_channel(entry)]

def apply_day_type(df):
    """
    Add a 'Weekend' column to df in place, computed by applying add_day_type
    to every value of the 'Timestamp' column.

    NOTE(review): add_day_type is not defined in the visible notebook — confirm
    where it comes from before relying on this helper.
    """
    df['Weekend'] = df['Timestamp'].apply(add_day_type)
    
def convert_decimal(binary_str):
    """
    Interpret a string of binary digits (e.g. "00110") as a base-2 number and
    return the corresponding integer.
    """
    return int(binary_str, base=2)

def weekday_status(time):
    """
    Tell whether a timestamp falls on a weekday.

    Input:
    time = Timestamp exposing a `dayofweek` attribute

    Output:
    0 when dayofweek is 5 or 6 (weekend), otherwise 1
    """
    return 0 if time.dayofweek in (5, 6) else 1

def get_hour(time):
    """
    Extract the hour component of a datetime.

    Input:
    time = Datetime-like object with an `hour` attribute

    Output:
    The value of time.hour
    """
    hour_of_day = time.hour
    return hour_of_day

def get_min(time):
    """
    Extract the minute component of a datetime.

    Input:
    time = Datetime-like object with a `minute` attribute

    Output:
    The value of time.minute
    """
    minute_of_hour = time.minute
    return minute_of_hour

def get_sec(time):
    """
    Extract the second component of a datetime.

    Input:
    time = Datetime-like object with a `second` attribute

    Output:
    The value of time.second
    """
    second_of_minute = time.second
    return second_of_minute

def get_binary(target, length):
    """
    Expand an integer into its list of binary digits, zero-padded on the left.

    Input:

    target: Integer value to expand
    length: Minimum number of digits; values needing more digits are NOT
            truncated, so the result can be longer than `length`

    Output:
    List of 0/1 integers, most significant bit first
    """
    bit_string = format(target, '0' + str(length) + 'b')
    return [int(bit) for bit in bit_string]


def get_dates_list(min_date, max_date):
    """
    List every calendar day between two dates as a string.

    Input:
    min_date = Start date
    max_date = End date

    Output:
    List of "YYYY-MM-DD" strings for each date yielded by
    daterange(min_date, max_date)
    """
    return [day.strftime("%Y-%m-%d") for day in daterange(min_date, max_date)]

def daterange(date1, date2):
    """
    Generate the consecutive days from date1 to date2, both ends included.

    Input:
    date1 = Start date
    date2 = End date

    Output:
    Generator yielding date1, date1 + 1 day, ... up to date2. Yields nothing
    when date2 is earlier than date1.
    """
    total_days = int((date2 - date1).days)
    offset = 0
    while offset <= total_days:
        yield date1 + timedelta(days=offset)
        offset += 1
        
def divide_data_into_time(times, apriori_data):
    """
    Group the appliance sequences by the time slice they contain.

    Input:
    times = List of time slices.
    apriori_data = Collection of sequences; a sequence belongs to a time slice
                   when `slice in sequence` is true.

    Output:
    Dict mapping each time slice to the list of sequences containing it
    (an empty list when none does).
    """
    return {
        time_slice: [sequence for sequence in apriori_data if time_slice in sequence]
        for time_slice in times
    }

def get_support_and_itemsets(apriori_data, minimum_support):
    """
    One-hot encode the transactions and mine frequent itemsets with apriori.

    Input:
    apriori_data = List-of-lists of items (appliances On at a particular time for each day).
    minimum_support = Minimum support passed to apriori as min_support.

    Output:
    Whatever apriori returns for the encoded data with use_colnames=True
    (documented above as a dataframe of supports and frequent itemsets).
    """
    # NOTE(review): TransactionEncoder and apriori are not imported in the visible
    # import cell — presumably they come from mlxtend; confirm the import exists.
    encoder = TransactionEncoder()
    encoder.fit(apriori_data)
    encoded = pd.DataFrame(encoder.transform(apriori_data), columns=encoder.columns_)
    return apriori(encoded, min_support=minimum_support, use_colnames=True)

def get_channels_from_frequent_itemsets(frequent_itemsets_df):
    """
    Collect every distinct channel that appears in any frequent itemset.
    Used to check appliance coverage.

    Input:
    frequent_itemsets_df = Object whose `itemsets` attribute iterates over itemsets.

    Output:
    Set with the union of all itemset members
    """
    channels = set()
    for itemset in frequent_itemsets_df.itemsets:
        channels.update(itemset)
    return channels

# NOTE: this re-defines get_appliances_from_channels, which is already defined
# earlier in this notebook; being later, this definition is the one in effect.
def get_appliances_from_channels(list_of_channels, labels_df):
    """
    Translate channel ids into appliance names.

    Input:
    list_of_channels = List of channel ids to look up.
    labels_df = Dataframe with the columns 'Channel_id' and 'Appliance'.

    Output:
    List of appliance names, one per entry of list_of_channels, in the same
    order. An unknown channel id raises KeyError.
    """
    # Index the labels once; the lookup table does not change inside the loop.
    appliance_lookup = labels_df.set_index('Channel_id')
    result = []
    for channel in list_of_channels:
        result.append(appliance_lookup.at[channel, 'Appliance'])
    return result

def get_channels_from_rules(rule_df):
    """
    Collect the distinct channels found in the 'consequents' of a rules table,
    in order of first appearance. Used to check appliance coverage.

    Each consequent may be a real collection (frozenset/set/list/tuple), whose
    members are read directly, or its string form such as
    "frozenset({'channel_2', 'channel_3'})" (e.g. after a CSV round trip),
    which is parsed textually. Empty consequents contribute nothing.

    Input:
    rule_df = Mapping/dataframe whose 'consequents' entry iterates over consequents.

    Output:
    List of unique channel names
    """
    channel_list = []
    seen = set()
    for sequence in rule_df['consequents']:
        if isinstance(sequence, (frozenset, set, list, tuple)):
            members = [str(member) for member in sequence]
        else:
            # Strip the leading "frozenset({'" and the trailing "'})".
            members = str(sequence)[12:-3].replace("'", '').split(", ")
        for member in members:
            # Skip the empty token produced by an empty consequent.
            if member and member not in seen:
                seen.add(member)
                channel_list.append(member)
    return channel_list

def get_missing_channel_list(rule_df, full_channel_set):
    """
    Return the channels that never occur in the 'consequents' of a rules table.
    Used to check appliance coverage.

    Each consequent may be a real collection (frozenset/set/list/tuple) or its
    string form such as "frozenset({'channel_2', 'channel_3'})"; both are
    accepted, matching get_channels_from_rules.

    Input:
    rule_df = Mapping/dataframe whose 'consequents' entry iterates over consequents.
    full_channel_set = Set of all channels in the house

    Output:
    Set of channels from full_channel_set not present in any consequent
    """
    channel_set = set()
    for sequence in rule_df['consequents']:
        if isinstance(sequence, (frozenset, set, list, tuple)):
            # Real collections cannot be sliced like their string form.
            channel_set.update(str(member) for member in sequence)
        else:
            # Strip the leading "frozenset({'" and the trailing "'})".
            channel_set.update(str(sequence)[12:-3].replace("'", '').split(", "))
    return full_channel_set - channel_set

def change_names(recs, show_name_dict):
    """
    Map every entry of recs through show_name_dict and return the results as a
    list in the same order. An entry missing from the dict raises KeyError.
    """
    return [show_name_dict[label] for label in recs]

In [None]:
def get_display_names_dict(labels_to_name_file):
    """
    Read a two-column CSV (first line treated as a header) and return a dict
    mapping the first column ("Labels") to the second column ("Name").
    """
    display_names_df = pd.read_csv(labels_to_name_file, names=["Labels", "Name"], header=0)
    return {row.Labels: row.Name for _, row in display_names_df.iterrows()}

In [2]:
# House 2 Labels

# show_name_dict = dict()
# show_name_dict['laptop'] = "Laptop"
# show_name_dict['monitor'] = "Monitor"
# show_name_dict['speakers'] = "Speakers"
# show_name_dict['server'] = "Server"
# show_name_dict['router'] = "Router"
# show_name_dict['server_hdd'] = "Server_hdd"
# show_name_dict['kettle'] = "Kettle"
# show_name_dict['rice_cooker'] = "Rice Cooker"
# show_name_dict['running_machine'] = "Running Machine"
# show_name_dict['laptop2'] = "Laptop2"
# show_name_dict['washing_machine'] = "Washing Machine"
# show_name_dict['dish_washer'] = "Dish Washer"
# show_name_dict['fridge'] = "Fridge"
# show_name_dict['microwave'] = "Microwave"
# show_name_dict['toaster'] = "Toaster"
# show_name_dict['playstation'] = "Playstation"
# show_name_dict['modem'] = "Modem"
# show_name_dict['cooker'] = "Cooker"

In [214]:
# --- Run configuration for the house being processed ---
house = 2

print("House : " + str(house))

# Root folder of the UK-DALE data. Set the UKDALE_DIR environment variable to
# point elsewhere; the fallback is the location this notebook was written for.
ukdale_root = os.environ.get("UKDALE_DIR", "C:/Users/pagara/Documents/Assignment/ukdale")

path_to_house = ukdale_root.rstrip("/") + "/house_" + str(house) + "/"
labels_file = path_to_house + "labels.dat"
channel_status_data_dir = "Channel_status_data/House_" + str(house)
# Houses 3 and 4 take their mains signal from channel_1.dat, the others from mains.dat.
if house == 3 or house == 4:
    mains_channel_filepath = path_to_house + "channel_1.dat"
else:
    mains_channel_filepath = path_to_house + "mains.dat"
resampling_time_in_min = "2min"
output_file_path = "house" + str(house) + "/"
channel_list = get_channel_list(path_to_house)
intermediate_file_path = "house" + str(house) + "_intermediate/"
output_dir = "XGBoost/house" + str(house) + "_final/"
labels_to_name_file = "../Labels_to_name_files/House_" + str(house) + ".csv"

# Create every output location up front; exist_ok makes the cell safe to re-run.
for directory in (output_file_path, channel_status_data_dir, intermediate_file_path, output_dir):
    os.makedirs(directory, exist_ok=True)

print("path_to_house : " + path_to_house)
print("labels_file : " + labels_file)
print("channel_status_data_dir : " + channel_status_data_dir)
print("mains_channel_filepath : " + mains_channel_filepath)
print("resampling_time_in_min : " + resampling_time_in_min)
print("Intermediate files directory : " + intermediate_file_path)

House : 2
path_to_house : C:/Users/pagara/Documents/Assignment/ukdale/house_2/
labels_file : C:/Users/pagara/Documents/Assignment/ukdale/house_2/labels.dat
channel_status_data_dir : Channel_status_data/House_2
mains_channel_filepath : C:/Users/pagara/Documents/Assignment/ukdale/house_2/mains.dat
resampling_time_in_min : 2min
Intermediate files directory : house2_intermediate/


In [None]:
show_name_dict = get_display_names_dict(labels_to_name_file)

In [215]:
channel_list

['channel_10',
 'channel_11',
 'channel_12',
 'channel_13',
 'channel_14',
 'channel_15',
 'channel_16',
 'channel_17',
 'channel_18',
 'channel_19',
 'channel_2',
 'channel_3',
 'channel_4',
 'channel_5',
 'channel_6',
 'channel_7',
 'channel_8',
 'channel_9']

In [None]:
# Channel id -> appliance name lookups.
labels_df, labels_map = get_labels(labels_file)

# Mains signal: read, resample, then index by timestamp.
mains_data = read_mains_data(mains_channel_filepath, house)
resampled_mains_data = resampling(mains_data, resampling_time_in_min).set_index('Timestamp')

print("Done resampling for mains file")

# Per-channel On/Off sequences.
filepath_list = get_channel_files(path_to_house)
channel_status_dict = get_channel_on_off_data(filepath_list, output_file_path)

print("Done processing channel files")

# Express every channel as a 0/1 list over the mains timestamps.
updated_channel_status_dict = dict()
datetime_range = resampled_mains_data.index.astype('str')
for channel, on_off_times in channel_status_dict.items():
    updated_channel_status_dict[channel] = fit_channel_to_mains_timeframe(on_off_times, datetime_range)
    print("Done for channel : " + channel)

# Persist one .npy status array per channel.
for key, statuses in updated_channel_status_dict.items():
    filename = key + '.npy'
    filepath = channel_status_data_dir + "/" + filename
    np.save(filepath, statuses)

print("House " + str(house) + " finished")

Reading Mains Data
Done resampling for mains file
Done processing channel files
Done for channel : channel_10
Done for channel : channel_11
Done for channel : channel_12
Done for channel : channel_13
Done for channel : channel_14
Done for channel : channel_15
Done for channel : channel_16
Done for channel : channel_17
Done for channel : channel_18
Done for channel : channel_19
Done for channel : channel_2
Done for channel : channel_3
Done for channel : channel_4
Done for channel : channel_5


In [None]:
labels_map

### Resampled Mains data dataframe

In [153]:
resampled_mains_data.head()

Unnamed: 0_level_0,Reading_1
Timestamp,Unnamed: 1_level_1
2013-03-09 14:40:00,628.5
2013-03-09 14:42:00,2684.8
2013-03-09 14:44:00,822.684211
2013-03-09 14:46:00,576.055556
2013-03-09 14:48:00,2823.157895


### Create a dataframe with all appliance status data and datetimerange as index

In [154]:
all_channel_status_df = pd.DataFrame(updated_channel_status_dict, index = resampled_mains_data.index)

In [155]:
resampled_mains_data.to_csv(intermediate_file_path+"resampled_mains_data.csv")
all_channel_status_df.to_csv(intermediate_file_path+"all_channel_status_df.csv")

In [156]:
resampled_mains_data=pd.read_csv(intermediate_file_path+"resampled_mains_data.csv", sep=',',parse_dates=['Timestamp'] ,header=0, index_col="Timestamp")
all_channel_status_df=pd.read_csv(intermediate_file_path+"all_channel_status_df.csv", sep=',',parse_dates=['Timestamp'], header=0, index_col="Timestamp")

In [157]:
resampled_mains_data

Unnamed: 0_level_0,Reading_1
Timestamp,Unnamed: 1_level_1
2013-03-09 14:40:00,628.500000
2013-03-09 14:42:00,2684.800000
2013-03-09 14:44:00,822.684211
2013-03-09 14:46:00,576.055556
2013-03-09 14:48:00,2823.157895
...,...
2013-10-01 05:06:00,242.950000
2013-10-01 05:08:00,268.368421
2013-10-01 05:10:00,267.750000
2013-10-01 05:12:00,266.947368


In [158]:
all_channel_status_df

Unnamed: 0_level_0,channel_2,channel_3,channel_4,channel_5,channel_6
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-03-09 14:40:00,0,0,1,1,0
2013-03-09 14:42:00,0,0,1,1,0
2013-03-09 14:44:00,0,0,1,1,0
2013-03-09 14:46:00,0,0,1,1,0
2013-03-09 14:48:00,0,0,1,1,0
...,...,...,...,...,...
2013-10-01 05:06:00,0,0,0,1,0
2013-10-01 05:08:00,0,0,0,1,0
2013-10-01 05:10:00,0,0,0,1,0
2013-10-01 05:12:00,0,0,0,1,0


## Generate Target Value column using appliance on/off data

In [159]:
# Encode the On/Off state of all channels as one integer class label.
# Only the channel columns take part, so re-running this cell does not fold the
# previously created 'Binary'/'Target' columns into the bit string.
status_columns = [col for col in all_channel_status_df.columns if col not in ("Binary", "Target")]
all_channel_status_df = all_channel_status_df[status_columns].astype(str)
# Summing an array of strings row-wise concatenates them, e.g. "0","0","1","1","0" -> "00110".
all_channel_status_df['Binary'] = all_channel_status_df[status_columns].values.sum(axis=1)
all_channel_status_df["Target"] = all_channel_status_df["Binary"].map(convert_decimal)

In [160]:
all_channel_status_df

Unnamed: 0_level_0,channel_2,channel_3,channel_4,channel_5,channel_6,Binary,Target
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-03-09 14:40:00,0,0,1,1,0,00110,6
2013-03-09 14:42:00,0,0,1,1,0,00110,6
2013-03-09 14:44:00,0,0,1,1,0,00110,6
2013-03-09 14:46:00,0,0,1,1,0,00110,6
2013-03-09 14:48:00,0,0,1,1,0,00110,6
...,...,...,...,...,...,...,...
2013-10-01 05:06:00,0,0,0,1,0,00010,2
2013-10-01 05:08:00,0,0,0,1,0,00010,2
2013-10-01 05:10:00,0,0,0,1,0,00010,2
2013-10-01 05:12:00,0,0,0,1,0,00010,2


## Generate weekday, hour, minute, second data from mains power data

In [161]:
# Pick the mains column used as the power feature: Reading_1 for houses 3 and 4,
# Reading_2 for every other house.
mains_column = "Reading_1" if house in (3, 4) else "Reading_2"
power_df = pd.DataFrame(resampled_mains_data[mains_column])
power_df = power_df.rename(columns={mains_column: "Power"})

# Calendar/time features derived from the timestamp index.
sample_times = list(power_df.index)
power_df["Weekday"] = [weekday_status(stamp) for stamp in sample_times]
power_df["Hour"] = [int(get_hour(stamp)) for stamp in sample_times]
power_df["Minute"] = [int(get_min(stamp)) for stamp in sample_times]
power_df["Second"] = [int(get_sec(stamp)) for stamp in sample_times]

In [162]:
power_df

Unnamed: 0_level_0,Power,Weekday,Hour,Minute,Second
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-03-09 14:40:00,628.500000,0,14,40,0
2013-03-09 14:42:00,2684.800000,0,14,42,0
2013-03-09 14:44:00,822.684211,0,14,44,0
2013-03-09 14:46:00,576.055556,0,14,46,0
2013-03-09 14:48:00,2823.157895,0,14,48,0
...,...,...,...,...,...
2013-10-01 05:06:00,242.950000,1,5,6,0
2013-10-01 05:08:00,268.368421,1,5,8,0
2013-10-01 05:10:00,267.750000,1,5,10,0
2013-10-01 05:12:00,266.947368,1,5,12,0


## Split the dataset 50/50 into train and test data

In [163]:
X_train, X_test, y_train, y_test = train_test_split(power_df,all_channel_status_df["Target"], test_size=0.5, random_state=None, shuffle=False)

In [164]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(74019, 5)
(74019, 5)
(74019,)
(74019,)


## Instantiate classifier and train the model 

In [165]:
model = XGBClassifier()

In [166]:
#Training on 50% data
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

## Prediction

In [167]:
# Predict on the held-out test half
y_pred = model.predict(X_test)

## Disaggregate predictions into appliance-level data (decimal to binary)

In [201]:
# Decode each predicted class label back into one 0/1 flag per channel.
n_channels = len(channel_list)
pred_df = pd.DataFrame({"Target": y_pred}, index=y_test.index)
pred_df["Appliance_Binary_data"] = pred_df["Target"].map(lambda label: get_binary(label, n_channels))
predicted_df = pd.DataFrame(list(pred_df["Appliance_Binary_data"]), columns=channel_list, index=pred_df.index)
# Columns are left as channel ids here (not renamed through labels_map).

In [202]:
predicted_df

Unnamed: 0_level_0,tv_dvd_digibox_lamp,kettle_radio,gas_boiler,freezer,washing_machine_microwave_breadmaker
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-06-20 09:58:00,0,0,0,1,0
2013-06-20 10:00:00,0,0,0,1,0
2013-06-20 10:02:00,0,0,0,0,0
2013-06-20 10:04:00,0,0,0,0,0
2013-06-20 10:06:00,0,0,0,0,0
...,...,...,...,...,...
2013-10-01 05:06:00,0,0,1,0,0
2013-10-01 05:08:00,0,0,0,1,0
2013-10-01 05:10:00,0,0,0,1,0
2013-10-01 05:12:00,0,0,0,1,0


In [203]:
predicted_df.to_csv("XGBoost_Classified_Appliance_Prediction.csv")

In [204]:
# Decode the true class labels into one 0/1 flag per channel, then label the
# columns with appliance names.
n_channels = len(channel_list)
act_df = pd.DataFrame({"Target": y_test}, index=y_test.index)
act_df["Appliance_Binary_data"] = act_df["Target"].map(lambda label: get_binary(label, n_channels))
actual_df = pd.DataFrame(
    list(act_df["Appliance_Binary_data"]), columns=channel_list, index=act_df.index
).rename(columns=labels_map)

In [205]:
actual_df

Unnamed: 0_level_0,tv_dvd_digibox_lamp,kettle_radio,gas_boiler,freezer,washing_machine_microwave_breadmaker
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-06-20 09:58:00,0,0,0,1,0
2013-06-20 10:00:00,0,0,0,1,0
2013-06-20 10:02:00,0,0,0,1,0
2013-06-20 10:04:00,0,0,0,0,0
2013-06-20 10:06:00,0,0,0,0,0
...,...,...,...,...,...
2013-10-01 05:06:00,0,0,0,1,0
2013-10-01 05:08:00,0,0,0,1,0
2013-10-01 05:10:00,0,0,0,1,0
2013-10-01 05:12:00,0,0,0,1,0


## Evaluation

In [206]:
# actual_df is labelled with appliance names while predicted_df still carries
# channel ids; rename a copy so both frames can be addressed by the same column.
# (rename ignores keys that are absent, so an already-renamed frame is fine.)
predicted_named_df = predicted_df.rename(columns=labels_map)

results = []
for column in actual_df:
    y_true_col = actual_df[column]
    y_pred_col = predicted_named_df[column]
    results.append({
        "Appliance": column,
        "Accuracy": accuracy_score(y_true_col, y_pred_col),
        "Precision": precision_score(y_true_col, y_pred_col, average="macro"),
        "Recall": recall_score(y_true_col, y_pred_col, average="macro"),
        "F1": f1_score(y_true_col, y_pred_col, average="macro"),
    })
metrics = pd.DataFrame(results).set_index("Appliance")

In [207]:
metrics

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
Appliance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tv_dvd_digibox_lamp,0.96328,0.70447,0.648308,0.671384
kettle_radio,0.994961,0.67904,0.614267,0.639391
gas_boiler,0.881693,0.662519,0.794214,0.700029
freezer,0.835785,0.748089,0.766679,0.756537
washing_machine_microwave_breadmaker,0.996177,0.658345,0.615319,0.633409


In [208]:
# `house` is an integer, so it must be converted before string concatenation.
metrics.to_csv("House_" + str(house) + "_XGBoost_metrics.csv")

## Recommendation generation

In [4]:
# predicted_df.head()

In [1]:
# Recommendation-related settings.
# The house label and the slice length get their own names so that the integer
# `house` and the "2min" `resampling_time_in_min` set in the configuration cell
# are not overwritten (earlier cells compare `house` to 3/4 and pass
# `resampling_time_in_min` to resample).
recs_house_label = 'House_' + str(house)
min_support = 0.02
min_confidence = 0.02
considered_rules = 200
recs_resampling_time_in_min = '30'
rule_files_dir = "./Rule_Files/" + recs_house_label + "/"
recommendations_file = "./Recommendations/" + recs_house_label + "/recommendations_" + str(min_support) + "_" + str(min_confidence) + ".csv"

# Make sure both output locations exist before anything is written to them.
os.makedirs(rule_files_dir, exist_ok=True)
os.makedirs(os.path.dirname(recommendations_file), exist_ok=True)

In [None]:
# Resample the per-channel predictions to 30-minute slices. The timestamp column
# is added on a copy so predicted_df itself is left unchanged and the cell can
# be re-run safely.
resampled_recs_data = resampling(
    predicted_df.assign(Timestamp=list(predicted_df.index)), "30min"
)

In [None]:
# For every channel, list the 30-minute slices (as strings) in which it was On
# for more than half of the slice. Done column-wise instead of with a nested
# row/column loop; every channel gets a key even when the frame has no rows.
ON_FRACTION_THRESHOLD = 0.5

channel_data_dict = {
    column: [
        str(stamp)
        for stamp in resampled_recs_data.loc[
            resampled_recs_data[column] > ON_FRACTION_THRESHOLD, 'Timestamp'
        ]
    ]
    for column in resampled_recs_data.columns
    if column != 'Timestamp'
}
                

In [None]:
# First and last calendar day present in the resampled predictions.
recs_timestamps = resampled_recs_data['Timestamp']
min_date = recs_timestamps.iloc[0].date()
max_date = recs_timestamps.iloc[-1].date()

In [None]:
# Every calendar day from min_date to max_date as "YYYY-MM-DD" strings.
Dates = get_dates_list(min_date, max_date)
# NOTE(review): get_all_times_of_day is not defined in the visible notebook —
# presumably it returns the time-of-day slice labels for a 30-minute grid; confirm.
Time = get_all_times_of_day("30")

In [None]:
# NOTE(review): data_extractor is not defined in the visible notebook — presumably
# it builds the per-day, per-slice transactions consumed by divide_data_into_time; confirm.
apriori_dt = data_extractor(channel_list, channel_data_dict, Dates, Time)

In [None]:

# Mine frequent itemsets and association rules separately for each time slice
# and keep the results in dictionaries keyed by the slice.
# NOTE(review): association_rules is not imported in the visible import cell —
# presumably it comes from mlxtend; confirm the import exists.

time_itemset_map = dict()
time_rules_map = dict()
time_channels_map_from_itemsets = dict()
time_channels_map_from_rules = dict()

time_appliance_map = divide_data_into_time(Time, apriori_dt)

for timestamp, slice_transactions in time_appliance_map.items():

    print("Generating Itemsets for : " + str(timestamp))
    slice_itemsets = get_support_and_itemsets(slice_transactions, min_support)
    time_itemset_map[timestamp] = slice_itemsets

    # All rules for this slice, then only those whose antecedent is exactly the slice itself.
    rules_df = association_rules(slice_itemsets, metric="confidence", min_threshold=min_confidence)
    slice_rules = rules_df[rules_df['antecedents'] == frozenset({timestamp})]
    time_rules_map[timestamp] = slice_rules

    # Channel coverage, from the itemsets and from the kept rules.
    time_channels_map_from_itemsets[timestamp] = get_channels_from_frequent_itemsets(slice_itemsets)
    time_channels_map_from_rules[timestamp] = get_channels_from_rules(slice_rules)


In [None]:
# Write one CSV of rules per time slice; create the target folder first so
# to_csv does not fail on a missing directory.
# NOTE(review): the slice label is used verbatim as the file name — if labels
# contain ':' this will not work on Windows; confirm the label format.
os.makedirs(rule_files_dir, exist_ok=True)
for timestamp, df in time_rules_map.items():
    df.to_csv(rule_files_dir + timestamp + ".csv", header=True, index=False)

In [None]:
# For each time slice: take the highest-confidence rules, collect the channels
# they recommend and turn them into a comma-separated list of display names.
recommendation_list = []
for timestamp in Time:
    rule_df = time_rules_map[timestamp].sort_values(by=['confidence'], ascending=False)
    top_rules = rule_df[:considered_rules]
    recommended_channels = get_channels_from_rules(top_rules)
    recommendations = change_names(
        get_appliances_from_channels(recommended_channels, labels_df),
        show_name_dict,
    )
    recommendation_list.append(",".join(recommendations))
time_recommendation_df = pd.DataFrame({"Time": Time, "Recommendations": recommendation_list})

In [None]:
# Create the target folder (if the path has one) so to_csv does not fail on a
# missing directory, then write the recommendations table.
recommendations_dir = os.path.dirname(recommendations_file)
if recommendations_dir:
    os.makedirs(recommendations_dir, exist_ok=True)
time_recommendation_df.to_csv(recommendations_file)