<a href="https://colab.research.google.com/github/neilgautam/APRIORI-ASSOCIATION_RULE_LEARNING-/blob/master/APRIORI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [448]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime
from datetime import timedelta, date
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [535]:
def get_labels(labels_file_path):
    """
    Read the channel labels of a house from its labels.dat file.

    Input:
    labels_file_path = Path to the labels.dat file.

    Output:
    labels_df = Dataframe with 'Channel_id' and 'Appliance' columns.
    labels_dict = Dictionary mapping each channel id to its appliance name.
    """
    labels_df = pd.read_csv(
        labels_file_path,
        sep='\\s+',
        names=['Channel_id', 'Appliance'],
    )
    # The ids parsed from the file are replaced by "channel_<n>", where n is
    # the 1-based row position.
    row_count = labels_df.shape[0]
    labels_df["Channel_id"] = ["channel_" + str(n) for n in range(1, row_count + 1)]
    labels_dict = dict(zip(labels_df["Channel_id"], labels_df["Appliance"]))
    return labels_df, labels_dict

In [476]:
def get_channel_list(house_path):
    """
    This method gets the list of channels from the house directory.

    Input:
    house_path = path to the house directory.

    Output:
    channel_list = list of all the channels except channel_1 (mains), in the
                   order returned by os.listdir.
    """
    # Guard against an empty path before normalising the trailing slash.
    if house_path and not house_path.endswith('/'):
        house_path = house_path + '/'
    # List the directory that was passed in (not a global variable).
    # item[:-4] drops the last four characters of the file name, presumably a
    # three-letter extension such as ".dat" or ".npy".
    return [
        item[:-4]
        for item in os.listdir(house_path)
        if 'channel_' in item and item != "channel_1.dat"
    ]

In [478]:
def get_channel_data(on_off_data_dir_path, channel_list):
    """
    This method loads channel on/off data from .npy files
    generated by running 'Resampling_and_generating_appliance_on_off_data' file

    Input:
    on_off_data_dir_path = Path to the directory holding one <channel>.npy file
                           per channel (with or without a trailing slash).
    channel_list = List of channel names (without extension) to load.

    Output:
    channel_dict = Dictionary mapping each channel name to the list of its
                   entries that are not the '0' marker.
    """
    channel_dict = dict()
    for channel in channel_list:
        if 'channel_' not in channel or channel == "channel_1.dat":
            continue
        # os.path.join works whether or not the directory ends with a slash.
        channel_file = os.path.join(on_off_data_dir_path, channel + ".npy")
        channel_data = np.load(channel_file)
        # Filter out only those instances where appliance is on
        channel_dict[channel] = [x for x in channel_data if x != '0']
    return channel_dict

In [14]:
# def get_channel_data(path, channel_list):
#     channel_data = channel_loader(path, channel_list)
#     return channel_data

In [517]:
def get_min_max_dates(path_to_resampled_channel_data, channel_list):
    """
    This method finds the range of dates in which the given channels were
    recorded and returns a min and a max date across all of them.

    Input:
    path_to_resampled_channel_data = Path to the resampled data directory
    channel_list = List of channel names (without extension); one
                   <channel>.npy file is read per entry.

    Output:
    min_date = The earliest date on which an appliance usage was recorded.
    max_date = The latest date on which an appliance usage was recorded.

    Only the first and the last non-'0' entry of each channel are inspected,
    so the entries are assumed to be in chronological order.
    If no channel has a non-'0' entry the sentinels date.max / date.min are
    returned unchanged.
    """
    min_date = datetime.datetime.max.date()
    max_date = datetime.datetime.min.date()
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # Iterate over the channel names themselves: deriving file names from
    # len(channel_list) skipped the last channel and assumed contiguous ids.
    for channel in channel_list:
        cd = np.load(os.path.join(path_to_resampled_channel_data, channel + ".npy"))

        first_on = next((item for item in cd if item != '0'), None)
        if first_on is None:
            # Channel never switched on; it contributes nothing to the range.
            continue
        last_on = next(item for item in cd[::-1] if item != '0')

        first_date = datetime.datetime.strptime(first_on, timestamp_format).date()
        last_date = datetime.datetime.strptime(last_on, timestamp_format).date()
        if first_date < min_date:
            min_date = first_date
        if last_date > max_date:
            max_date = last_date
    return min_date, max_date

In [537]:
def get_dates_list(min_date, max_date):
    """
    Build the list of calendar days from min_date to max_date (both included).

    Input:
    min_date = Start date
    max_date = End date

    Output:
    List of "YYYY-MM-DD" date strings, one per day yielded by daterange()
    """
    return [day.strftime("%Y-%m-%d") for day in daterange(min_date, max_date)]

In [538]:
def daterange(date1, date2):
    """
    Generate every date from date1 to date2, both ends included.

    Input:
    date1 = Start date
    date2 = End date

    Output:
    Generator yielding date1, date1 + 1 day, ... up to date2
    (nothing is yielded when date2 is before date1)
    """
    total_days = int((date2 - date1).days) + 1
    offset = 0
    while offset < total_days:
        yield date1 + timedelta(offset)
        offset += 1

In [539]:
def get_all_times_of_day(interval):
    """
    This method generates a list of times of a day separated by specified interval.

    Input:
    interval = The gap in minutes between two neighboring time slots
               (an int or a numeric string such as '30').

    Output:
    Time = List of "HH:MM:00" strings covering 24 hours; within each hour the
           slots start at minute 00 and advance by `interval`.

    Raises:
    ValueError if interval is not a positive whole number of minutes.
    """
    step = int(interval)
    if step <= 0:
        raise ValueError("interval must be a positive number of minutes")
    # Emit every slot of the hour, not just '00' and the interval itself, so
    # that intervals other than 30 (e.g. 15 or 5) produce a full, zero-padded grid.
    Time = []
    for hr in range(24):
        for minute in range(0, 60, step):
            Time.append("%02d:%02d:00" % (hr, minute))
    return Time

## Marked for removal

In [3]:
def DateString():
    """
    Prompt for a date, a month and a year (in that order) on standard input.

    Output:
    The three answers joined with '-' in the order they were entered.
    """
    prompts = ("Enter Date : ", "Enter Month : ", "Enter Year : ")
    return '-'.join([input(prompt) for prompt in prompts])

In [494]:
def get_usage_data_for_day(channel_data_dict, channel_list, date):
    """
    Collect, per channel, the entries that belong to one particular day.

    Input:
    channel_data_dict = Dictionary of channel and their on/off data.
    channel_list = List of channels for this house
    date = Date string; an entry is kept when it contains this string.

    Output:
    cleaned_day_data = Dictionary of the channels that have at least one entry
                       for that date, mapped to those entries.
    """
    cleaned_day_data = {}
    for channel in channel_list:
        entries_on_date = [stamp for stamp in channel_data_dict[channel] if date in stamp]
        # Channels without any entry for the date are left out entirely.
        if entries_on_date:
            cleaned_day_data[channel] = entries_on_date
    return cleaned_day_data

In [493]:
# def get_hour_usage_data(channel_data_dict, date, hour_time, channel_list):
#     """
#     This method extracts the appliances that are on and their usage data for a particular hour. 
    
#     Input:
#     channel_data_dict = Dictionary of channel and their on/off data for the day.
#     date = Date for which used appliances and their usage needs to be extracted
#     hour_time = hour of the day in 24 hour format
#     channel_list = List of channels for this house
        
#     Output:
#     hourwise_data = Dictionary of appliances used that hour and their usage (on timings).
    
#     """
#     day_data = get_day_usage_data(channel_data_dict, channel_list, date)
#     # print(day_data)
#     temp_list = [date, hour_time[:2]]
#     temp_time = ' '.join(temp_list)
#     hourwise_data = dict()
#     for channel in day_data.keys():
#         temp_data = list(channel_data_dict[equip])
#         temp_data = [ x for x in temp_data if temp_time in x]
#         if len(temp_data)==0:
#             continue
#         else:
#             hourwise_data[channel] = temp_data
# #     print("No of equipments working at %s %s are :"%(Date,HourTime),end =" ")
# #     print(len(hourwise_data.keys()))       
#     return hourwise_data       

In [495]:
def get_usage_data_for_time(day_data, channel_data_dict, date, time, channel_list):
    """
    Find the channels that have an entry for one time slot of one day.

    Input:
    day_data = Dictionary of channel and their on/off data for the day; only
               its keys are used, to decide which channels to inspect.
    channel_data_dict = Complete dictionary of channel and their on/off data.
    date = Date string of the slot.
    time = Time string of the slot.
    channel_list = List of channels for this house (not used by this method).

    Output:
    list = The time string itself followed by every inspected channel that has
           an entry containing "<date> <time>".
    """
    slot_stamp = ' '.join((date, time))
    # Keyed by item so the time slot always comes first in the result.
    active = {time: []}
    for channel in day_data:
        matches = [entry for entry in channel_data_dict[channel] if slot_stamp in entry]
        if matches:
            active[channel] = matches
    return list(active)

## Marked for removal

In [10]:
def MonthDateGenerator(year, month, days_in_the_month):
    """
    Build the "year-month-day" strings for the days of one month.

    Input:
    year = Year as a string.
    month = Month as a string.
    days_in_the_month = Number of days in the month (int).

    Output:
    List of strings joined with '-'. Days '01' to '09' are always present;
    days 10..days_in_the_month are appended after them.
    """
    day_labels = ['0' + str(n) for n in range(1, 10)]
    day_labels.extend(str(n) for n in range(10, days_in_the_month + 1))
    return ['-'.join([year, month, label]) for label in day_labels]

In [497]:
def data_extractor(channel_list, channel_data_dict, dates, times):
    """
    Build one transaction per (date, time) slot for the given date range.

    Input:
    channel_list = List of channels for this house.
    channel_data_dict = Complete dictionary of channel and their on/off data.
    dates = Dates for which the transactions are built.
    times = Times of the day for which the transactions are built.

    Output:
    list = List-of-lists; each inner list is what get_usage_data_for_time()
           returns for one slot, in date-then-time order.
    """
    transactions = []
    for date in dates:
        # The per-day lookup is done once and reused for every slot of the day.
        day_data = get_usage_data_for_day(channel_data_dict, channel_list, date)
        slot_items = (
            get_usage_data_for_time(day_data, channel_data_dict, date, time, channel_list)
            for time in times
        )
        transactions.extend(items for items in slot_items if len(items) != 0)
    return transactions

In [534]:
# def get_apriori_input_data(equipment_list,equipments_data,Dates,Time):
#     apriori_data = DataExtractor(equipment_list,equipments_data,Dates,Time)
#     return apriori_data

In [551]:
def divide_data_into_time(times, apriori_data):
    """
    Group the transactions by the time slice they contain.

    Input:
    times = List of time slices.
    apriori_data = Output of the data_extractor().

    Output:
    Dictionary with one key per time slice; the value is the list of
    transactions (in their original order) that contain that time slice.
    """
    return {
        timestamp: [data for data in apriori_data if timestamp in data]
        for timestamp in times
    }

In [552]:
def get_support_and_itemsets(apriori_data, minimum_support):
    """
    Run the apriori algorithm on a list of transactions.

    Input:
    apriori_data = List-of-lists of items (time slice and appliances that are On).
    minimum_support = Minimum support for getting the frequent itemsets.

    Output:
    Dataframe returned by mlxtend's apriori() with use_colnames=True
    """
    encoder = TransactionEncoder()
    encoder.fit(apriori_data)
    # One boolean column per distinct item, one row per transaction.
    encoded = pd.DataFrame(encoder.transform(apriori_data), columns=encoder.columns_)
    return apriori(encoded, min_support=minimum_support, use_colnames=True)

In [541]:
def get_channels_from_frequent_itemsets(frequent_itemsets_df):
    """
    Collect every distinct item that occurs in the 'itemsets' column.
    This method was used to check the appliance coverage.

    Input:
    frequent_itemsets_df = Dataframe of support and frequent itemsets.

    Output:
    Set of all items found across the itemsets
    """
    return {entry for item in frequent_itemsets_df.itemsets for entry in item}

In [544]:
# def get_channels_from_rules(rules_df):
#     """
#     This method returns the set of unique channels present in the time-rules dataframe.
#     This method was used to check the appliance coverage.
    
#     Input:
#     rules_df = Dataframe of time and appliances On.
    
#     Output:
#     Set of channels present in the entire frequent itemsets
#     """
#     channels = set()
#     for rule in rules_df['consequents']:
#         for entry in rule[12:-3].replace("'",'').split(", "):
#             channels.add(entry)
#     return channels

In [543]:
def get_appliances_from_channels(list_of_channels, labels_df):
    """
    This method returns a list of appliance names given a list of channels.

    Input:
    list_of_channels = List of channels for which names to be extracted.
    labels_df = Dataframe with 'Channel_id' and 'Appliance' columns.

    Output:
    List of appliance names corresponding to each channel in list_of_channels
    (same order). A channel missing from labels_df raises KeyError.
    """
    # Build the Channel_id-indexed view once instead of once per channel.
    appliance_by_channel = labels_df.set_index('Channel_id')
    return [appliance_by_channel.at[item, 'Appliance'] for item in list_of_channels]

In [545]:
def get_all_channels_in_rules(rule_df):
    """
    This method returns the unique channels present in the 'consequents'
    column of a rules dataframe, in order of first appearance.
    This method was used to check the appliance coverage.

    Input:
    rule_df = Rules dataframe. Each 'consequents' entry may be either a
              set/frozenset (rules still in memory) or its string form
              "frozenset({'a', 'b'})" (rules read back from a CSV file).

    Output:
    List of channels present in the rules dataframe (no duplicates)
    """
    # A dict keeps first-seen order and gives O(1) duplicate checks.
    seen = dict()
    for sequence in rule_df['consequents']:
        if isinstance(sequence, (set, frozenset)):
            # Set iteration order is arbitrary; sort for a reproducible result.
            list_of_channels = sorted(sequence)
        else:
            # Strip the leading "frozenset({'" and trailing "'})" of the
            # string form, then split the quoted, comma separated names.
            list_of_channels = sequence[12:-3].replace("'", '').split(", ")
        for item in list_of_channels:
            seen.setdefault(item, None)
    return list(seen)

In [561]:
def get_missing_channel_list(rule_df, full_channel_set):
    """
    Report which channels never occur in the 'consequents' of the rules.
    This method was used to check the appliance coverage.

    Input:
    rule_df = Rules dataframe; each 'consequents' entry is converted with
              str() and parsed as "frozenset({'a', 'b'})".
    full_channel_set = Set of complete channels in the house

    Output:
    Set of channels from full_channel_set not present in the rules dataframe
    """
    channels_in_rules = {
        name
        for sequence in rule_df['consequents']
        for name in str(sequence)[12:-3].replace("'", '').split(", ")
    }
    return full_channel_set - channels_in_rules

### House 2 patterns -> 30 min resampled

In [256]:
labels_df_house_2, labels_dict_house_2 = get_labels("../../../../Dataset/ukdale/house_2/labels.dat")

In [547]:
labels_df_house_2.head()

Unnamed: 0,Channel_id,Appliance
0,channel_1,aggregate
1,channel_2,laptop
2,channel_3,monitor
3,channel_4,speakers
4,channel_5,server


In [449]:
# Directory holding the per-channel on/off .npy files for house 2.
path = "./Channel_On_Off_data/House_2/"
# Resampling interval in minutes, kept as a string because it is passed to
# get_all_times_of_day().
resampling_time_in_min = '30'

# Get channel list and data from channel files
channel_list = get_channel_list(path)
channel_data = get_channel_data(path, channel_list)

# Get max and min date in the channel data
min_date, max_date = get_min_max_dates(path, channel_list)

# Generate date and time list
Dates = get_dates_list(min_date, max_date)
Time = get_all_times_of_day(resampling_time_in_min)

In [558]:
# Configuration used by the cells below.
# Resampling interval in minutes (string form).
resampling_time_in_min = '30'
# Thresholds passed to association_rules() and get_support_and_itemsets().
min_confidence = 0.4
min_support = 0.4
# Where the extracted transactions are saved.
apriori_data_output_file = "./Channel_On_Off_data/House_2/new_apriori_data_1.npy"
# Directory prefix for the per-time-slice rule CSV files.
rule_files_dir = "./Rule_Files/"
# labels.dat file of the house being analysed.
labels_file = "../../../../Dataset/ukdale/house_2/labels.dat"
# Output file for the generated recommendations.
recommendations_file = 'recommendations.csv'

### Get labels from the labels.dat file

In [559]:
labels_df, labels_map = get_labels(labels_file)

### Get house related data

In [525]:
# Get channel list and data from channel files
# (`path` is the house data directory assigned in an earlier cell.)
channel_list = get_channel_list(path)
channel_data = get_channel_data(path, channel_list)

# Get max and min date in the channel data
min_date, max_date = get_min_max_dates(path, channel_list)

# Generate date and time list
Dates = get_dates_list(min_date, max_date)
Time = get_all_times_of_day(resampling_time_in_min)

### Extract appliance usage data from the channels

In [555]:
# Generate appliance ON data for each time slice of each day and save it.
apriori_dt = data_extractor(channel_list, channel_data, Dates, Time)
# The transactions have different lengths, so they are stored as an explicit
# object array; passing the ragged list straight to np.save fails on recent
# NumPy releases. Reading the file back needs np.load(..., allow_pickle=True).
np.save(apriori_data_output_file, np.array(apriori_dt, dtype=object))

### Generate rules for each time of the day 

In [528]:
%%time

# Generate frequent itemsets and rules one time slice at a time and save them in a dictionary. 

time_itemset_map = dict()
time_rules_map = dict()
time_channels_map_from_itemsets = dict()
time_channels_map_from_rules = dict()

time_appliance_map = divide_data_into_time(times, apriori_data)

for timestamp in time_appliance_map:
    print("Generating rules for : "+str(timestamp))
    
    # Generate frequent itemsets
    time_itemset_map[timestamp] = get_support_and_itemsets(time_appliance_map[timestamp], min_support)
    
    # Generate rules
    time_rules_map[timestamp] = association_rules(time_itemset_map[timestamp], metric="confidence", min_threshold = min_confidence)
    
    # Filter rules which starts from current time slice
    rules_df = time_rules_map[timestamp]
    time_rules_map[timestamp] = rules_df[rules_df['antecedents'] == frozenset({timestamp})]
    
    # Get channels from frequent itemsets
    time_channels_map_from_itemsets[timestamp] = get_channels_from_frequent_itemsets(time_itemset_map[timestamp])
    
    # Get channels from rules
    time_channels_map_from_rules[timestamp] = get_channels_from_rules(time_rules_map[timestamp])


### Save the rules in CSV files to a rules directory

In [529]:
# Write one CSV per time slice, named "<timestamp>.csv", under rule_files_dir.
# rule_files_dir is used as a plain string prefix, so it must end with a
# path separator.
for timestamp, df in time_rules_map.items():
    df.to_csv(rule_files_dir + timestamp + ".csv", header=True, index=False)

In [531]:
# rule_df_dict = dict()
# for item in Time:
#     rule_df_dict[item] = pd.read_csv("./Rule_Files_From_Hpc/" + item + ".csv")

In [None]:
### 

In [556]:
# time_channel_map = dict()
# for timestamp in Time:
#     time_channel_map[timestamp] = get_channels_from_rules(time_rules_map[timestamp])

### Generate Recommendations and save them to a dataframe

In [None]:
# Build one comma separated recommendation string per time slice.
recommendation_list = []
for timestamp in Time:
    # Highest-confidence rules first.
    rule_df = time_rules_map[timestamp].sort_values(by=['confidence'], ascending=False)
    # Only the first 200 rules contribute channels.
    recommended_channels = get_all_channels_in_rules(rule_df[:200])
    # Translate channel ids into appliance names.
    recommendations = get_appliances_from_channels(recommended_channels, labels_df)
    recommendation_list.append(",".join(appliance for appliance in recommendations))
# One row per time slice, in the order of `Time`.
time_recommendation_df = pd.DataFrame({"Time" : Time, "Recommendations" : recommendation_list})

In [None]:
time_recommendation_df.to_csv(recommendations_file)

### Generate Apriori input data

In [24]:
# Build the transactions with data_extractor(); the former
# get_apriori_input_data() wrapper and its equipment_* arguments only exist
# in commented-out code.
apriori_data = data_extractor(channel_list, channel_data, Dates, Time)

In [532]:
len(apriori_data)

11328

In [None]:
apriori_data

In [33]:
# Transactions have different lengths, so store them as an explicit object
# array; reading the file back needs np.load(..., allow_pickle=True).
np.save("./Channel_On_Off_data/House_2/new_apriori_data.npy", np.array(apriori_data, dtype=object))

### Create a time-sequence dictionary

In [40]:
# Group the transactions by the time slice they contain, using the shared
# helper instead of repeating its loop inline.
time_sequence_dict = divide_data_into_time(Time, apriori_data)

In [420]:
time_sequence_dict['10:30:00']

[['10:30:00'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_3', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4'],
 ['10:30:00', 'channel_6', 'channel_5', 'channel_4', 'channel_2'],
 ['10:30:00', 'channel_6', 'channel_

## Generate rules for all the timestamps

In [91]:
%%time

h2_frequent_itemset_dict = dict()
h2_time_channels_from_frequent_itemsets_dict = dict()
h2_time_channels_from_rules_dict = dict()
h2_time_rules_dict = dict()

for timestamp in time_sequence_dict:
    # Generate frequent itemsets
    h2_frequent_itemset_dict[timestamp] = get_support_and_itemsets(time_sequence_dict[timestamp], 0.4)
    # Generate rules
    h2_time_rules_dict[timestamp] = association_rules(h2_frequent_itemset_dict[timestamp], metric="confidence", min_threshold=0.4)
    # Filter rules
    h2_time_rules_dict[timestamp] = h2_time_rules_dict[timestamp][h2_time_rules_dict[timestamp]['antecedents'] == frozenset({timestamp})]
    # Get channels from frequent itemsets
    h2_time_channels_from_frequent_itemsets_dict[timestamp] = get_channels_from_frequent_itemsets(h2_frequent_itemset_dict[timestamp])
    # Get channels from rules
    h2_time_channels_from_rules_dict[timestamp] = get_channels_from_rules(h2_time_rules_dict[timestamp])


CPU times: user 9min 22s, sys: 13min 53s, total: 23min 15s
Wall time: 28min 36s


### Save time-rules Dataframes to CSV files

In [141]:
# Write one CSV per time slice, named "<timestamp>.csv", into ./Rule_Files/.
for timestamp, df in h2_time_rules_dict.items():
    df.to_csv("./Rule_Files/" + timestamp + ".csv", header=True, index=False)

### Read time-rule CSV files

In [163]:
# Load the rule file of every time slice into a dictionary keyed by the slice.
# Note the files are read from ./Rule_Files_From_Hpc/, not from the
# ./Rule_Files/ directory written by the cell above.
rule_df_dict = dict()
for item in Time:
    rule_df_dict[item] = pd.read_csv("./Rule_Files_From_Hpc/" + item + ".csv")

In [405]:
timestamp = '10:30:00'

In [346]:
rule_df_dict[timestamp].sort_values(by=['confidence'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
266,frozenset({'10:30:00'}),"frozenset({'channel_6', 'channel_4', 'channel_...",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
9,frozenset({'10:30:00'}),frozenset({'channel_5'}),1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
10,frozenset({'10:30:00'}),frozenset({'channel_6'}),1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
75,frozenset({'10:30:00'}),"frozenset({'channel_5', 'channel_6'})",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
70,frozenset({'10:30:00'}),"frozenset({'channel_4', 'channel_5'})",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
639,frozenset({'10:30:00'}),"frozenset({'channel_8', 'channel_4', 'channel_...",1.0,0.411017,0.411017,0.411017,1.0,0.0,1.0
638,frozenset({'10:30:00'}),"frozenset({'channel_4', 'channel_9', 'channel_...",1.0,0.411017,0.411017,0.411017,1.0,0.0,1.0
636,frozenset({'10:30:00'}),"frozenset({'channel_4', 'channel_9', 'channel_...",1.0,0.411017,0.411017,0.411017,1.0,0.0,1.0
642,frozenset({'10:30:00'}),"frozenset({'channel_8', 'channel_9', 'channel_...",1.0,0.411017,0.411017,0.411017,1.0,0.0,1.0


### Get list of all channels for a timestamp

In [209]:
# Map every time slice to the set of channels named in its rule consequents.
# The rules come from CSV files, so the consequents are strings, which is the
# form get_all_channels_in_rules() parses.
time_channel_dict = dict()
for timestamp in Time:
    time_channel_dict[timestamp] = set(get_all_channels_in_rules(rule_df_dict[timestamp]))

In [409]:
time_channel_dict['10:30:00']

{'channel_10',
 'channel_12',
 'channel_13',
 'channel_14',
 'channel_17',
 'channel_18',
 'channel_2',
 'channel_3',
 'channel_4',
 'channel_5',
 'channel_6',
 'channel_7',
 'channel_8',
 'channel_9'}

In [410]:
len(time_channel_dict['10:30:00'])

14

### All channels from all the rules

In [212]:
# Union of the channels recommended at any time slice.
channel_set = set()
for channels_at_time in time_channel_dict.values():
    channel_set.update(channels_at_time)

In [284]:
channel_set

{'channel_10',
 'channel_11',
 'channel_12',
 'channel_13',
 'channel_14',
 'channel_17',
 'channel_18',
 'channel_2',
 'channel_3',
 'channel_4',
 'channel_5',
 'channel_6',
 'channel_7',
 'channel_8',
 'channel_9'}

In [282]:
len(channel_set)

15

### Missing channels are : [15 - Microwave, 16 - Toaster, 19 - Cooker]

### Get channels list for house 2

In [237]:
# Names channel_2 .. channel_19 (channel_1 is left out).
h2_channel_set = {'channel_'+str(i) for i in range(2, 20)}

In [238]:
len(h2_channel_set)

18

## Generate recommendations file

In [394]:
# Build one comma separated recommendation string per time slice from the
# rule files loaded into rule_df_dict.
recommendation_list = []
for timestamp in Time:
    # Highest-confidence rules first.
    rule_df = rule_df_dict[timestamp].sort_values(by=['confidence'], ascending=False)
    # Only the first 200 rules contribute channels.
    recommended_channels = get_all_channels_in_rules(rule_df[:200])
    # Translate channel ids into appliance names.
    recommendations = get_appliances_from_channels(recommended_channels, labels_df_house_2)
    recommendation_list.append(",".join(appliance for appliance in recommendations))
# One row per time slice, in the order of `Time`.
time_recommendation_df = pd.DataFrame({"Time" : Time, "Recommendations" : recommendation_list})


In [395]:
# recommended_channels

In [396]:
time_recommendation_df.head()

Unnamed: 0,Time,Recommendations
0,00:00:00,"speakers,server,router,rice_cooker,kettle,runn..."
1,00:30:00,"speakers,router,server,kettle,running_machine,..."
2,01:00:00,"speakers,router,server,kettle,running_machine,..."
3,01:30:00,"speakers,router,server,kettle,running_machine,..."
4,02:00:00,"speakers,router,server,kettle,rice_cooker,runn..."


In [404]:
time_recommendation_df.to_csv('recommendations.csv')

In [441]:
## Reading Janhavi's recommendations file

In [421]:
rule_mining_df = pd.read_csv('recommendations_15.csv')

In [437]:
# Convert the channel ids stored in rule_mining_df into appliance names.
recommendation_list_1 = []
for recommendations in rule_mining_df['Recommendations']:
    # Drop the first and last character of the cell, split on ", ", then trim
    # two characters from both ends of every piece.
    # NOTE(review): presumably this unwraps a stringified list of quoted
    # channel names; confirm against the layout of recommendations_15.csv.
    rec = ",".join(channel[2:-2] for channel in recommendations[1:-1].split(", "))
    # The loop variable is reused to hold the appliance names.
    recommendations = get_appliances_from_channels(rec.split(","), labels_df_house_2)
    recommendation_list_1.append(",".join(appliance for appliance in recommendations))
# Needs exactly one row in rule_mining_df per entry of `Time`.
time_recommendation_df_1 = pd.DataFrame({"Time" : Time, "Recommendations" : recommendation_list_1})

In [438]:
recommendations

['router',
 'server',
 'speakers',
 'kettle',
 'running_machine',
 'rice_cooker',
 'fridge',
 'washing_machine',
 'modem',
 'playstation',
 'dish_washer',
 'server_hdd']

In [439]:
time_recommendation_df_1

Unnamed: 0,Time,Recommendations
0,00:00:00,"speakers,server,router,running_machine,kettle,..."
1,00:30:00,"router,server,speakers,kettle,running_machine,..."
2,01:00:00,"router,speakers,server,running_machine,kettle,..."
3,01:30:00,"router,server,speakers,kettle,running_machine,..."
4,02:00:00,"speakers,server,router,kettle,rice_cooker,runn..."
5,02:30:00,"speakers,server,router,kettle,running_machine,..."
6,03:00:00,"server,speakers,router,kettle,running_machine,..."
7,03:30:00,"router,speakers,server,running_machine,kettle,..."
8,04:00:00,"router,server,speakers,running_machine,rice_co..."
9,04:30:00,"router,server,speakers,running_machine,kettle,..."


In [440]:
time_recommendation_df_1.to_csv('time_recommendation_df_1.csv')

### Rules with min_confidence of 0.8

In [84]:
%%time
rules_003000_1 = association_rules(fi_003000_1, metric="confidence", min_threshold=0.8)

CPU times: user 6.19 s, sys: 250 ms, total: 6.44 s
Wall time: 5.42 s


In [85]:
rules_003000_1.shape

(478035, 9)

In [86]:
rules_003000_1[rules_003000_1['antecedents'] == frozenset({'00:30:00'})]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(00:30:00),(channel_4),1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
8,(00:30:00),(channel_5),1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
11,(00:30:00),(channel_6),1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
294,(00:30:00),"(channel_4, channel_5)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
301,(00:30:00),"(channel_4, channel_6)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
315,(00:30:00),"(channel_6, channel_5)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
2876,(00:30:00),"(channel_4, channel_6, channel_5)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0


In [87]:
ch1 = get_channels_from_rules(rules_003000_1)

In [88]:
len(ch1)

12

In [89]:
ch1

{'00:30:00',
 'channel_10',
 'channel_12',
 'channel_13',
 'channel_14',
 'channel_17',
 'channel_18',
 'channel_4',
 'channel_5',
 'channel_6',
 'channel_8',
 'channel_9'}

In [44]:
rules_000000.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(00:00:00),(channel_10),1.0,0.631356,0.631356,0.631356,1.0,0.0,1.0
1,(channel_10),(00:00:00),0.631356,1.0,0.631356,1.0,1.0,0.0,inf
2,(00:00:00),(channel_12),1.0,0.508475,0.508475,0.508475,1.0,0.0,1.0
3,(channel_12),(00:00:00),0.508475,1.0,0.508475,1.0,1.0,0.0,inf
4,(channel_13),(00:00:00),0.483051,1.0,0.483051,1.0,1.0,0.0,inf


In [49]:
rules_000000[rules_000000['antecedents'] == frozenset({'00:00:00'})]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(00:00:00),(channel_10),1.0,0.631356,0.631356,0.631356,1.0,0.0,1.0
2,(00:00:00),(channel_12),1.0,0.508475,0.508475,0.508475,1.0,0.0,1.0
5,(00:00:00),(channel_13),1.0,0.483051,0.483051,0.483051,1.0,0.0,1.0
6,(00:00:00),(channel_14),1.0,0.508475,0.508475,0.508475,1.0,0.0,1.0
9,(00:00:00),(channel_17),1.0,0.508475,0.508475,0.508475,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
5621178,(00:00:00),"(channel_18, channel_6, channel_8, channel_17,...",1.0,0.483051,0.483051,0.483051,1.0,0.0,1.0
5772357,(00:00:00),"(channel_18, channel_6, channel_14, channel_8,...",1.0,0.508475,0.508475,0.508475,1.0,0.0,1.0
5976252,(00:00:00),"(channel_18, channel_6, channel_14, channel_8,...",1.0,0.483051,0.483051,0.483051,1.0,0.0,1.0
6252994,(00:00:00),"(channel_18, channel_6, channel_14, channel_8,...",1.0,0.483051,0.483051,0.483051,1.0,0.0,1.0


In [149]:
rules_000000[rules_000000['antecedents'] == frozenset({'00:00:00'})].sort_values(by=['confidence'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
521,(00:00:00),"(channel_4, channel_5)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
528,(00:00:00),"(channel_4, channel_6)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
5599,(00:00:00),"(channel_4, channel_6, channel_5)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
551,(00:00:00),"(channel_6, channel_5)",1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
15,(00:00:00),(channel_4),1.0,0.847458,0.847458,0.847458,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
1070319,(00:00:00),"(channel_6, channel_8, channel_7, channel_9, c...",1.0,0.449153,0.449153,0.449153,1.0,0.0,1.0
5865,(00:00:00),"(channel_7, channel_9, channel_8)",1.0,0.449153,0.449153,0.449153,1.0,0.0,1.0
224,(00:00:00),"(channel_10, channel_7)",1.0,0.449153,0.449153,0.449153,1.0,0.0,1.0
5823,(00:00:00),"(channel_6, channel_7, channel_8)",1.0,0.449153,0.449153,0.449153,1.0,0.0,1.0


In [91]:
# Build the frequent-itemset table for House 2 (30-minute resampling).
# The load from disk is commented out, so `apriori_data` must already be in memory from an earlier cell.
# apriori_data = np.load("./Channel_On_Off_data/House_2/" + "apriori_data.npy", allow_pickle=True)
# NOTE(review): 0.02 is presumably the minimum-support threshold -- confirm against get_support_and_itemsets.
house_2_30_min_resampled_patterns = get_support_and_itemsets(apriori_data, 0.02)
# The CSV export is disabled; uncomment to persist the table.
# house_2_30_min_resampled_patterns.to_csv('Pattern_Files/house_2_30_min_resampled_patterns.csv')

In [33]:
# house_2_30_min_resampled_patterns.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
128,0.021039,(21:00:00 channel_6)
126,0.021039,(21:00:00 channel_4)
274,0.021039,"(21:30:00 channel_4, 21:30:00 channel_6)"
273,0.021039,"(21:30:00 channel_5, 21:30:00 channel_4)"
272,0.021039,"(21:00:00 channel_5, 21:00:00 channel_6)"
...,...,...
76,0.020519,(12:30:00 channel_5)
313,0.020519,"(12:30:00 channel_4, 12:30:00 channel_6, 12:30..."
220,0.020519,"(12:30:00 channel_4, 12:30:00 channel_6)"
219,0.020519,"(12:30:00 channel_4, 12:30:00 channel_5)"


In [92]:
# Show the frequent itemsets ordered by support, highest first.
house_2_30_min_resampled_patterns.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
59,0.847546,(channel_5)
58,0.847546,(channel_4)
168,0.847546,"(channel_4, channel_5)"
169,0.847458,"(channel_4, channel_6)"
60,0.847458,(channel_6)
...,...,...
29,0.020833,(14:30:00)
28,0.020833,(14:00:00)
27,0.020833,(13:30:00)
26,0.020833,(13:00:00)


### Collecting the distinct items present in the frequent itemsets

In [129]:
# Gather every distinct member that occurs in any of the frequent itemsets.
channels = set()
for item in house_2_30_min_resampled_patterns.itemsets:
    # set.update adds each member of the itemset in one call.
    channels.update(item)

In [131]:
# channels

### Generating rules using frequent itemsets

In [94]:
# Derive association rules from the frequent itemsets, keeping only rules with confidence of at least 0.4.
rules = association_rules(house_2_30_min_resampled_patterns, metric="confidence", min_threshold=0.4)

In [95]:
# Show the rules ordered by confidence, highest first.
rules.sort_values(by=['confidence'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(channel_11),(channel_10),0.121204,0.631091,0.121204,1.000000,1.584557,0.044713,inf
3451371,"(channel_7, channel_2, channel_4, channel_12, ...",(channel_14),0.036635,0.509004,0.036635,1.000000,1.964620,0.017988,inf
3451208,"(channel_9, channel_2, channel_4, channel_17, ...","(channel_14, channel_8, channel_12, channel_5)",0.063118,0.508739,0.063118,1.000000,1.965643,0.031007,inf
3451212,"(channel_9, channel_2, channel_17, channel_12,...","(channel_14, channel_4, channel_8, channel_5)",0.063118,0.508739,0.063118,1.000000,1.965643,0.031007,inf
3451216,"(channel_9, channel_2, channel_17, channel_11,...","(channel_14, channel_4, channel_12, channel_8)",0.063118,0.508739,0.063118,1.000000,1.965643,0.031007,inf
...,...,...,...,...,...,...,...,...,...
4514018,"(channel_10, channel_9, channel_11)","(channel_4, channel_17, channel_6, channel_3, ...",0.121028,0.186882,0.048464,0.400438,2.142729,0.025846,1.356186
4514020,"(channel_6, channel_11, channel_9)","(channel_4, channel_17, channel_10, channel_3,...",0.121028,0.186706,0.048464,0.400438,2.144755,0.025867,1.356480
4514022,"(channel_11, channel_9, channel_5)","(channel_4, channel_17, channel_10, channel_6,...",0.121028,0.186706,0.048464,0.400438,2.144755,0.025867,1.356480
4514024,"(channel_11, channel_9, channel_8)","(channel_4, channel_17, channel_10, channel_6,...",0.121028,0.186706,0.048464,0.400438,2.144755,0.025867,1.356480


### House 2 patterns -> 1 Hour resampled

In [50]:
# Load the saved House 2 one-hour data from disk.
# NOTE: allow_pickle=True makes np.load unpickle object arrays; only use it on trusted files.
apriori_data = np.load("./Channel_On_Off_data/House_2/One_Hour/" + "apriori_data.npy", allow_pickle=True)
# NOTE(review): 0.02 is presumably the minimum-support threshold -- confirm against get_support_and_itemsets.
house_2_1_hour_resampled_patterns = get_support_and_itemsets(apriori_data, 0.02)
# Persist the table as a CSV (relative path, so it lands in the current working directory).
house_2_1_hour_resampled_patterns.to_csv('house_2_1_hour_resampled_patterns.csv')

In [117]:
# House 2 patterns -> 1 hour resampling: the 15 itemsets with the highest support.
house_2_1_hour_resampled_patterns.sort_values(by=['support'], ascending=False)[:15]

Unnamed: 0,support,itemsets
288,0.04204,(21:00:00 channel_4)
6367,0.04204,"(21:00:00 channel_4, 21:00:00 channel_5, 21:00..."
1824,0.04204,"(21:00:00 channel_4, 21:00:00 channel_6)"
1823,0.04204,"(21:00:00 channel_4, 21:00:00 channel_5)"
1828,0.04204,"(21:00:00 channel_5, 21:00:00 channel_6)"
289,0.04204,(21:00:00 channel_5)
290,0.04204,(21:00:00 channel_6)
1899,0.041831,"(22:00:00 channel_5, 22:00:00 channel_6)"
1515,0.041831,"(17:00:00 channel_6, 17:00:00 channel_5)"
3393,0.041831,"(07:00:00 channel_5, 07:00:00 channel_6, 07:00..."


### House 3 patterns -> 30 min resampled

In [114]:
# Load the saved House 3 data from disk.
# NOTE: allow_pickle=True makes np.load unpickle object arrays; only use it on trusted files.
apriori_data = np.load("./Channel_On_Off_data/House_3/" + "apriori_data.npy", allow_pickle=True)
# NOTE(review): 0.02 is presumably the minimum-support threshold -- confirm against get_support_and_itemsets.
house_3_30_min_resampled_patterns = get_support_and_itemsets(apriori_data, 0.02)
# Persist the table as a CSV (relative path, so it lands in the current working directory).
house_3_30_min_resampled_patterns.to_csv('house_3_30_min_resampled_patterns.csv')

In [116]:
# After sorting by support (highest first), show the rows at positions 15 through 29.
house_3_30_min_resampled_patterns.sort_values(by=['support'], ascending=False)[15:30]

Unnamed: 0,support,itemsets
36,0.020928,(14:00:00 channel_3)
35,0.020928,(13:30:00 channel_3)
34,0.020928,(13:00:00 channel_3)
33,0.020928,(12:30:00 channel_3)
41,0.020928,(16:30:00 channel_3)
40,0.020928,(16:00:00 channel_3)
45,0.020928,(18:30:00 channel_3)
31,0.020928,(11:30:00 channel_3)
55,0.020928,(22:00:00 channel_2)
57,0.020928,(22:30:00 channel_2)


In [120]:
# (rows, columns) of the House 3 frequent-itemset table.
house_3_30_min_resampled_patterns.shape

(81, 2)

### House 3 patterns -> 1 Hour resampled

In [105]:
# The 30 itemsets with the highest support for House 3 (1-hour resampling).
# NOTE(review): house_3_1_hour_resampled_patterns is not assigned in the nearby cells -- confirm it is
# created elsewhere in the notebook before running this cell.
house_3_1_hour_resampled_patterns.sort_values(by=['support'], ascending=False)[:30]

Unnamed: 0,support,itemsets
44,0.04382,(20:00:00 channel_3)
50,0.042697,(23:00:00 channel_3)
42,0.042697,(19:00:00 channel_3)
46,0.042697,(21:00:00 channel_3)
48,0.042697,(22:00:00 channel_3)
77,0.041573,"(23:00:00 channel_3, 23:00:00 channel_2)"
32,0.041573,(14:00:00 channel_3)
1,0.041573,(00:00:00 channel_3)
30,0.041573,(13:00:00 channel_3)
40,0.041573,(18:00:00 channel_3)


Unnamed: 0,support,itemsets
0,0.020831,(00:00:00 channel_4)
1,0.020831,(00:00:00 channel_5)
2,0.020831,(00:00:00 channel_6)
3,0.020831,(00:30:00 channel_4)
4,0.020831,(00:30:00 channel_5)
...,...,...
331,0.021039,"(21:30:00 channel_5, 21:30:00 channel_6, 21:30..."
332,0.020935,"(22:00:00 channel_5, 22:00:00 channel_6, 22:00..."
333,0.020831,"(22:30:00 channel_5, 22:30:00 channel_6, 22:30..."
334,0.020831,"(23:00:00 channel_5, 23:00:00 channel_6, 23:00..."


In [65]:
# First 10 rows of pat_3.
# NOTE(review): pat_3 is not assigned in the nearby cells -- confirm where it is created.
pat_3[:10]

Unnamed: 0,support,itemsets
0,0.600699,(channel_14)
1,0.600507,(channel_18)
2,0.440683,(channel_2)
3,0.388178,(channel_3)
4,0.999357,(channel_4)
5,0.999983,(channel_5)
6,0.999722,(channel_6)
7,0.528103,(channel_7)
8,0.600507,"(channel_18, channel_14)"
9,0.600699,"(channel_4, channel_14)"


In [180]:
# patterns = get_support_and_itemsets(apriori_data, 0.001) #house3

In [181]:
# patterns

Unnamed: 0,support,itemsets
0,0.003501,(00:00:00 channel_3)
1,0.001325,(00:00:00 channel_4)
2,0.003501,(00:05:00 channel_3)
3,0.001325,(00:05:00 channel_4)
4,0.003501,(00:10:00 channel_3)
...,...,...
537,0.001041,"(22:50:00 channel_3, 22:50:00 channel_5)"
538,0.001041,"(23:40:00 channel_3, 23:40:00 channel_4)"
539,0.001136,"(23:45:00 channel_4, 23:45:00 channel_3)"
540,0.001325,"(23:50:00 channel_4, 23:50:00 channel_3)"
