<a href="https://colab.research.google.com/github/neilgautam/APRIORI-ASSOCIATION_RULE_LEARNING-/blob/master/APRIORI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pandas as pd
import datetime
from datetime import timedelta, date
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [2]:
def equipment_loader(file_path,equipments):
    """
    Load the entries of every appliance file into a dictionary.

    Input:
        file_path = prefix (directory path) prepended to each file name
        equipments = List of appliance file names, e.g. 'channel_2.npy'
    Output:
        equip_dict = dictionary keyed by the file name without its last four
                     characters; each value is the list of loaded entries
                     that are not equal to '0'
    """
    equip_dict = {}
    for file_name in equipments:
        full_path = file_path+file_name
        print(full_path)
        # Strip the 4-character extension to get the appliance name.
        appliance = file_name[:-4]
        equip_dict[appliance] = [entry for entry in np.load(full_path) if entry != '0']
    return equip_dict

In [3]:
def DayDataExtraction(Data,equipment_list,Date):
    """
    Collect, for each appliance, the entries that belong to one particular day.

    Input:
    Data = Dictionary mapping appliance names to their on/off entries.
    equipment_list = List of appliance file names; the last four characters
                     of each name are dropped to obtain the dictionary key.
    Date = Date string; an entry is kept when it contains this string.

    Output:
    cleaned_day_data = Dictionary of the appliances that have at least one
                       entry for that day, mapped to those entries.

    """
    cleaned_day_data = {}
    for file_name in equipment_list:
        appliance = file_name[:-4]
        matches = [stamp for stamp in Data[appliance] if Date in stamp]
        # Appliances without any entry on that day are left out.
        if matches:
            cleaned_day_data[appliance] = matches
    return cleaned_day_data

In [4]:
def TimeDataExtraction(day_,Data,Date,Time,equipment_list):
    """
    List the appliances that have an entry at a given date and time.

    Input:
    day_ = Dictionary whose keys are the appliances to look at.
    Data = Complete dictionary of appliances and their on/off entries.
    Date = Date string of the moment of interest.
    Time = Time string of the moment of interest.
    equipment_list = List of appliance file names (not used here).

    Output:
    list = One '<Time> <appliance>' string for every appliance in day_ that
           has at least one entry in Data containing '<Date> <Time>'.

    """
    moment = Date + ' ' + Time
    return [
        Time+' '+appliance
        for appliance in day_
        if any(moment in entry for entry in Data[appliance])
    ]

In [5]:
def DataExtractor(equipment_list,equipments_data,Dates,Time):
    """
    Build, for every date, the list of appliances found at each time slot.

    Input:
    equipment_list = List of appliance file names.
    equipments_data = Complete dictionary of appliances and their on/off entries.
    Dates = Date strings to process.
    Time = Time strings to process for every date.

    Output:
    list = One list per date that has data; each of these holds one list of
           '<time> <appliance>' strings per time slot that has data.
           Empty time slots and empty dates are skipped.

    """
    date_transcations = []
    for day in Dates:
        day_data = DayDataExtraction(equipments_data,equipment_list,day)
        per_slot = (
            TimeDataExtraction(day_data,equipments_data,day,slot,equipment_list)
            for slot in Time
        )
        # Keep only the time slots in which something was found.
        transactions = [items for items in per_slot if items]
        if transactions:
            date_transcations.append(transactions)
    return date_transcations

In [6]:
def get_min_max_dates(path_to_resampled_channel_data, channel_list):
    """
    Find the earliest and the latest date recorded across the given channel files.

    Every file named in channel_list is loaded (the same way equipment_loader
    does), so the result no longer depends on the channels being numbered
    contiguously from 2.

    Input:
    path_to_resampled_channel_data = Prefix (directory path) prepended to each file name
    channel_list = List of channel file names, e.g. 'channel_2.npy'

    Output:
    min_date = The earliest date on which an appliance usage was recorded.
    max_date = The latest date on which an appliance usage was recorded.
               If no file holds an entry other than '0', the initial values
               (the largest and the smallest representable date) are returned.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    min_date = datetime.datetime.max.date()
    max_date = datetime.datetime.min.date()

    for channel_file in channel_list:
        cd = np.load(path_to_resampled_channel_data + channel_file)

        # Only the first and the last entry that is not '0' are inspected;
        # this relies on the entries of a file being in chronological order.
        first_item = next((item for item in cd if item != '0'), None)
        if first_item is None:
            # Nothing was recorded for this channel.
            continue
        last_item = next(item for item in reversed(cd) if item != '0')

        first_date = datetime.datetime.strptime(first_item, timestamp_format).date()
        last_date = datetime.datetime.strptime(last_item, timestamp_format).date()
        min_date = min(min_date, first_date)
        max_date = max(max_date, last_date)
    return min_date, max_date

In [7]:
def get_dates_list(min_date, max_date):
    """
    Build the list of date strings covered by a date range.

    Input:
    min_date = Start date
    max_date = End date

    Output:
    List with one 'YYYY-MM-DD' string for every date produced by
    daterange(min_date, max_date)
    """
    return [day.strftime("%Y-%m-%d") for day in daterange(min_date, max_date)]

In [8]:
def daterange(date1, date2):
    """
    Generate the dates from date1 to date2, one day apart.

    Input:
    date1 = Start date
    date2 = End date

    Output:
    Generator yielding date1, date1 + 1 day, ... up to and including date2;
    nothing is yielded when date2 lies before date1
    """
    span_in_days = int((date2 - date1).days)
    yield from (date1 + timedelta(offset) for offset in range(span_in_days + 1))

In [9]:
def get_equipment_list(path):
    """
    Get the list of channel files from the house directory.

    A directory entry is kept when its name contains 'channel_'. The mains
    channel (channel_1) is left out whatever its extension is, so that
    'channel_1.npy' is excluded just like 'channel_1.dat'.

    Input:
    path = path to the house directory.

    Output:
    equipment_list = list of all the channel file names except channel_1 (mains),
                     in the order returned by os.listdir.
    """
    return [
        item
        for item in os.listdir(path)
        if 'channel_' in item and os.path.splitext(item)[0] != "channel_1"
    ]

In [10]:
def get_equipment_data(path, equipment_list):
    """
    Load the on/off data of the given channels by delegating to equipment_loader.

    Input:
    path = Path to the .npy files for all the channels
    equipment_list = List of channel file names for the house being analyzed

    Output:
    The dictionary returned by equipment_loader(path, equipment_list)
    """
    return equipment_loader(path, equipment_list)

In [11]:
def get_all_time_of_day(interval):
    """
    Generate the list of times of a day separated by the specified interval.

    Every hour from 00 to 23 is combined with the minutes 0, interval,
    2 * interval, ... below 60, so an interval of 30 gives '00:00:00',
    '00:30:00', '01:00:00', ... and an interval of 15 gives four slots per hour.
    Minutes are always zero-padded to two digits.

    Input:
    interval = The gap in minutes between two neighboring time slots
               (an int, or a value convertible with int())

    Output:
    Time = List of 'HH:MM:SS' strings separated by the specified interval

    Raises:
    ValueError if interval is not a positive number of minutes
    """
    step = int(interval)
    if step <= 0:
        raise ValueError("interval must be a positive number of minutes")
    return [
        '{:02d}:{:02d}:00'.format(hr, minute)
        for hr in range(24)
        for minute in range(0, 60, step)
    ]

In [12]:
def get_apriori_input_data(equipment_list,equipments_data,Dates,Time):
    """
        Build the apriori input data and write a text dump of it to
        'apriori_input_data.txt' in the current working directory.

        Input:
            equipment_list = List of all appliance file names
            equipments_data = Appliance data
            Dates = List of dates
            Time = List of time
        Output:
            apriori_data = The value returned by DataExtractor for these
                           arguments (the input for the apriori step)
    """
    apriori_data = DataExtractor(equipment_list,equipments_data,Dates,Time)
    listToStr = ' '.join([str(elem) for elem in apriori_data])
    # The context manager closes the file even if writing fails.
    with open("apriori_input_data.txt", "w") as f:
        f.write(listToStr)
    return apriori_data

In [13]:
def get_support_and_pattern(apriori_data, minimum_support):
    """
    Encode the transactions with TransactionEncoder and run the apriori
    algorithm on the encoded table.

    Input:
    apriori_data = Transactions handed to TransactionEncoder
                   (list-of-lists of appliances On at a particular time).
    minimum_support = Minimum support for getting the frequent itemsets.

    Output:
    The result of apriori(...) called with use_colnames = True
    (dataframe of support and frequent itemsets)
    """
    encoder = TransactionEncoder()
    encoder.fit(apriori_data)
    encoded = encoder.transform(apriori_data)
    # Column labels come from the item names learned by the encoder.
    frame = pd.DataFrame(encoded, columns = encoder.columns_)
    return apriori(frame, min_support = minimum_support, use_colnames = True)

### Get equipment list and data from channel files

In [14]:
# Directory prefix of the per-channel on/off files that are listed and loaded below
path = "../Utils/Channel_On_Off_data/House_2/"
# Value handed to get_all_time_of_day to build the time slots of a day
interval = 30
# File that receives the apriori input data through np.save
apriori_data_output_file = './apriori_data.npy'
# Text file that receives the sequences built for the Sequential Rule Mining algorithm
output_processed_data_file = './sequnce_values_for_trulegrowth.txt'

In [15]:
# List the channel files found under `path`, then load the entries of each file
equipment_list = get_equipment_list(path)
equipments_data = get_equipment_data(path, equipment_list)

../Utils/Channel_On_Off_data/House_2/channel_6.npy
../Utils/Channel_On_Off_data/House_2/channel_7.npy
../Utils/Channel_On_Off_data/House_2/channel_5.npy
../Utils/Channel_On_Off_data/House_2/channel_4.npy
../Utils/Channel_On_Off_data/House_2/channel_3.npy
../Utils/Channel_On_Off_data/House_2/channel_2.npy
../Utils/Channel_On_Off_data/House_2/channel_11.npy
../Utils/Channel_On_Off_data/House_2/channel_10.npy
../Utils/Channel_On_Off_data/House_2/channel_12.npy
../Utils/Channel_On_Off_data/House_2/channel_13.npy
../Utils/Channel_On_Off_data/House_2/channel_17.npy
../Utils/Channel_On_Off_data/House_2/channel_16.npy
../Utils/Channel_On_Off_data/House_2/channel_14.npy
../Utils/Channel_On_Off_data/House_2/channel_15.npy
../Utils/Channel_On_Off_data/House_2/channel_18.npy
../Utils/Channel_On_Off_data/House_2/channel_19.npy
../Utils/Channel_On_Off_data/House_2/channel_9.npy
../Utils/Channel_On_Off_data/House_2/channel_8.npy


### Get list of Dates and list of times of day

In [16]:
# Earliest and latest recorded date found in the channel data
min_date, max_date = get_min_max_dates(path, equipment_list)
# Every date from min_date to max_date (inclusive) as 'YYYY-MM-DD' strings,
# and the time slots of a day for the configured interval
Dates = get_dates_list(min_date, max_date)
Time = get_all_time_of_day(interval)

### Get apriori input data and retrieve patterns using apriori algorithm

In [17]:
# Build the per-day transactions (this call also writes apriori_input_data.txt)
apriori_data = get_apriori_input_data(equipment_list, equipments_data, Dates, Time)
# The data is a nested list (days -> time slots -> items) whose inner lists can
# differ in length. Recent NumPy versions refuse to build a regular array from
# such ragged input, so it is stored explicitly as an object array.
np.save(apriori_data_output_file, np.array(apriori_data, dtype=object))

### Process apriori data into the input format required by the Sequential Rule Mining algorithm

In [18]:
"""
   The input to the Sequential Rule Mining algorithm should be in the form of:
   1 -1 1 2 -1 3 1 -1 -2
   3 -1 3 5 -1 3 4 -1 -2

   where, -2 indicates the end of a sequence/day
          -1 indicates the end of an itemset
          itemset = data at a particular time

   Initial processing: 
    1. Make the first itemset of the sequence 0 (default value), as we require appliance:time instances
    2. Get the time and channel data as a single string
        Our apriori data looks like - 00:00:00 channel_8, so after processing it will look like 0000008

   Creating sequences:
    For example, a typical sequence for a day looks like:
    00:00:00 channel_8, channel_10
    01:30:00 channel_8, channel_9, and so on

    After processing, the above data will look like:
    0 -1 0000008 00000010 -1 0130008 0130009 -1 ... -1 -2

   Additional processing:
    Apriori channel data at a particular time has been processed so as to avoid data loss on running the TRuleGrowth algorithm
    TRuleGrowth omits the 0s at the beginning of the string
    Hence, we have replaced the initial 0s with 9s
    For example, 0300009 will be converted to 9300009

   Exceptional case:

    Problem: With the above processing, 09:30:00 gets converted to 993000, which is also the result for 00:30:00 (003000 -> 993000), so it would be interpreted as 003000 on the output end. This creates an anomaly.

    Solution: If the leading 0 is followed by 9, replace it with 99 instead of a single 9
              So, 09:30:00 will be converted to 9993000 and will be interpreted as 093000 on the output end.
"""
# Build one text line per day: "0 -1 <items> -1 <items> -1 ... -2".
sequence_lines = []
for date_data in apriori_data:
    # Every sequence starts with the default itemset "0", closed by -1.
    tokens = ["0", "-1"]
    for time_data in date_data:
        for channel_data in time_data:
            # "HH:MM:SS channel_N" -> "HHMMSSN"
            time_str = channel_data.replace(':', '').replace(' ', '').replace('channel_', '')
            if time_str.startswith(('090000', '093000')):
                # A leading 0 directly followed by 9 becomes "99", so that the
                # result differs from the encoding of the matching "00..." time.
                # NOTE: only the 09:00:00 and 09:30:00 slots are special-cased.
                time_str = "99" + time_str[1:]
            else:
                # Replace every leading 0 with 9; the rest stays untouched.
                leading_zeros = len(time_str) - len(time_str.lstrip('0'))
                time_str = "9" * leading_zeros + time_str[leading_zeros:]
            tokens.append(time_str)
        # -1 closes the itemset of this time slot.
        tokens.append("-1")
    # -2 closes the sequence of this day.
    tokens.append("-2")
    sequence_lines.append(" ".join(tokens) + "\n")
final_string = "".join(sequence_lines)
# Add all sequences in a txt file to be provided as an input to the Sequential Rule Mining algorithm
with open(output_processed_data_file, "w") as f:
    f.write(final_string)