In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

%matplotlib inline

<h1>Exploration of the Toy Dataset</h1>

In [2]:
df = pd.read_csv("toy_dataset.txt", delimiter=';')
form = '%Y-%d-%m %H:%M'
df['date'] = pd.to_datetime(df['date'], format=form)
df = df.set_index('date')

In [3]:
df.head()

Unnamed: 0_level_0,activity
date,Unnamed: 1_level_1
2017-11-29 07:51:00,kitchen
2017-11-29 07:59:00,breakfast
2017-11-29 08:02:00,coffee
2017-11-29 12:42:00,kitchen
2017-11-29 12:46:00,lunch


In [4]:
import math
def extract_transactions(df, Tep = 30) :
    """
    Tep: in minutes
    """
    
    tep_days = int(math.floor(Tep/60))
    tep_minutes = Tep - tep_days*60
    start_time = df.index[0] #Start time of the dataset

    df['trans_id'] = 0
    # Tep = 30mn, Max time for an activity (episode occurrence)
    current_start_time = start_time
    current_trans_id = 0

    transactions = []
    while True:
        current_trans_id += 1
        current_end_time = current_start_time + dt.timedelta(hours=tep_days, minutes=tep_minutes)
        transactions.append(list(df.loc[(df.index >= current_start_time) & (df.index < current_end_time)].activity.values))
        df.loc[(df.index >= current_start_time) & (df.index < current_end_time), 'trans_id'] = current_trans_id

        if len(df.loc[df.index > current_end_time]) > 0 :
            current_start_time =  df.loc[df.index > current_end_time].index[0]
        else :
            break
    
    return df, transactions

df, transactions = extract_transactions(df, Tep=60)
print(transactions)
df

[['kitchen', 'breakfast', 'coffee'], ['kitchen', 'lunch', 'coffee'], ['kitchen', 'breakfast', 'coffee'], ['kitchen', 'lunch', 'coffee'], ['kitchen', 'breakfast', 'coffee'], ['kitchen', 'lunch'], ['kitchen', 'breakfast', 'coffee'], ['kitchen', 'lunch', 'coffee']]


Unnamed: 0_level_0,activity,trans_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-29 07:51:00,kitchen,1
2017-11-29 07:59:00,breakfast,1
2017-11-29 08:02:00,coffee,1
2017-11-29 12:42:00,kitchen,2
2017-11-29 12:46:00,lunch,2
2017-11-29 13:06:00,coffee,2
2017-11-30 07:43:00,kitchen,3
2017-11-30 07:57:00,breakfast,3
2017-11-30 08:02:00,coffee,3
2017-11-30 12:02:00,kitchen,4


In [5]:
import fp_growth

episodes_dict = fp_growth.find_frequent_patterns(transactions, 2)
print(episodes_dict)

{('breakfast', 'coffee'): 4, ('breakfast', 'coffee', 'kitchen'): 4, ('coffee', 'kitchen', 'lunch'): 3, ('kitchen', 'lunch'): 4, ('coffee', 'kitchen'): 7, ('kitchen',): 8}


In [49]:
def find_occurrences(df, occurrences_df, episode):
    occ_trans_id = list(df.trans_id.unique())
    for item in episode:
        occ_trans_id = list(set(occ_trans_id).intersection(list(df[df.activity == item].trans_id.unique())))
    
    for id in occ_trans_id:
        start_time = df[(df.trans_id == id) & (df.activity == episode[0])].index[0] #First event of the episode
        end_time = df[(df.trans_id == id) & (df.activity == episode[-1])].index[0] #Last event of the episode
        occurrences_df.loc[len(occurrences_df)+1] = [episode, start_time, end_time]
        
    
    return occ_trans_id

<h1>Candidate study step</h1>

In [50]:
#Dataframe for every episodes occurrences, each row is an occurrence for a specific episode
occurrences_df = pd.DataFrame(columns = ['episode', 'start_time', 'end_time'])

for episode in episodes_dict.keys() :
    #print("Episode :", episode, "Support:", episodes_dict[episode])
    find_occurrences(df, occurrences_df, episode)

#Candidates periods for GMM (only T=24hours for now)
candidate_periods = [dt.timedelta(seconds=3600*2)]


#INPUTS
# Occurences_df : a dataframe with all episodes occurences sorted by start_time
# deltaTmax : If there is a gap > deltaTmax between two occurrences of an episode, 
#            the occurrences before and after the gap are split (different validity intervals). [3 times the candidate period]
# support_treshold : minimal support
# std_max : maximal standard deviation considered as normal
# accuracy_min : Minimal accuracy for a periodicity description to be considered as
#               interesting, and thus factorized.


for episode in episodes_dict.keys() :
    occ_df = occurrences_df.loc[occurrences_df.episode == episode]
    occ_df = occ_df.sort_values(["start_time", "end_time"], ascending=True)
    
    #Compute time interval since the last occurence
    occurrences_df.loc[occurrences_df.episode == episode, "time_since_last_occ"] = occ_df['start_time'] - occ_df['end_time'].shift(1)
    
    #First row 'time_since_last_occ' is NaT so we replace by a duration of '0'
    occurrences_df.fillna(0, inplace=True)
    
    for candidate_period in candidate_periods:
        deltaTmax = 3*candidate_period #mentionned in the INPUTS
        
        occ_df = occurrences_df.loc[occurrences_df.episode == episode]
        #Spit the occurrences in groups
        group_gap_bounds = [dt.datetime.min, dt.datetime.max]
        
        # [min_time, insertion of groups bound, max_time]
        group_gap_bounds[1:1] = list(occ_df[occ_df.time_since_last_occ > deltaTmax]['start_time'])
        
        print(group_gap_bounds)
        for group_index in range(len(group_gap_bounds)-1):
            print(group_index)
            occurrences_df.loc[(occurrences_df.episode == episode)
                               & (occurrences_df.start_time >= group_gap_bounds[group_index])
                               & (occurrences_df.start_time < group_gap_bounds[group_index+1]), "group_id"] = group_index 
        
    

occurrences_df.fillna(0, inplace=True)
occurrences_df.sort_values(['episode', 'start_time'], ascending=True)

ValueError: to assemble mappings requires at least that [year, month, day] be specified: [day,month,year] is missing