# Distill Example

## Imports ##

In [2]:
import json
import os
import tempfile

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff

import distill

## Data Setup ##

In [32]:
# Read the sample logs shipped with the test data.
with open('./tests/data/sample_data.json') as json_file:
    raw_data = json.load(json_file)

# Key every log by the UUID Distill derives for it.
data = {distill.getUUID(log): log for log in raw_data}

# Rewrite each log's clientTime in place according to `date_type`.
# A `date_type` that matches none of the converters leaves clientTime as is.
date_type = "datetime"
converters = {
    "integer": lambda value: distill.epoch_to_datetime(value),
    "datetime": lambda value: pd.to_datetime(value, unit='ms', origin='unix'),
    "string": lambda value: str(value),
}
convert = converters.get(date_type)
for uid, log in data.items():
    client_time = log['clientTime']
    if convert is not None:
        log['clientTime'] = convert(client_time)

# Order the (uuid, log) pairs chronologically and rebuild a dict that
# preserves that order.
sorted_data = sorted(data.items(), key=lambda item: item[1]['clientTime'])
sorted_dict = dict(sorted_data)

## Making Toy Segments ##

In [35]:
# Split the time-sorted logs into consecutive fixed-length windows; the
# printed start/end times show each window spanning 5 seconds, and the names
# are the label followed by a running index.
segments = distill.generate_fixed_time_segments(sorted_dict, 5, label="generated")

print(segments)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated1, num_logs=0, start=2021-06-14 17:31:35.656000, end=2021-06-14 17:31:40.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
]


### Returning Segments object ###

In [36]:
# Inspect the type of the object returned by the segment generator.
type(segments)

distill.segmentation.segments.Segments

### Iteration ###

In [37]:
# A `Segments` object can be looped over directly; print each segment's name.
for segment in segments:
    print(segment.get_segment_name())

generated0
generated1
generated2
generated3


In [38]:
# Loop over the segments and print how many logs each one holds.
for segment in segments:
    print(segment.get_num_logs())

3
0
9
7


### List comprehensions work as usual directly on the `Segments` object ###

In [39]:
# Collect each segment's log count by reading the `num_logs` attribute directly.
number_of_logs = [segment.num_logs for segment in segments]
print(number_of_logs)

[3, 0, 9, 7]


In [40]:
# Collect each segment's name by reading the `segment_name` attribute directly.
segment_names = [segment.segment_name for segment in segments]
print(segment_names)

['generated0', 'generated1', 'generated2', 'generated3']


### Using Subscripts ###

In [41]:
# Integer subscripts return the segment stored at that position.
for position in range(4):
    print(segments[position].get_segment_name())

generated0
generated1
generated2
generated3


### `Segments` are Mutable ###

In [42]:
# Assigning to an attribute of an indexed segment changes it in place; the
# printout of the whole collection reflects the new name.
segments[1].segment_name = "new_name"

print(segments)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=new_name, num_logs=0, start=2021-06-14 17:31:35.656000, end=2021-06-14 17:31:40.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
]


### `Segments` Filtering ###

In [43]:
# Let's say we don't want segments with fewer than 3 logs.
# The filtered result is bound to a new name, so `segments` still refers to
# the unfiltered collection.
num_logs_segments = segments.get_num_logs(3)

print(num_logs_segments)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
]


In [44]:
# We may only want segments that happened before a given time.
# The segments hold pandas Timestamps (clientTime was converted with
# pd.to_datetime during data setup), so the epoch-millisecond cutoff must be
# converted the same way; passing the raw integer raises
# "TypeError: '<' not supported between instances of 'Timestamp' and 'int'".
cutoff = pd.to_datetime(1623691905656, unit='ms', origin='unix')  # 2021-06-14 17:31:45.656
results = segments.get_segments_before(cutoff)
print(results)

In [45]:
# Maybe we only want segments of a certain segment type.
# Every segment generated above is FIXED_TIME, so all of them are returned.
segment_type = segments.get_segments_of_type(distill.Segment_Type.FIXED_TIME)
print(segment_type)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=new_name, num_logs=0, start=2021-06-14 17:31:35.656000, end=2021-06-14 17:31:40.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
]


In [46]:
# We could also rebind `segments` to the filtered result, so that every later
# cell works on the filtered collection (segments with fewer than 3 logs are
# dropped).
segments = segments.get_num_logs(3)
print(segments)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
]


### `Segment` objects can be appended to and deleted from a `Segments` object ###

In [47]:
# Keep a reference to the segment at index 1 so it stays available after it
# is removed from the collection.
segment = segments[1]
print(segment)

Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME


In [48]:
# Remove the segment via the segment_name.
# `delete` changes the collection in place; no reassignment is needed.
segments.delete("generated2")
print(segments)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
]


In [49]:
# Add the segment back.
# `append` places it at the end of the collection, so the original
# chronological order is not restored.
segments.append(segment)
print(segments)

Segments: [
Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME
]


## Return different data structures ##

### Dictionary keyed by segment name (the earlier format, which may be obsolete now) ###

In [50]:
# Map each segment name to its segment, then list every key/value pair.
segments_dict = segments.get_segment_name_dict()

for segment_name, segment_obj in segments_dict.items():
    print(f"key={segment_name!s}, value={segment_obj!s}")

key=generated0, value=Segment: name=generated0, num_logs=3, start=2021-06-14 17:31:30.656000, end=2021-06-14 17:31:35.656000, type=Segment_Type.FIXED_TIME
key=generated3, value=Segment: name=generated3, num_logs=7, start=2021-06-14 17:31:45.656000, end=2021-06-14 17:31:50.656000, type=Segment_Type.FIXED_TIME
key=generated2, value=Segment: name=generated2, num_logs=9, start=2021-06-14 17:31:40.656000, end=2021-06-14 17:31:45.656000, type=Segment_Type.FIXED_TIME


### List of Segments ###

In [51]:
# `get_segment_list()` returns a plain Python list, so printing it shows each
# element's default object repr rather than the formatted segment summary.
segments_list = segments.get_segment_list()
print(segments_list)

[<distill.segmentation.segment.Segment object at 0x11cf46e50>, <distill.segmentation.segment.Segment object at 0x11cf46be0>, <distill.segmentation.segment.Segment object at 0x11cf46cd0>]


# Displaying segments with Plotly 

## Define a helper function to visualize `Segments` objects

In [52]:
def display_segments(segments):
    """Show a `Segments` object as an interactive Plotly timeline.

    The segments are exported to CSV with `distill.export_segments`, read back
    into a DataFrame, and drawn with `px.timeline`, using the exported
    "Start Time", "End Time", "Segment Name" and "Number of Logs" columns.

    The intermediate CSV is written inside a private temporary directory
    rather than to a fixed `./test.csv`, so an existing file of that name in
    the working directory is never overwritten or deleted, and the scratch
    file is cleaned up even if the export or the read fails.

    Args:
        segments: The `Segments` object to visualize.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        csv_path = os.path.join(scratch_dir, "segments.csv")
        distill.export_segments(csv_path, segments)
        # read_csv loads the whole file eagerly, so the directory can be
        # removed as soon as this block exits.
        df = pd.read_csv(csv_path)

    fig = px.timeline(df, x_start="Start Time", x_end="End Time", y="Segment Name", color="Number of Logs")
    # Reverse the y axis so rows are listed top-to-bottom in DataFrame order.
    fig.update_yaxes(autorange="reversed")
    fig.show()

In [53]:
# Render the current segments as a timeline.
display_segments(segments)

In [54]:
# Remove the segment named "generated3" from the collection in place.
segments.delete("generated3")

In [55]:
# Render the timeline again for the updated collection.
display_segments(segments)