# Data understanding

In this notebook we label a sample of the data to gain a deeper understanding of it.
The labelling task is split between the team members: each batch contains 50 samples.

In [1]:
# Select who is labelling: make sure exactly one `user = ...` line is uncommented.
# The name picks that user's batch of sample indices and prefixes the output file name.
# user = "felix"
# user = "omar"
# user = "tobias"
user = "collab"

In [2]:
# For labelling the Framework pigeonXT is used: https://github.com/dennisbakhuis/pigeonXT/
# Check the pigeonXT github README, if it does not work out of the box.
!pip install pigeonXT-jupyter

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import json
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
import pigeonXT as pixt
from IPython.display import clear_output

In [4]:
# select 200 random indices from our 10000 entries (four batches of 50 samples)
np.random.seed(0)
# NOTE(review): the population is range(10001) although the data holds 10000 rows
# (valid index labels 0..9999), so the label 10000 could in principle be drawn and
# would then fail the later `df.loc` lookup. It is deliberately left unchanged here:
# altering the population changes the seeded draw and therefore the batches that
# have already been assigned. Confirm no batch contains 10000 before relying on it.
all_indices = np.random.choice(range(10001), 200, replace=False)
#print(all_indices.shape)

# assign one batch of 50 indices to each user
indices = {
    "felix":  all_indices[:50],
    "omar":   all_indices[50:100],
    "tobias": all_indices[100:150],
    "collab": all_indices[150:],
}
print("samples to label:", indices[user].shape[0])

samples to label: 50


In [5]:
# load data: every line of each raw file is one JSON record
FILES=["../data/raw_data/thermal_raw_20210507_full/20210507_1605_3078.txt", "../data/raw_data/thermal_raw_20210507_full/20210507_1605_C088.txt"]
data = []

for path in FILES:
    with open(path, "r") as handle:
        # parse the file line by line and append the records in file order
        data.extend(json.loads(row) for row in handle)

In [6]:
# convert the loaded records to a pandas DataFrame and give it appropriate datatypes
df = (
    pd.DataFrame(data)
    # fix numeric datatypes
    .astype({'Timestamp': 'int32', 'Room Temperature': 'float32'})
    # strip the constant prefix from the sensor id
    .assign(**{'Sensor ID': lambda frame: frame['Sensor ID'].str.replace(r'Sensor_32x32_', r'')})
    # drop sensor_size
    .drop(columns=['Sensor size'])
    # reset index to get it as an 'index' column
    .reset_index(level=0)
)

# optional inspection helpers:
#print(df.dtypes)                            # datatypes
#print("amount of samples:", df.shape[0])    # amount of samples
#df.head(n=3)                                # first 3 rows

In [7]:
# select the samples assigned to the active user (label-based lookup on df's index)
# TODO: continuing from an existing label file of this user is not implemented here
df_user = df.loc[indices[user], :]

# show the size of the full dataset, then preview the user's first rows
print("amount of samples:", len(df))
df_user.head(n=3)

amount of samples: 10000


Unnamed: 0,index,Timestamp,Sensor ID,Room Temperature,RSSI,data
521,521,1620392784,3078,21.299999,-67,"[[12.0, 10.4, 11.1, 10.9, 10.9, 12.0, 11.6, 11..."
2940,2940,1620393022,3078,22.4,-76,"[[11.8, 10.3, 11.2, 11.8, 10.6, 11.3, 11.4, 11..."
226,226,1620392755,3078,21.1,-68,"[[11.5, 10.5, 12.0, 12.5, 12.6, 13.0, 12.6, 12..."


In [8]:
# def show_image(index):
#     """ show single image for annotation
    
#     Args:
#     index: index of the image in df Dataframe
#     """
#     fig, ax = plt.subplots(figsize=(2, 2))
#     ax.set_axis_off()
#     plt.imshow(df.iloc[index].data, interpolation='nearest', aspect='auto')

In [9]:
# # run annotation
# annotations = pixt.annotate(
#   df_user,
#   example_column='index',
#   buttons_in_a_row=5,
#   options=['Human', 'Several humans', 'Non-human heat', 'Ambigous heat', 'No heat'],
#   display_fn=lambda index: show_image(index)
# )

In [10]:
def show_image_with_surrounding(index, n=4, step=25):
    """ show the image and surround images
    
    Args:
    index: index of the image in df Dataframe
    n: amount of pics to show prior and past the frame of interest
    step: amount of frames to skip between the preceeding / succeeding images
    """
    %matplotlib notebook
    fig, axes = plt.subplots(nrows=1, ncols=n, figsize=(n*2, 2), num="preceeding image")
    for ax, i in zip(axes, range(index-(n*step), index, step)):
        ax.set_axis_off()
        ax.set_title('Index: %i' % i)
        ax.imshow(df.iloc[i].data, interpolation='nearest', aspect='auto')
    
    fig, ax = plt.subplots(figsize=(2, 2), num="image to rate")
    ax.set_axis_off()
    ax.imshow(df.iloc[index].data, interpolation='nearest', aspect='auto')
    
    fig, axes = plt.subplots(nrows=1, ncols=n, figsize=(n*2, 2), num="succeeding image")
    for ax, i in zip(axes, range(index+1, index+(n*step)+1)):
        ax.set_axis_off()
        ax.set_title('Index: %i' % i)
        ax.imshow(df.iloc[i].data, interpolation='nearest', aspect='auto')

In [11]:
# run annotation and show surrounding images
# Clear this cell's previous output once, before the annotation widget is created.
# clear_output() returns None, so it must not be passed as `example_process_fn`:
# that only runs the clear a single time here and registers no per-example hook.
# A real hook would have to be a callable (presumably fn(ix, label) - see the
# pigeonXT README before adding one).
clear_output(wait=True)
annotations = pixt.annotate(
  df_user,
  example_column='index',
  buttons_in_a_row=5,
  options=['Human', 'Several humans', 'Non-human heat', 'Ambigous heat', 'No heat'],
  display_fn=show_image_with_surrounding,
)

HTML(value='0 of 50 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(Button(description='Human', style=ButtonStyle()), Button(description='Several hu…

Output()

In [16]:
# verify the data: display the frame returned by pixt.annotate, which holds the
# sample columns plus the `changed` and `label` columns shown in the output
annotations

Unnamed: 0,index,Timestamp,Sensor ID,Room Temperature,RSSI,data,changed,label
521,521,1620392784,3078,21.299999,-67,"[[12.0, 10.4, 11.1, 10.9, 10.9, 12.0, 11.6, 11...",True,Human
2940,2940,1620393022,3078,22.4,-76,"[[11.8, 10.3, 11.2, 11.8, 10.6, 11.3, 11.4, 11...",True,Human
226,226,1620392755,3078,21.1,-68,"[[11.5, 10.5, 12.0, 12.5, 12.6, 13.0, 12.6, 12...",True,Ambigous heat
3030,3030,1620393030,3078,22.4,-73,"[[10.9, 11.1, 11.2, 10.8, 12.0, 12.2, 12.7, 12...",True,Several humans
1427,1427,1620392874,3078,21.799999,-82,"[[10.8, 10.9, 10.7, 10.7, 11.7, 11.4, 11.2, 10...",True,Non-human heat
5951,5951,1620392823,C088,21.5,-65,"[[11.6, 12.4, 12.5, 12.8, 14.3, 12.8, 13.7, 13...",True,Non-human heat
8403,8403,1620393052,C088,22.5,-61,"[[12.7, 12.3, 12.0, 11.5, 12.9, 10.8, 11.7, 11...",True,Human
5609,5609,1620392790,C088,21.299999,-62,"[[12.0, 11.8, 12.6, 12.6, 13.3, 12.9, 13.6, 13...",True,Human
167,167,1620392750,3078,21.1,-74,"[[10.3, 12.3, 11.4, 10.8, 10.9, 12.1, 11.9, 12...",True,Human
1103,1103,1620392843,3078,21.6,-85,"[[11.8, 10.4, 10.9, 11.4, 11.5, 11.9, 12.2, 12...",True,Non-human heat


In [17]:
# write the annotations to a JSON file named <user>_<unix timestamp>_labels.json
saved_at = int(time.time())
annotations.to_json(f"{user}_{saved_at}_labels.json")