# Load occupancy data: RC-60-Mix

In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Read the space-delimited, header-less text files holding the features (X)
# and the matching labels (y), then show both shapes.
root_folder = 'preprocessed_data/'
f_x, f_y = (root_folder + suffix for suffix in ('058_5p_to_1p_X.txt', '058_5p_to_1p_y.txt'))
data, labels = (pd.read_csv(path, sep=' ', header=None) for path in (f_x, f_y))
print(data.shape, labels.shape)

(4340, 5976) (4340, 1)


In [3]:
# Number of samples per occupancy label. Labels run opposite to the head
# count; the legend is listed below the expression so the counts stay the
# cell's displayed value.
labels.value_counts()
# label 1: 5 people
# label 2: 4 people
# label 3: 3 people
# label 4: 2 people
# label 5: 1 person

1    902
3    902
4    902
2    852
5    782
dtype: int64

In [4]:
def count_HOV_LOV(labels):
    """Print how many samples are HOV (label 1) and how many are LOV (label 2).

    Parameters
    ----------
    labels : array-like
        Binary HOV/LOV labels of any shape, e.g. ``(n,)`` or ``(n, 1)``.
        Every entry must be 1 or 2.

    Raises
    ------
    SystemExit
        If any entry is neither 1 nor 2. Nothing is printed in that case.
    """
    # Flatten rather than squeeze: squeezing a (1, 1) input produces a scalar
    # that cannot be iterated, and plain lists have no ``.shape`` attribute.
    flat = np.asarray(labels).reshape(-1)
    is_hov = flat == 1
    is_lov = flat == 2
    if not np.all(is_hov | is_lov):
        raise SystemExit("should have two unique labels in HOV_LOV problem!")
    print('HOV : {0}'.format(int(np.count_nonzero(is_hov))))
    print('LOV : {0}'.format(int(np.count_nonzero(is_lov))))
    

In [6]:
def transfer_to_HOV_LOV(labels):
    """Map exact-occupancy labels (1-5) to binary HOV/LOV labels.

    Labels 1, 2 and 3 (5, 4 and 3 people) become HOV = 1; labels 4 and 5
    (2 people and 1 person) become LOV = 2.

    Parameters
    ----------
    labels : array-like
        Occupancy labels of any shape, e.g. ``(n,)`` or ``(n, 1)``.
        Every entry must be one of 1, 2, 3, 4, 5.

    Returns
    -------
    numpy.ndarray
        A new integer array of shape ``(n, 1)`` holding 1 (HOV) or 2 (LOV).
        The input is never modified.

    Raises
    ------
    SystemExit
        If any entry is not one of the five occupancy labels.
    """
    # Work on a flattened view and build a fresh result: this handles a single
    # (1, 1) sample and avoids writing into the caller's array.
    flat = np.asarray(labels).reshape(-1)
    is_hov = np.isin(flat, (1, 2, 3))
    is_lov = np.isin(flat, (4, 5))
    if not np.all(is_hov | is_lov):
        raise SystemExit("invalid label")
    return np.where(is_hov, 1, 2).reshape(-1, 1)

In [7]:
# Collapse the per-sample occupancy labels (1-5) into HOV (1) / LOV (2).
hov_lov_labels = transfer_to_HOV_LOV(labels)

In [8]:
# Class sizes of the loaded samples, before any 2D data is generated.
count_HOV_LOV(hov_lov_labels)

HOV : 2656
LOV : 1684


# RC-60-Mix: extract the sub-data for each label and produce 2D data

In [9]:
from utils.sampling import extract_car_data, generate_2d_data, get_all_cars_2d

In [10]:
# Arguments forwarded to get_all_cars_2d in the next cell.
# NOTE(review): their exact meaning is defined in utils.sampling, not in this
# notebook. bin_length=6 matches the second axis of the resulting
# (4310, 6, 5976) array; confirm stride and total_bin against that module.
bin_length, stride = 6, 1
total_bin = None

In [11]:
# Build the 2D samples for all five occupancy labels in one call. The original
# 1-5 labels are passed here, not the HOV/LOV ones.
unique_labels = [1, 2, 3, 4, 5]
aggregate_data, aggregate_label = get_all_cars_2d(data, labels, unique_labels, bin_length, stride, total_bin)

In [12]:
# Shapes of the stacked 2D samples and of their label vector.
aggregate_data.shape, aggregate_label.shape

((4310, 6, 5976), (4310,))

# Convert exact occupancy labels to HOV/LOV
* (1) LOV: 1 or 2 people => label = 2 
* (2) HOV: 3, 4, or 5 people => label = 1

In [13]:
# Convert a copy, so aggregate_label keeps the exact 1-5 occupancy labels.
new_aggregate_label = transfer_to_HOV_LOV(aggregate_label.copy())

In [14]:
# Sanity check: only the two binary labels should remain. The built-in min/max
# iterate over the rows of the (n, 1) array, hence one-element arrays in the output.
min(new_aggregate_label), max(new_aggregate_label)

(array([1]), array([2]))

In [15]:
# HOV/LOV class sizes after generating the 2D data.
count_HOV_LOV(new_aggregate_label)

HOV : 2638
LOV : 1672


In [16]:
# Imbalance ratio: fraction of samples that are LOV (label 2). Computed from
# the labels themselves so it stays correct if the data or binning changes,
# instead of relying on counts copied by hand from the previous output.
float(np.mean(new_aggregate_label == 2))

0.38793503480278424

In [17]:
# Confirm both the samples and the binary labels are NumPy arrays.
type(aggregate_data), type(new_aggregate_label)

(numpy.ndarray, numpy.ndarray)

# Oversampling (re-balancing) LOV to match HOV

## (1) shuffle HOV and LOV sub-datasets

In [18]:
def extract_HOV_LOV_indices(new_aggregate_label):
    """Return the row indices of the HOV and the LOV samples.

    The labels are wrapped in a DataFrame and only its first column is
    inspected: rows equal to 1 are HOV, rows equal to 2 are LOV.

    Returns
    -------
    tuple of (list, list)
        ``(HOV_index, LOV_index)``, each a list of DataFrame index labels.
    """
    first_column = pd.DataFrame(new_aggregate_label).iloc[:, 0]

    def rows_matching(value):
        # Index labels of the rows whose label equals ``value``.
        return first_column.index[first_column == value].tolist()

    return rows_matching(1), rows_matching(2)

In [19]:
# Split the samples and their labels into the two classes by row position.
HOV_index, LOV_index = extract_HOV_LOV_indices(new_aggregate_label)
HOV_data = aggregate_data[HOV_index]
LOV_data = aggregate_data[LOV_index]
HOV_labels = new_aggregate_label[HOV_index]
LOV_labels = new_aggregate_label[LOV_index]

In [20]:
# Per-class sample counts: the first axis should match the HOV/LOV counts above.
HOV_data.shape, LOV_data.shape

((2638, 6, 5976), (1672, 6, 5976))

In [22]:
# The label arrays keep their trailing length-1 axis.
HOV_labels.shape, LOV_labels.shape

((2638, 1), (1672, 1))

In [24]:
import tensorflow as tf
# Shuffle buffer size, in elements, used by make_ds below.
BUFFER_SIZE = 1000

def make_ds(data, labels, num_classes):
    """Build an endlessly repeating, shuffled tf.data pipeline for one class.

    Parameters
    ----------
    data : array-like
        Samples; the first axis indexes the samples.
    labels : array-like
        Integer class labels, one per sample. They are shifted down by one
        before one-hot encoding, so the smallest expected label is 1.
    num_classes : int
        Width of the one-hot encoding.

    Returns
    -------
    tf.data.Dataset
        Infinite dataset of ``(sample, one_hot_label)`` pairs.
    """
    labels = tf.keras.utils.to_categorical(labels-1, num_classes)
    ds = tf.data.Dataset.from_tensor_slices((data, labels))
    # A shuffle buffer smaller than the dataset only mixes elements that sit
    # close together, so early draws would come almost entirely from the first
    # BUFFER_SIZE rows. Cover the whole dataset for a uniform shuffle;
    # BUFFER_SIZE remains the lower bound.
    buffer_size = max(BUFFER_SIZE, len(data))
    ds = ds.shuffle(buffer_size).repeat()
    return ds
# One infinite, shuffled stream per class; labels are one-hot over two classes.
num_classes = 2
HOV_ds = make_ds(HOV_data, HOV_labels, num_classes)
LOV_ds = make_ds(LOV_data, LOV_labels, num_classes)

2023-07-10 18:11:53.075085: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-10 18:11:55.396089: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38251 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:2f:00.0, compute capability: 8.0


## (2) resampling HOV and LOV

In [25]:
# NOTE(review): BATCH_SIZE is not used anywhere in the visible part of the notebook.
BATCH_SIZE = 64
# Draw from the HOV and LOV streams with equal probability, so the smaller LOV
# class is oversampled to parity with HOV.
resampled_ds = tf.data.Dataset.sample_from_datasets([HOV_ds, LOV_ds], weights=[0.5, 0.5])

In [27]:
# Peek at one element to confirm the pipeline yields one-hot label vectors.
for features, label in resampled_ds.take(1):
    print(label) 

tf.Tensor([0. 1.], shape=(2,), dtype=float32)


## (3) split the resampled data 80:20 into train and test sets

In [None]:
# NOTE(review): HOV_ds / LOV_ds are shuffled and repeated endlessly, so take()
# and skip() on resampled_ds do not partition a fixed set of samples: the same
# sample can appear in both splits. For a leak-free evaluation, split the raw
# samples first and oversample only the training part.
TEST_FRACTION = 0.2
# Treat one "epoch" of the balanced stream as twice the majority-class size.
resampled_size = 2 * max(len(HOV_data), len(LOV_data))
# take() / skip() need integer element counts.
test_size = int(resampled_size * TEST_FRACTION)
test_dataset = resampled_ds.take(test_size)
train_dataset = resampled_ds.skip(test_size).take(resampled_size - test_size)

In [29]:
# Display the element spec (shapes and dtypes) of the resampled pipeline.
resampled_ds

<_DirectedInterleaveDataset element_spec=(TensorSpec(shape=(6, 5976), dtype=tf.float64, name=None), TensorSpec(shape=(2,), dtype=tf.float32, name=None))>

# Train a 2D CNN on RC-60-Mix

In [48]:
from das_lib.models import cnn2d_dropout
from das_lib.funs import split_train_test_2d

In [None]:
# NOTE(review): the star import is kept because later cells may rely on names
# it provides; prefer importing the required names explicitly.
from das_lib.funs import *
test_frac = .2
# Add the trailing channel axis only once, so re-running this cell does not
# keep appending dimensions to aggregate_data.
if aggregate_data.ndim == 3:
    aggregate_data = np.expand_dims(aggregate_data, -1)
# Shift the HOV/LOV labels (1, 2) to (0, 1) before one-hot encoding.
y_one_hot = tf.keras.utils.to_categorical(new_aggregate_label-1, num_classes=2)
x_train, x_test, y_train, y_test, training_index = split_train_test_2d(aggregate_data, y_one_hot, test_frac)
# The model input is (rows, columns, 1 channel); the output width is the
# number of one-hot classes.
m = cnn2d_dropout( (x_train.shape[1], x_train.shape[2],1), y_train.shape[1])
print(m.summary())