# Data Exploration for Elevators dataset

In [7]:
import numpy as np
import pandas as pd
import os

from torchvision.datasets.utils import download_and_extract_archive
from sklearn.preprocessing import StandardScaler

## Elevators dataset from SSDKL paper

In [8]:
def load_dataset_from_ssdkl(path_to_folder):
    """Load the `X.npy` and `y.npy` arrays stored in a folder.

    Parameters
    ----------
    path_to_folder : path-like
        Folder that contains the two files `X.npy` and `y.npy`.

    Returns
    -------
    tuple
        `(X, y)` — the arrays read from `X.npy` and `y.npy`, in that order.
    """
    features_path = os.path.join(path_to_folder, "X.npy")
    targets_path = os.path.join(path_to_folder, "y.npy")

    features, targets = np.load(features_path), np.load(targets_path)
    return features, targets


In [9]:
# Folder that holds the SSDKL version of the dataset (expects X.npy and y.npy inside).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH_TO_DATASET = "/home/flo/ssdgm/notebooks/datasets/SSDKL/elevators"
X_ssdkl, y_ssdkl = load_dataset_from_ssdkl(PATH_TO_DATASET)
# Display the shapes of both arrays as the cell output.
X_ssdkl.shape, y_ssdkl.shape

## Elevators dataset from UCI repo (downloaded from the `~ltorgo/Regression` collection at dcc.fc.up.pt)

In [12]:
# Archive to download, and the local folder it is downloaded to and extracted in.
# NOTE(review): RAW_FOLDER is an absolute, machine-specific path — adjust before running elsewhere.
URL = "https://www.dcc.fc.up.pt/~ltorgo/Regression/elevators.tgz"
RAW_FOLDER = "/home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw"

In [13]:
# Fetch the archive into RAW_FOLDER and extract it there; remove_finished=True asks
# torchvision to delete the archive file once extraction has finished.
download_and_extract_archive(url=URL, download_root=RAW_FOLDER, remove_finished=True)

Using downloaded and verified file: /home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw/elevators.tgz
Extracting /home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw/elevators.tgz to /home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw


In [14]:
# Make sure the downloaded archive is gone. The download call above already passes
# `remove_finished=True`, so the file is normally deleted by then; tolerate a missing
# file so that this cell does not fail with FileNotFoundError on a clean run or a re-run.
try:
    os.remove(os.path.join(RAW_FOLDER, os.path.basename(URL)))
except FileNotFoundError:
    pass  # archive was already removed — nothing left to clean up

In [15]:
# "Elevators" sub-folder of RAW_FOLDER; the data/test/domain file paths are built from it.
DATA_DIR = os.path.join(RAW_FOLDER, "Elevators")

In [16]:
# Base name shared by the dataset files: the archive name up to its first dot.
FILENAME = os.path.basename(URL).partition('.')[0]

# The `.data`, `.test` and `.domain` files are all looked up in DATA_DIR under that base name.
DATAFILE, TESTFILE, DOMAINFILE = (
    os.path.join(DATA_DIR, FILENAME + suffix)
    for suffix in (".data", ".test", ".domain")
)

print(DATAFILE, TESTFILE, DOMAINFILE)

/home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw/Elevators/elevators.data /home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw/Elevators/elevators.test /home/flo/ssdgm/notebooks/datasets/UCI/Elevators/raw/Elevators/elevators.domain


In [17]:
# Read the domain file without a header row, so its first field ends up in column 0.
df_domain = pd.read_csv(DOMAINFILE, header=None)


In [18]:
# Column names: for every entry in column 0 of the domain file, keep the text before the first ':'.
col_names = df_domain[0].map(lambda entry: entry.split(':')[0])

In [139]:
# Read both files with the column names taken from the domain file, then stack them
# row-wise into one frame.
df_data = pd.read_csv(DATAFILE, names=col_names)
df_test = pd.read_csv(TESTFILE, names=col_names)
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index. Without it both
# frames keep their own labels starting at 0, so the result has duplicate row labels
# and label-based lookups such as `df_combined.loc[0]` return more than one row.
df_combined = pd.concat([df_data, df_test], axis=0, ignore_index=True)
df_combined.head()

Unnamed: 0,climbRate,Sgz,p,q,curRoll,absRoll,diffClb,diffRollRate,diffDiffClb,SaTime1,SaTime2,SaTime3,SaTime4,diffSaTime1,diffSaTime2,diffSaTime3,diffSaTime4,Sa,Goal
0,118,-55,-0.28,-0.08,-0.2,-11,11,0.005,-0.2,-0.001,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.0,-0.001,0.031
1,390,-45,-0.06,-0.07,-0.6,-12,11,0.01,-0.2,-0.0008,-0.0008,-0.0008,-0.0008,0.0,0.0,0.0,0.0,-0.0008,0.034
2,68,6,0.11,0.15,0.6,-10,-9,-0.003,-0.2,-0.0011,-0.001,-0.001,-0.001,-0.0002,0.0,0.0,0.0,-0.001,0.033
3,-358,-12,-0.2,0.13,-0.3,-11,-7,0.001,-0.1,-0.001,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.0,-0.001,0.032
4,-411,-19,-0.18,0.02,-0.5,-11,-3,0.002,1.2,-0.001,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.0,-0.001,0.03


## Conclusion

- Use `Goal` as the target variable.
- No cleaning steps are necessary.