# Data Exploration for Electric dataset

In [42]:
import numpy as np
import pandas as pd
import os

from torchvision.datasets.utils import download_url, download_and_extract_archive
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## `Electric` from `SSDKL`

In [43]:
def load_dataset_from_ssdkl(path_to_folder):
    X = np.load(os.path.join(path_to_folder, "X.npy"))
    y = np.load(os.path.join(path_to_folder, "y.npy"))

    return X, y

PATH_TO_DATASET = "/home/flo/ssdgm/notebooks/datasets/SSDKL/electric"
X_ssdkl, y_ssdkl = load_dataset_from_ssdkl(PATH_TO_DATASET)
X_ssdkl.shape, y_ssdkl.shape

((2049280, 6), (2049280,))

## `Electric` from `UCI`

In [44]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
DOWNLOAD_FOLDER = "/home/flo/ssdgm/notebooks/datasets/UCI/Electric/raw"

download_and_extract_archive(url=URL, download_root=DOWNLOAD_FOLDER)
os.remove(os.path.join(DOWNLOAD_FOLDER, os.path.basename(URL)))

Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip to /home/flo/ssdgm/notebooks/datasets/UCI/Electric/raw/household_power_consumption.zip


  0%|          | 0/20640916 [00:00<?, ?it/s]

Extracting /home/flo/ssdgm/notebooks/datasets/UCI/Electric/raw/household_power_consumption.zip to /home/flo/ssdgm/notebooks/datasets/UCI/Electric/raw


In [58]:
FILE = "/home/flo/ssdgm/notebooks/datasets/UCI/Electric/raw/household_power_consumption.txt"
df = pd.read_csv(FILE, sep=";")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [59]:
df["Global_active_power"][0]

'4.216'

In [60]:
df.shape

(2075259, 9)

In [61]:
df.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [62]:
df = df.drop(["Date", "Time"], axis=1)
df = df.dropna()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,4.216,0.418,234.840,18.400,0.000,1.000,17.0
1,5.360,0.436,233.630,23.000,0.000,1.000,16.0
2,5.374,0.498,233.290,23.000,0.000,2.000,17.0
3,5.388,0.502,233.740,23.000,0.000,1.000,17.0
4,3.666,0.528,235.680,15.800,0.000,1.000,17.0
...,...,...,...,...,...,...,...
2075254,0.946,0.0,240.43,4.0,0.0,0.0,0.0
2075255,0.944,0.0,240.0,4.0,0.0,0.0,0.0
2075256,0.938,0.0,239.82,3.8,0.0,0.0,0.0
2075257,0.934,0.0,239.7,3.8,0.0,0.0,0.0


In [65]:
df = df.apply(pd.to_numeric)


In [66]:
df.dtypes

Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtype: object

In [15]:
df.isna().any()

Global_active_power      False
Global_reactive_power    False
Voltage                  False
Global_intensity         False
Sub_metering_1           False
Sub_metering_2           False
Sub_metering_3            True
dtype: bool

In [17]:
df = df.dropna()
df.shape

In [25]:
df.iloc[:, 1:]

Unnamed: 0,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,0.418,234.840,18.400,0.000,1.000,17.0
1,0.436,233.630,23.000,0.000,1.000,16.0
2,0.498,233.290,23.000,0.000,2.000,17.0
3,0.502,233.740,23.000,0.000,1.000,17.0
4,0.528,235.680,15.800,0.000,1.000,17.0
...,...,...,...,...,...,...
2075254,0.0,240.43,4.0,0.0,0.0,0.0
2075255,0.0,240.0,4.0,0.0,0.0,0.0
2075256,0.0,239.82,3.8,0.0,0.0,0.0
2075257,0.0,239.7,3.8,0.0,0.0,0.0


In [41]:
scaler = StandardScaler()
scaler.fit_transform(df.iloc[:1000000, :])

array([[ 2.77451546,  2.71497052, -1.54312358, ..., -0.1858494 ,
        -0.07771794,  1.36085175],
       [ 3.79123494,  2.87850713, -1.9134015 , ..., -0.1858494 ,
        -0.07771794,  1.2386045 ],
       [ 3.80367731,  3.4417999 , -2.01744653, ..., -0.1858494 ,
         0.07854143,  1.36085175],
       ...,
       [ 0.44956952,  0.00753106,  1.16816762, ...,  0.13083904,
        -0.23397731,  1.48309901],
       [ 0.41757486, -0.24685922,  1.21712998, ..., -0.02750518,
        -0.23397731,  1.60534626],
       [ 0.40157752, -0.31954216,  1.07330303, ..., -0.02750518,
        -0.23397731,  1.48309901]])

In [33]:
df.iloc[:, 0].apply(lambda x: float(x)).std()

1.057294161093983

In [19]:
X_ssdkl

array([[ 2.61072061, -1.85181608,  3.09878851, -0.18233673, -0.05127425,
         1.24942076],
       [ 2.77040557, -2.2252745 ,  4.13379998, -0.18233673, -0.05127425,
         1.13089735],
       [ 3.32043153, -2.33021323,  4.13379998, -0.18233673,  0.12048729,
         1.24942076],
       ...,
       [-1.09751893, -0.31477235, -0.18624791, -0.18233673, -0.22303579,
        -0.7654772 ],
       [-1.09751893, -0.35180955, -0.18624791, -0.18233673, -0.22303579,
        -0.7654772 ],
       [-1.09751893, -0.39810605, -0.18624791, -0.18233673, -0.22303579,
        -0.7654772 ]])

In [39]:
y_ssdkl.mean(), y_ssdkl.std()

(-3.319448678730157e-13, 0.9647821385712089)