In [1]:
import pandas as pd
import numpy as np
import os, sys
import requests
import h5py

In [2]:
# Directory where every downloaded file and generated HDF5 file is written.
# Set the DATASETS_OUT_DIR environment variable to redirect the output without
# editing the notebook; the default keeps the original location.
out_dir = os.environ.get("DATASETS_OUT_DIR", "/home/giacomo/Downloads")

## Protein dataset (45730, 9)

In [3]:
# Source (UCI repository) and local destination of the raw protein (CASP) CSV.
protein_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv"
protein_file = os.path.join(out_dir, 'protein.csv')

r = requests.get(protein_url, allow_redirects=True)
# Fail loudly on HTTP errors instead of silently saving an error page as the dataset.
r.raise_for_status()
with open(protein_file, 'wb') as fh:
    fh.write(r.content)

In [4]:
# Parse the downloaded CSV; the header row supplies the column names and no
# column is promoted to the index.
df = pd.read_csv(protein_file)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,RMSD,F1,F2,F3,F4,F5,F6,F7,F8,F9
0,17.284,13558.3,4305.35,0.31754,162.173,1872791.0,215.359,4287.87,102,27.0302
1,6.021,6191.96,1623.16,0.26213,53.3894,803446.7,87.2024,3328.91,39,38.5468
2,9.275,7725.98,1726.28,0.22343,67.2887,1075648.0,81.7913,2981.04,29,38.8119
3,15.851,8424.58,2368.25,0.28111,67.8325,1210472.0,109.439,3248.22,70,39.0651
4,7.962,7460.84,1736.94,0.2328,52.4123,1021020.0,94.5234,2814.42,41,39.9147


In [5]:
# Cast every column to float before splitting into features and target.
df = df.astype(float)
# Features: the nine F1..F9 columns, as a 2-D array.
X = df[
    ["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9"]
].to_numpy()
# Target: RMSD, reshaped to a single-column 2-D array.
Y = df["RMSD"].to_numpy().reshape(-1, 1)

In [6]:
# Sanity check: the arrays must have the expected number of rows and columns.
n_samples = 45730
assert X.shape == (n_samples, 9)
assert Y.shape == (n_samples, 1)

In [7]:
# Persist features and target as gzip-compressed float64 datasets "X" and "Y".
protein_hdf_file = os.path.join(out_dir, 'protein.hdf5')
with h5py.File(protein_hdf_file, 'w') as hf:
    # The returned dataset handles are not needed (and become invalid once the
    # file is closed), so they are intentionally not kept.
    hf.create_dataset("X", data=X, dtype=np.float64, compression='gzip')
    hf.create_dataset("Y", data=Y, dtype=np.float64, compression='gzip')

## Boston housing dataset (506, 13)

In [8]:
# Source (UCI repository) and local destination of the raw Boston housing file.
boston_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
boston_file = os.path.join(out_dir, 'boston.tsv')

r = requests.get(boston_url, allow_redirects=True)
# Fail loudly on HTTP errors instead of silently saving an error page as the dataset.
r.raise_for_status()
with open(boston_file, 'wb') as fh:
    fh.write(r.content)

In [9]:
# The raw file has no header row and separates columns with runs of whitespace.
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True` option.
df = pd.read_csv(boston_file, index_col=None, sep=r"\s+", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [10]:
# Cast every column to float before splitting into features and target.
df = df.astype(float)
# The regression target is the LAST column (13, MEDV: median home value).
# Column 0 is CRIM (per-capita crime rate), which is a feature; using it as the
# target would also leak the real target into X while still passing the shape checks.
Y = df[13].values.reshape(-1, 1)
# Features: the remaining 13 columns (0..12).
X = df.drop(13, axis=1).values

In [11]:
# Sanity check: the arrays must have the expected number of rows and columns.
n_samples = 506
assert X.shape == (n_samples, 13)
assert Y.shape == (n_samples, 1)

In [12]:
# Persist features and target as gzip-compressed float64 datasets "X" and "Y".
boston_hdf_file = os.path.join(out_dir, 'boston.hdf5')
with h5py.File(boston_hdf_file, 'w') as hf:
    # The returned dataset handles are not needed (and become invalid once the
    # file is closed), so they are intentionally not kept.
    hf.create_dataset("X", data=X, dtype=np.float64, compression='gzip')
    hf.create_dataset("Y", data=Y, dtype=np.float64, compression='gzip')

## Energy dataset (768, 8)

The dataset provides two possible targets: Y1 (heating load) and Y2 (cooling load). We use Y1 (heating load).

In [13]:
# Source (UCI repository) and local destination of the raw energy spreadsheet.
energy_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
energy_file = os.path.join(out_dir, 'energy.xlsx')

r = requests.get(energy_url, allow_redirects=True)
# Fail loudly on HTTP errors instead of silently saving an error page as the dataset.
r.raise_for_status()
with open(energy_file, 'wb') as fh:
    fh.write(r.content)

In [14]:
# `convert_float` was deprecated in pandas 1.3 and removed in pandas 2.0, where
# passing it raises a TypeError. It is not needed here: the frame is cast to
# float explicitly before the arrays are extracted.
df = pd.read_excel(energy_file, engine='openpyxl')
# Depending on the reader version the sheet may come with two trailing unnamed
# columns; drop them when present instead of failing when they are absent.
df = df.drop(["Unnamed: 10", "Unnamed: 11"], axis=1, errors='ignore')
# Remove rows that are completely empty.
df = df.dropna(axis=0, how='all')
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2.0,0.0,0.0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3.0,0.0,0.0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4.0,0.0,0.0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5.0,0.0,0.0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2.0,0.0,0.0,20.84,28.28


In [15]:
# Cast every column to float before splitting into features and target.
df = df.astype(float)
# Features: everything except the two candidate targets.
X = df.drop(columns=["Y1", "Y2"]).to_numpy()
# Target: Y1 (heating load), reshaped to a single-column 2-D array.
Y = df["Y1"].to_numpy().reshape(-1, 1)

In [16]:
# Sanity check: the arrays must have the expected number of rows and columns.
n_samples = 768
assert X.shape == (n_samples, 8)
assert Y.shape == (n_samples, 1)

In [17]:
# Persist features and target as gzip-compressed float64 datasets "X" and "Y".
energy_hdf_file = os.path.join(out_dir, 'energy.hdf5')
with h5py.File(energy_hdf_file, 'w') as hf:
    # The returned dataset handles are not needed (and become invalid once the
    # file is closed), so they are intentionally not kept.
    hf.create_dataset("X", data=X, dtype=np.float64, compression='gzip')
    hf.create_dataset("Y", data=Y, dtype=np.float64, compression='gzip')

## kin40k

The full dataset is hard to find from an official source: the Delve repository only hosts a version with 8192 points, not the full 40k.

GitHub repository with the full data: https://github.com/trungngv/fgp/blob/master/data/kin40k/kin40k_test_data.asc

In [18]:
# Raw kin40k files go into a sub-folder of the common output directory.
kin40k_folder = os.path.join(out_dir, "kin40k")
# The folder is not guaranteed to exist; create it so the writes below cannot
# fail with FileNotFoundError on a fresh machine.
os.makedirs(kin40k_folder, exist_ok=True)

# Remote locations of the four pre-split files (train/test x data/labels).
url_test_y  = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_test_labels.asc"
url_train_y = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_train_labels.asc"
url_test_x  = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_test_data.asc"
url_train_x = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_train_data.asc"
# Matching local destinations.
f_test_y = os.path.join(kin40k_folder, "kin40k_test_labels.asc")
f_train_y = os.path.join(kin40k_folder, "kin40k_train_labels.asc")
f_test_x = os.path.join(kin40k_folder, "kin40k_test_data.asc")
f_train_x = os.path.join(kin40k_folder, "kin40k_train_data.asc")
for (url, file) in [(url_test_y, f_test_y), (url_train_y, f_train_y),
                    (url_test_x, f_test_x), (url_train_x, f_train_x)]:
    r = requests.get(url, allow_redirects=True)
    # Fail loudly on HTTP errors instead of silently saving an error page as data.
    r.raise_for_status()
    with open(file, 'wb') as fh:
        fh.write(r.content)

In [19]:
# Labels: parse the header-less fixed-width files, cast to float, and reshape
# to single-column 2-D arrays.
test_y = (
    pd.read_fwf(f_test_y, header=None, index_col=None)
    .astype(float)
    .to_numpy()
    .reshape(-1, 1)
)
train_y = (
    pd.read_fwf(f_train_y, header=None, index_col=None)
    .astype(float)
    .to_numpy()
    .reshape(-1, 1)
)
# Inputs: same parsing, keeping the 2-D layout produced by the reader.
test_x = (
    pd.read_fwf(f_test_x, header=None, index_col=None)
    .astype(float)
    .to_numpy()
)
train_x = (
    pd.read_fwf(f_train_x, header=None, index_col=None)
    .astype(float)
    .to_numpy()
)

In [20]:
# Sanity check: the split must have the expected sizes and input dimension.
n_test, n_train = 30_000, 10_000
assert test_y.shape == (n_test, 1)
assert train_y.shape == (n_train, 1)
assert test_x.shape == (n_test, 8)
assert train_x.shape == (n_train, 8)

In [21]:
# Persist the pre-split arrays as gzip-compressed float64 datasets.
kin40k_hdf_file = os.path.join(out_dir, 'kin40k.hdf5')
with h5py.File(kin40k_hdf_file, 'w') as hf:
    # The returned dataset handles are not needed (and become invalid once the
    # file is closed), so they are intentionally not kept.
    hf.create_dataset("Y_test", data=test_y, dtype=np.float64, compression='gzip')
    hf.create_dataset("Y_train", data=train_y, dtype=np.float64, compression='gzip')
    hf.create_dataset("X_test", data=test_x, dtype=np.float64, compression='gzip')
    hf.create_dataset("X_train", data=train_x, dtype=np.float64, compression='gzip')