In [2]:
from hummingbird.ml import convert

In [3]:
import zipfile
import urllib.request as urllib

# Zip archive of the YearPredictionMSD dataset on the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'

# With no target path given, urlretrieve saves the download to a temporary
# file and returns that file's path as the first element of the tuple.
filehandle, _ = urllib.urlretrieve(url)
try:
    # Context managers close both the archive and the member stream as soon
    # as the raw bytes have been read, instead of leaking the open handles.
    with zipfile.ZipFile(filehandle, 'r') as zip_file_object:
        # Only the first archive member is read.
        filename = zip_file_object.namelist()[0]
        with zip_file_object.open(filename) as member:
            bytes_data = member.read()
finally:
    # Remove the temporary file left behind by urlretrieve, even on failure.
    urllib.urlcleanup()

In [5]:
import pandas as pd
from io import BytesIO
from sklearn.model_selection import train_test_split

# Parse the downloaded bytes as CSV. The file has no header row, so pandas
# labels the columns with integers.
year = pd.read_csv(BytesIO(bytes_data), header=None)

# Number of rows used for fitting. The commented-out value is the larger
# alternative; using it extends the training time.
#train_size = 463715
train_size = 200000

# Column 0 is the regression target; all remaining columns are features.
y = year.iloc[:, 0]
X = year.iloc[:, 1:]

# shuffle=False preserves the file's row order: the first `train_size` rows
# are used for training and the rows that follow for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    test_size=51630,
    shuffle=False,
)

# Keep the test features as a plain NumPy array for the timing cells below.
data = X_test.values

In [7]:
# Peek at the held-out test features if desired (rendered as a truncated table).
X_test

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
200000,45.09540,-57.29138,20.05392,0.10414,12.00346,-14.31259,2.54547,-3.33986,-3.60175,-8.99414,...,7.15503,-81.48297,34.91722,15.11917,-6.09056,-78.52893,-46.87559,2.89571,64.25949,0.78578
200001,45.11673,-18.39958,-1.61552,-3.67929,-13.24027,-6.84375,-10.33838,-11.12891,16.56924,6.70243,...,8.26246,-138.26547,23.59451,60.99156,4.18891,105.75496,-126.39851,-3.80727,55.00532,-3.42354
200002,46.85191,9.44824,0.31518,-16.85413,15.42389,-15.82587,-21.21385,-10.16067,14.45113,2.22865,...,13.24783,-91.25475,41.12300,55.22389,8.33048,7.57355,17.34516,0.21543,-58.34520,3.92760
200003,45.79644,-36.86230,21.99320,-10.42360,-2.89410,-8.84010,-23.73864,-9.82956,13.08399,0.57577,...,8.13403,-194.21155,55.55883,21.65387,6.41164,-47.18867,-212.14270,-4.67550,-86.99988,-6.12034
200004,40.92442,-43.26026,-18.72100,-11.72495,-19.69395,-10.54229,-9.91945,-14.85633,9.37409,-0.93093,...,17.63167,-203.56276,27.10714,36.90795,0.62431,-40.23377,-83.22141,4.44391,73.15568,3.25023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251625,51.51566,57.56765,-2.64857,-4.91725,-23.56405,0.85009,-17.36973,11.53281,-0.94416,-0.44701,...,19.47616,-55.85795,-73.33446,44.67472,-4.15276,149.79434,-113.40535,7.92503,141.56348,8.79724
251626,49.94868,48.06760,-3.65467,-8.93936,-29.64670,-2.42913,-11.37464,7.05295,15.04007,13.68497,...,6.04067,-261.29108,-90.93597,58.34540,-6.57932,36.79059,32.19586,1.45334,140.34302,-1.44199
251627,45.40128,38.26969,-15.07650,-3.31269,0.87066,-8.09099,-14.67702,3.66075,5.14198,0.65322,...,20.65412,-143.53649,-33.92068,196.54495,26.27162,115.49923,-130.81155,4.71753,285.25134,3.44482
251628,44.06831,10.85663,-28.45948,31.90792,5.14640,-23.67390,7.89138,2.79403,1.70776,12.41229,...,9.08009,-143.41301,-188.23546,-87.95495,-7.37069,50.86357,197.18905,-0.10796,40.65367,-5.44536


#### Train the model (Note: this may take a while for larger values of `num_est`.)

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Number of trees in the forest; larger values lengthen training.
num_est=2

# random_state is fixed so that a fresh "Restart & Run All" builds the same
# trees every time; without it each run trains (and later converts) a
# different model. max_depth=8 caps the depth of every tree.
skl_model = RandomForestRegressor(n_estimators=num_est, max_depth=8, random_state=0)
skl_model.fit(X_train, y_train)

#### scikit-learn (CPU only)

In [10]:
# Baseline: time scikit-learn's own predict() on the NumPy test array.
# The -o flag makes %timeit return its result object, kept here in skl_time.
skl_time = %timeit -o skl_model.predict(data)

15.5 ms ± 375 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Convert scikit-learn model to PyTorch

In [11]:
# Convert the fitted scikit-learn forest with Hummingbird, targeting the 'torch' backend.
model = convert(skl_model, 'torch')

#### Time PyTorch - CPU

In [12]:
# Time the converted model's predict() on the same test array, before it is moved to the GPU.
# Despite its name, pred_cpu_hb holds the %timeit result object (-o), not predictions.
pred_cpu_hb = %timeit -o model.predict(data)

22.6 ms ± 635 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Switch PyTorch from CPU to GPU

In [13]:
%%capture 
# %%capture (which must stay on the cell's first line) suppresses this cell's output.
# Move the converted model to the CUDA device; requires a CUDA-capable GPU.
model.to('cuda')

#### Time PyTorch - GPU

In [14]:
# Time predict() again now that the model has been moved to 'cuda'.
# NOTE(review): `data` is still a host-side NumPy array, so this figure presumably
# includes the host-to-device transfer — confirm against Hummingbird's predict().
# As above, pred_gpu_hb holds the %timeit result object (-o), not predictions.
pred_gpu_hb = %timeit -o model.predict(data)

9.1 ms ± 767 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
