In [13]:
import GPy
import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Seed NumPy's global random generator so that calls drawing from it
# (e.g. np.random.permutation) give the same result on every run.
np.random.seed(seed=11)

In [3]:
def hhmm_to_minutes(hhmm):
    """Convert clock times encoded as hhmm numbers to minutes since midnight.

    The hundreds part is taken as hours and the remainder as minutes,
    e.g. 1345 -> 60*13 + 45 = 825. Works element-wise on scalars,
    NumPy arrays and pandas Series.
    """
    return 60*np.floor(hhmm/100) + np.mod(hhmm, 100)


dataset_path = r'dataset/airline.pickle'
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
ds = pd.read_pickle(dataset_path)

# Extract a subset (rows with Month <= 4). Take an explicit copy so the
# column assignments below operate on an independent frame and do not
# trigger pandas' SettingWithCopyWarning.
ds = ds.loc[ds['Month'] <= 4].copy()

# Convert time of day from hhmm to minutes since midnight
ds['ArrTime'] = hhmm_to_minutes(ds['ArrTime'])
ds['DepTime'] = hhmm_to_minutes(ds['DepTime'])

# Number of rows kept after filtering
N = len(ds)

ds.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,ArrDelay,AirTime,Distance,plane_age
0,2008,1,3,4,1203.0,1331.0,-14.0,116.0,810,10.0
1,2008,1,3,4,454.0,598.0,-22.0,314.0,2283,10.0
2,2008,1,3,4,652.0,963.0,-17.0,175.0,1521,10.0
3,2008,1,3,4,1013.0,1172.0,2.0,79.0,577,10.0
4,2008,1,4,5,818.0,880.0,10.0,48.0,239,10.0


In [4]:
# Feature columns, in the order they appear as columns of X.
names = ['Month', 'DayofMonth', 'DayOfWeek', 'plane_age', 'AirTime', 'Distance', 'ArrTime', 'DepTime']
X = ds[names].values

# Target (arrival delay) as a column vector: one row per sample, one column.
y = ds['ArrDelay'].values.reshape(-1, 1)

# Shuffle the rows of X and y with one shared permutation so that
# each feature row stays aligned with its target.
perm = np.random.permutation(N)
X, y = X[perm], y[perm]

In [5]:
# First hold out 100000 rows as the test set (*_ts) ...
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y,
    test_size=100000,
    random_state=42,
)
# ... then carve another 100000 rows out of the remainder as the
# validation set (*_vs); what is left is the training set (*_tr).
X_tr, X_vs, y_tr, y_vs = train_test_split(
    X_tr, y_tr,
    test_size=100000,
    random_state=22,
)

# Fit the standardisation on the training features only, then apply the
# same transform to all three splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_tr)
X_tr, X_ts, X_vs = (scaler.transform(split) for split in (X_tr, X_ts, X_vs))

In [6]:
# Unbind the full DataFrame and the unsplit arrays; the following cells
# work from the train/validation/test splits.
del X, y
del ds

In [25]:
# RBF kernel over all input features (one shared lengthscale).
kernel = GPy.kern.RBF(X_tr.shape[1])

# Exact GP regression fitted on the first 1000 training rows only.
m = GPy.models.GPRegression(X_tr[:1000], y_tr[:1000], kernel=kernel)

# Optimise the hyperparameters silently, capped at 100 iterations.
# NOTE(review): the recorded run reports rbf.lengthscale ~ 9e-29, i.e. the
# lengthscale has collapsed, so the fitted model should be treated with
# caution. Possibly worth trying target normalisation, ARD or optimiser
# restarts - to be confirmed.
m.optimize(max_iters=100, messages=False)

# Display the fitted model's parameter table.
m

GP_regression.,value,constraints,priors
rbf.variance,1125.0343462520782,+ve,
rbf.lengthscale,9.008129940179371e-29,+ve,
Gaussian_noise.variance,726.0664996486445,+ve,


In [22]:
# m.predict returns a tuple; its first element (the predictive mean, per
# GPy's documentation) is used as the point prediction for the test set.
prediction = m.predict(X_ts)[0]

# Root-mean-squared error between the test targets and the predictions.
RMSE = np.sqrt(mean_squared_error(y_ts, prediction))

# End the cell with the value so the metric is displayed as the cell output
# (a trailing assignment alone produces no output).
RMSE

40.2815139983591