First, we import the necessary libraries:

In [1]:
import numpy as np
import pandas as pd

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Data Loading

In [4]:
"""
This loads the training and test data, encodes the categorical season
column, and fills in the missing (NaN) values using iterative imputation.

Parameters
----------
None (the data is read from train.csv and test.csv)

Compute
----------
X_train: matrix of floats, training input with features
y_train: array of floats, training output with labels
X_test: matrix of floats: dim = (100, 10), test input with features
"""
# Load training data
train_df = pd.read_csv("train.csv")

print("Training data:")
print("Shape:", train_df.shape)
print(train_df.head(2))
print('\n')

# Load test data
test_df = pd.read_csv("test.csv")

print("Test data:")
print(test_df.shape)
print(test_df.head(2))

imp = IterativeImputer(max_iter=10, random_state=0)

# Numeric price columns only; the categorical season column is encoded separately.
X_train_no_season = train_df.drop(['season'], axis=1)
X_test_no_season = test_df.drop(['season'], axis=1)

enc = OrdinalEncoder()

train_season = train_df['season'].to_numpy().reshape(-1, 1)
test_season = test_df['season'].to_numpy().reshape(-1, 1)

# Fit the encoder on the training seasons ONLY and reuse that mapping for the
# test seasons. Re-fitting on the test set could assign different integer
# codes to the same season (e.g. if one season is absent from the test data),
# silently corrupting the feature the model was trained on.
encoded_train_season = enc.fit_transform(train_season)
encoded_test_season = enc.transform(test_season)

# Column 0 is the encoded season, followed by the price columns in file order.
X_train = np.c_[encoded_train_season, X_train_no_season.to_numpy()]
X_test = np.c_[encoded_test_season, X_test_no_season.to_numpy()]

# The training matrix still contains the target column at this point, so its
# column set differs from the test matrix; the imputer is therefore fitted on
# each matrix separately (fit_transform resets any previous fit).
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

# Locate the target by name instead of a hard-coded position; the +1 accounts
# for the encoded season column that was prepended above.
target_col = 1 + X_train_no_season.columns.get_loc('price_CHF')
y_train = X_train[:, target_col]
X_train = np.delete(X_train, target_col, axis=1)

assert (
    X_train.shape[1] == X_test.shape[1]
    and X_train.shape[0] == y_train.shape[0]
    and X_test.shape[0] == 100
), "Invalid data shape"

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  


# Modeling and Prediction

In [5]:
"""
This defines the model, fits it on the training data and then makes the
predictions for the test data.

Parameters
----------
X_train: matrix of floats, training input with 10 features
y_train: array of floats, training output
X_test: matrix of floats: dim = (100, 10), test input with 10 features

Compute
----------
y_pred: array of floats: dim = (100,), predictions on test set
"""

# Model selection: score each candidate kernel with K-fold cross-validation
# on the training data, then refit the best one on ALL training data and
# predict the test set.
kernels = [DotProduct(), RBF(), Matern(), RationalQuadratic()]
n_kernels = len(kernels)
n_folds = 5
kf = KFold(n_splits=n_folds)

# kernel_r2[i, j] = R^2 of kernel i on the held-out part of fold j
kernel_r2 = np.zeros((n_kernels, n_folds))

for i, kernel in enumerate(kernels):
    for j, (train, test) in enumerate(kf.split(X_train)):
        gpr = GaussianProcessRegressor(kernel=kernel)
        gpr.fit(X_train[train], y_train[train])
        predicted = gpr.predict(X_train[test])
        kernel_r2[i, j] = r2_score(y_train[test], predicted)

# Average over the folds (axis=1) to get one score per kernel, and take the
# HIGHEST mean R^2 (larger R^2 means a better fit).
mean_r2 = np.mean(kernel_r2, axis=1)
best_kernel = kernels[np.argmax(mean_r2)]
print(best_kernel)

# Final prediction with the best kernel: refit on the full training set rather
# than reusing whichever model happened to be fitted last inside the CV loop.
gpr = GaussianProcessRegressor(kernel=best_kernel)
gpr.fit(X_train, y_train)
y_pred = gpr.predict(X_test)

assert y_pred.shape == (100,), "Invalid data shape"

Matern(length_scale=1, nu=1.5)


# Saving Results
You don't have to change this cell.

In [6]:
# Write the predictions to results.csv as a single 'price_CHF' column
# (no index column in the file).
dt = pd.DataFrame(y_pred, columns=['price_CHF'])
dt.to_csv('results.csv', index=False)
print("\nResults file successfully generated!")


Results file successfully generated!
