# Task 2
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the jupyter notebook version of the template. For the python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [1]:
import numpy as np
import pandas as pd
# Add any other imports you need here
from sklearn.impute import SimpleImputer, KNNImputer
from functools import reduce
from sklearn import linear_model
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import KFold

# Data Loading
TODO: Perform data preprocessing, imputation and extract X_train, y_train and X_test
(and potentially change the initialization of variables to accommodate how you deal with non-numeric data)

In [2]:
"""
This loads the training and test data from the CSV files and prints the
shape and the first two rows of each table. Removing rows without a target,
encoding the non-numeric data and imputing the missing values happen in a
later cell, which also extracts X_train, y_train and X_test.

Parameters
----------
(none, the file names are hard-coded)

Compute
----------
train_df: DataFrame, raw training data read from train.csv
test_df: DataFrame, raw test data read from test.csv
"""
# Read the training table and show its size plus a two-row preview.
train_df = pd.read_csv("train.csv")

print("Training data:")
print("Shape:", train_df.shape)
print(train_df.iloc[:2])
print('\n')

# Read the test table and show the same kind of summary.
test_df = pd.read_csv("test.csv")

print("Test data:")
print(test_df.shape)
print(test_df.iloc[:2])

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  


In [3]:
# Keep only the training rows that have a target value: a row without
# price_CHF cannot be used for supervised training.
# The filtered frame is rebound to the same name instead of using
# inplace=True; the explicit copy() makes it an independent frame, so the
# column assignments done during preprocessing do not act on a slice of the
# loaded data.
train_df = train_df.dropna(subset=["price_CHF"]).copy()

# X_train, y_train and X_test are assigned from the imputed frames further
# down in this cell, so the template's zero-filled placeholders were dead
# code and have been removed.

# TODO: Perform data preprocessing, imputation and extract X_train, y_train and X_test
def preprocess(df, n_neighbors=2):
    """One-hot encode the season and fill missing values season by season.

    The "season" column is replaced by four 0/1 indicator columns
    (is_spring, is_summer, is_autumn, is_winter). The rows of each season are
    then imputed on their own with a KNNImputer fitted on exactly those rows,
    so neighbours are only searched within the same season. Every remaining
    column of ``df`` takes part in the imputation, including a target column
    such as price_CHF when it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "season" column and otherwise numeric columns.
        It is not modified.
    n_neighbors : int, default 2
        Number of neighbours handed to KNNImputer.

    Returns
    -------
    pandas.DataFrame
        Imputed frame without "season", with the four indicator columns
        appended after the original columns, carrying the original index
        labels and sorted by index. Rows whose season is not one of the four
        known values are not part of the result.

    Raises
    ------
    ValueError
        If some column has no observed value at all within one season, since
        there is then nothing to impute it from.
    """
    seasons = ("spring", "summer", "autumn", "winter")

    # drop() returns a new frame, so adding the indicator columns below does
    # not touch the caller's DataFrame.
    encoded = df.drop(columns="season")
    for season in seasons:
        encoded[f"is_{season}"] = (df["season"] == season).astype(int)

    imputer = KNNImputer(n_neighbors=n_neighbors, weights="uniform")
    imputed_parts = []
    for season in seasons:
        season_rows = encoded[encoded[f"is_{season}"] == 1]
        if season_rows.empty:
            # Nothing to impute for a season that does not occur in df.
            continue

        # KNNImputer cannot fill a column that is empty in the data it is
        # fitted on; fail with a message that names the column and season.
        unobserved = season_rows.columns[season_rows.isna().all()]
        if len(unobserved) > 0:
            raise ValueError(
                f"Cannot impute {list(unobserved)} for season '{season}': "
                "the column has no observed values in that season"
            )

        imputed_parts.append(
            pd.DataFrame(
                imputer.fit_transform(season_rows),
                columns=season_rows.columns,
                index=season_rows.index,
            )
        )

    if not imputed_parts:
        # No row belongs to a known season: return an empty frame that still
        # has the expected columns.
        return encoded.iloc[:0].astype(float)

    # A single concat of all season parts, then restore the original row order.
    return pd.concat(imputed_parts).sort_index()


# Impute the training frame, then separate the features from the target column.
imputed_train = preprocess(train_df)
X_train = imputed_train.drop(columns=["price_CHF"])
y_train = imputed_train.loc[:, "price_CHF"]

# The test frame carries no price_CHF column, so its imputed frame is used
# directly as the feature matrix.
X_test = preprocess(test_df)

# Sanity check: same number of feature columns in train and test, one target
# per training row, and exactly 100 test rows.
assert (X_train.shape[1], X_train.shape[0], X_test.shape[0]) == (
    X_test.shape[1],
    y_train.shape[0],
    100,
), "Invalid data shape"

In [4]:
# Show the imputed feature matrix followed by the target values.
print(X_train, y_train, sep="\n")

     price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0    -2.127053  -1.686248  -1.748076  -3.666005  -4.030793 -1.822720   
1    -2.087594  -2.132377  -2.054363  -3.295697  -4.104759 -1.826021   
2    -2.101937  -1.910282  -1.955927  -3.388777  -4.120762 -2.034409   
3    -2.098475  -1.903834  -2.002881  -3.588235  -4.041085 -2.214720   
4    -1.969687  -1.697257  -1.331049  -3.604443  -3.911096 -2.388092   
..         ...        ...        ...        ...        ...       ...   
889  -0.650417   0.235917  -0.306331  -3.426546  -2.735177 -1.839910   
890  -1.079859   0.420600  -0.226806  -3.408693  -2.597671 -1.807515   
891  -0.961371   0.352449  -0.361710  -3.488688  -2.535157 -1.568924   
894  -1.186919   0.554047   0.236214  -3.333672  -2.262145 -1.246151   
896  -1.061639   0.281646   0.695157  -3.466753  -1.929701 -1.005468   

     price_ITA  price_POL  price_SVK  is_spring  is_summer  is_autumn  \
0    -3.931031  -2.308758  -3.238197        1.0        0.0    

# Kernels

In [5]:
# Import only the kernel classes that kernel_list uses, instead of a star
# import that pulls every public name of the kernels module into the notebook
# namespace. To try another kernel, add its class to this import.
from sklearn.gaussian_process.kernels import (
    DotProduct,
    Matern,
    RBF,
    RationalQuadratic,
    WhiteKernel,
)

In [6]:
# Candidate kernels for the Gaussian process. Each base kernel is combined
# with its own WhiteKernel noise term. To try a new kernel, add its base
# kernel to the tuple below.
kernel_list = [
    base_kernel + WhiteKernel()
    for base_kernel in (
        DotProduct(),
        RBF(),
        RBF(length_scale=1.5),
        Matern(),
        RationalQuadratic(),
    )
]

# Modeling and Prediction
TODO: Define the model and fit it using training data. Then, use test data to make predictions

In [7]:
"""
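This cross-validates a Gaussian process regressor for every kernel in
kernel_list on the training data. Choosing the best kernel, fitting the
final model and predicting on the test data happen in later cells.

Parameters
----------
X_train: matrix of floats, training input (price columns plus season indicator columns)
y_train: array of floats, training output
X_test: matrix of floats: dim = (100, ?), test input with the same columns as X_train

Compute
----------
score_mat: matrix of floats: dim = (number of kernels, n_folds), validation score of each kernel on each fold
y_pred: array of floats: dim = (100,), zero-filled placeholder for the predictions on the test set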
"""

# Placeholder for the test predictions; the real values are assigned once the
# final model has been fitted in a later cell.
y_pred = np.zeros(X_test.shape[0])

# K-fold cross-validation setup. KFold is used without a shuffle argument.
n_folds = 10
kf = KFold(n_splits=n_folds)

# score_mat[k, f] holds the validation score of kernel k on fold f.
score_mat = np.zeros((len(kernel_list), n_folds))

X_np_train = X_train.to_numpy()
y_np_train = y_train.to_numpy()

for k, kernel in enumerate(kernel_list):
    print(f"training with {k}th kernel: {kernel_list[k]}")

    for f, (train_idx, val_idx) in enumerate(kf.split(X_np_train)):
        # A fixed random_state makes the optimizer restarts, and therefore
        # the scores and the kernel choice, reproducible across runs.
        gpr = GaussianProcessRegressor(
            kernel=kernel, n_restarts_optimizer=5, random_state=0
        )
        gpr.fit(X_np_train[train_idx], y_np_train[train_idx])
        score_mat[k, f] = gpr.score(X_np_train[val_idx], y_np_train[val_idx])

    print(f"mean scores={np.mean(score_mat[k])}")

training with 0th kernel: DotProduct(sigma_0=1) + WhiteKernel(noise_level=1)
mean scores=0.7521618763783592
training with 1th kernel: RBF(length_scale=1) + WhiteKernel(noise_level=1)
mean scores=0.9467600870891089
training with 2th kernel: RBF(length_scale=1.5) + WhiteKernel(noise_level=1)
mean scores=0.9467600917196849
training with 3th kernel: Matern(length_scale=1, nu=1.5) + WhiteKernel(noise_level=1)
mean scores=0.9467050410778904
training with 4th kernel: RationalQuadratic(alpha=1, length_scale=1) + WhiteKernel(noise_level=1)
mean scores=0.9440817064455663


In [8]:
# Mean validation score of each kernel, averaged over the folds (one value per kernel).
# A bigger score is better; 1 is the maximum and means no error.
avg_scores = score_mat.mean(axis=1)
print(avg_scores)

[0.75216188 0.94676009 0.94676009 0.94670504 0.94408171]


In [9]:
# Pick the kernel with the highest mean validation score.
best_kernel_idx = int(np.argmax(avg_scores))
opt_kernel = kernel_list[best_kernel_idx]
print(opt_kernel)

RBF(length_scale=1.5) + WhiteKernel(noise_level=1)


In [10]:
# Refit the selected kernel on the whole training set. A fixed random_state
# makes the optimizer restarts, and therefore the saved predictions,
# reproducible across runs.
gpr_opt = GaussianProcessRegressor(
    kernel=opt_kernel, n_restarts_optimizer=5, random_state=0
)
gpr_opt.fit(X_train, y_train)

# This score is computed on the data the model was fitted on, so it is a
# training score, not a validation score.
print(gpr_opt.score(X_train, y_train))

# Predict the target for the test rows.
y_pred = gpr_opt.predict(X_test)
assert y_pred.shape == (100,), "Invalid data shape"

0.9919654425380167


# Saving Results
You don't have to change this

In [11]:
# Put the predictions into a one-column table and write it without the index.
dt = pd.DataFrame(y_pred, columns=['price_CHF'])
dt.to_csv('results.csv', index=False)
print("\nResults file successfully generated!")


Results file successfully generated!
