In [1]:
import numpy as np
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic



In [2]:
# Load training data
train_df = pd.read_csv("train.csv")
    
print("Training data:")
print("Shape:", train_df.shape)
print(train_df.head(2))
print('\n')
    
# Load test data
test_df = pd.read_csv("test.csv")

print("Test data:")
print("Shape:", test_df.shape)
print(test_df.head(2))

#Checking how many rows of training set and test set are complete/without missing values
#eventually all rows have missing values, so dropping lines with NaNs is not an option
len(train_df), len(train_df.dropna())
len(test_df), len(test_df.dropna())

# Training set wrangling

# take all numerical features of training set by dropping categorical data/season column and also remove price_CHF column
# Drop columns 0 and 2 from the DataFrame
x_train = train_df.drop(train_df.columns[[0, 2]], axis=1)
# take the categorical data/season from the training set as a separate dataset  
x_train_ssn = train_df.iloc[:, 0]
# take price_CHF values separately
y_train = train_df.iloc[:, 2]
print("x_train")
display(x_train)
print("x_train_ssn")
display(x_train_ssn)
print("y_train")
display(y_train)

#Test set wrangling

##take all numerical features of training set by dropping categorical data/season column
x_test = test_df.drop(test_df.columns[[0]], axis=1)
## take the categorical data/season from the training set as a separate dataset  
x_test_ssn = test_df.iloc[:,0]
print("x_test")
display(x_test)
print("x_test_ssn")
display(x_test_ssn)

#Imputation of missing values in test and training set

##Creation of a unique dataset with train and test datasets for consistent imputation across train and test datasets.
unique_df = pd.concat([x_train, x_test], axis=0)
print("unique_df")
display(unique_df)

## Multivariate Feature Imputation method 
imp = IterativeImputer(max_iter=10, random_state=0, missing_values=np.nan)  # Corrected 'imputation_order' ->>>>>>>imputation_order='ascending'
##fit imputer to the merged features dataset
imp.fit(unique_df)
##separately apply imputation to missing values of training and test set 
x_train_imp = pd.DataFrame(imp.transform(x_train))
x_train_imp.columns = unique_df.columns  # add back the same column names 
x_test_imp = pd.DataFrame(imp.transform(x_test))
x_test_imp.columns =unique_df.columns 
print("x_train_imp")
display(x_train_imp)
print("x_test_imp")
display(x_test_imp)

#Encode categorical data/season -> When not active the final result is higher for some reason

#Enc = OneHotEncoder()
x_train_ssn_enc = Enc.fit_transform(x_train_ssn.values.reshape(-1,1)) 
x_test_ssn_enc = Enc.transform(x_test_ssn.values.reshape(-1,1))  
# Convert encoded features back to DataFrame
x_train_ssn_enc = pd.DataFrame(x_train_ssn_enc.toarray(), columns=Enc.get_feature_names_out(['season']))
x_test_ssn_enc = pd.DataFrame(x_test_ssn_enc.toarray(), columns=Enc.get_feature_names_out(['season']))

## Alternative of Encoding categorical data/season without OneHotEncoder
#x_train_ssn_enc = pd.get_dummies(x_train_ssn)
#x_train_ssn_enc = x_train_ssn_enc.astype(int)
#x_test_ssn_enc = pd.get_dummies(x_test_ssn)
#x_test_ssn_enc = x_test_ssn_enc.astype(int)

# Merge encoded categorical with numerical features on test and training set after the imputation of the latter
x_train_imp = pd.concat([x_train_imp, x_train_ssn_enc], axis=1)
x_test_imp = pd.concat([x_test_imp, x_test_ssn_enc], axis =1)

#Remove rows with initially missing price_CHF values from training set, which now are artificially created/ imputed instead of NaN 

##check how many rows from price_CHF columns in train test are missing 
print("rows training set = ", len(y_train))
print( "rows with price_CHF not missing in training set = ", len(y_train.dropna()))
## so it is possible to keep 631 rows out of 900 in training data set, to train our model with the imputed features 

# Find row indexes where NaN values are present in the "price_CHF" column
nan_rows = y_train.isnull()

# Filter out rows with NaN values in the "price_CHF" column
x_train_imp = x_train_imp[~nan_rows]
x_train_ssn = x_train_ssn[~nan_rows]
y_train_imp = y_train[~nan_rows]

print("x_train_imp")
display(x_train_imp)
print("x_train_ssn")
display(x_train_ssn)
print("y_train_imp")
display(y_train_imp)


Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
Shape: (100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  
x_train


Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,,-1.686248,-1.748076,-3.666005,,-1.822720,-3.931031,,-3.238197
1,,-2.132377,-2.054363,-3.295697,-4.104759,-1.826021,,,-3.212894
2,-2.101937,-1.910282,,-3.388777,,-2.034409,-4.073850,,-3.114061
3,-2.098475,-1.903834,,-3.588235,,-2.214720,-4.018620,-2.330803,
4,-1.969687,-1.697257,-1.331049,,-3.911096,-2.388092,-4.093946,,
...,...,...,...,...,...,...,...,...,...
895,-1.044730,0.190134,0.253153,-3.443941,-1.967611,,-1.838593,3.235645,
896,-1.061639,0.281646,,-3.466753,-1.929701,,-1.508756,3.187263,
897,-0.971157,0.245279,0.558474,,,-0.843659,-1.499361,3.110638,2.230253
898,,0.299911,0.788152,-3.339650,,-0.865169,-1.547716,3.105417,1.989140


x_train_ssn


0      spring
1      summer
2      autumn
3      winter
4      spring
        ...  
895    winter
896    spring
897    summer
898    autumn
899    winter
Name: season, Length: 900, dtype: object

y_train


0      9.644028
1      7.246061
2      7.620085
3      8.411894
4      8.926884
         ...   
895         NaN
896   -1.734754
897         NaN
898         NaN
899         NaN
Name: price_CHF, Length: 900, dtype: float64

x_test


Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,,0.472985,0.707957,,-1.136441,-0.596703,,3.298693,1.921886
1,-1.184837,0.358019,,-3.199028,-1.069695,,-1.420091,3.238307,
2,-1.116459,,0.780460,-3.338948,-1.053149,-0.586339,,3.207398,2.020570
3,,0.353066,0.833429,,-1.322626,-0.628873,-1.304240,3.159858,
4,,0.269644,,-3.245495,-1.362051,-0.717914,-1.341538,3.205007,
...,...,...,...,...,...,...,...,...,...
95,-2.030894,,-1.332104,-3.838154,,-2.319565,,-2.134084,-2.880557
96,-1.817763,,,-3.995247,-3.760752,,-3.903510,-2.269992,-3.007311
97,,,-1.140538,-4.084448,-3.968988,-2.555133,-4.058773,-2.146487,-2.816678
98,,-1.029762,,-4.216490,-3.705548,-2.487751,-3.745480,,-2.849707


x_test_ssn


0     spring
1     summer
2     autumn
3     winter
4     spring
       ...  
95    winter
96    spring
97    summer
98    autumn
99    winter
Name: season, Length: 100, dtype: object

unique_df


Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,,-1.686248,-1.748076,-3.666005,,-1.822720,-3.931031,,-3.238197
1,,-2.132377,-2.054363,-3.295697,-4.104759,-1.826021,,,-3.212894
2,-2.101937,-1.910282,,-3.388777,,-2.034409,-4.073850,,-3.114061
3,-2.098475,-1.903834,,-3.588235,,-2.214720,-4.018620,-2.330803,
4,-1.969687,-1.697257,-1.331049,,-3.911096,-2.388092,-4.093946,,
...,...,...,...,...,...,...,...,...,...
95,-2.030894,,-1.332104,-3.838154,,-2.319565,,-2.134084,-2.880557
96,-1.817763,,,-3.995247,-3.760752,,-3.903510,-2.269992,-3.007311
97,,,-1.140538,-4.084448,-3.968988,-2.555133,-4.058773,-2.146487,-2.816678
98,,-1.029762,,-4.216490,-3.705548,-2.487751,-3.745480,,-2.849707


x_train_imp


Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,-1.891264,-1.686248,-1.748076,-3.666005,-4.060422,-1.822720,-3.931031,-2.505142,-3.238197
1,-2.232157,-2.132377,-2.054363,-3.295697,-4.104759,-1.826021,-4.037706,-2.461230,-3.212894
2,-2.101937,-1.910282,-1.833128,-3.388777,-4.061264,-2.034409,-4.073850,-2.348069,-3.114061
3,-2.098475,-1.903834,-1.815169,-3.588235,-4.017624,-2.214720,-4.018620,-2.330803,-3.049862
4,-1.969687,-1.697257,-1.331049,-3.775858,-3.911096,-2.388092,-4.093946,-2.308352,-2.892263
...,...,...,...,...,...,...,...,...,...
895,-1.044730,0.190134,0.253153,-3.443941,-1.967611,-0.532369,-1.838593,3.235645,2.683122
896,-1.061639,0.281646,0.217468,-3.466753,-1.929701,0.052860,-1.508756,3.187263,2.633929
897,-0.971157,0.245279,0.558474,-3.326909,-1.410075,-0.843659,-1.499361,3.110638,2.230253
898,-1.196513,0.299911,0.788152,-3.339650,-1.350159,-0.865169,-1.547716,3.105417,1.989140


x_test_imp


Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,-1.031766,0.472985,0.707957,-3.527871,-1.136441,-0.596703,-1.207229,3.298693,1.921886
1,-1.184837,0.358019,0.984964,-3.199028,-1.069695,-1.362324,-1.420091,3.238307,2.039894
2,-1.116459,0.285384,0.780460,-3.338948,-1.053149,-0.586339,-1.174725,3.207398,2.020570
3,-1.040747,0.353066,0.833429,-3.798401,-1.322626,-0.628873,-1.304240,3.159858,2.423270
4,-0.988763,0.269644,0.526651,-3.245495,-1.362051,-0.717914,-1.341538,3.205007,2.347385
...,...,...,...,...,...,...,...,...,...
95,-2.030894,-1.686090,-1.332104,-3.838154,-3.760017,-2.319565,-3.930181,-2.134084,-2.880557
96,-1.817763,-1.640359,-1.386965,-3.995247,-3.760752,-2.148158,-3.903510,-2.269992,-3.007311
97,-1.584813,-1.121007,-1.140538,-4.084448,-3.968988,-2.555133,-4.058773,-2.146487,-2.816678
98,-1.334022,-1.029762,-1.103806,-4.216490,-3.705548,-2.487751,-3.745480,-2.166950,-2.849707


rows training set =  900
rows with price_CHF not missing in training set =  631
x_train_imp


Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,-1.891264,-1.686248,-1.748076,-3.666005,-4.060422,-1.822720,-3.931031,-2.505142,-3.238197
1,-2.232157,-2.132377,-2.054363,-3.295697,-4.104759,-1.826021,-4.037706,-2.461230,-3.212894
2,-2.101937,-1.910282,-1.833128,-3.388777,-4.061264,-2.034409,-4.073850,-2.348069,-3.114061
3,-2.098475,-1.903834,-1.815169,-3.588235,-4.017624,-2.214720,-4.018620,-2.330803,-3.049862
4,-1.969687,-1.697257,-1.331049,-3.775858,-3.911096,-2.388092,-4.093946,-2.308352,-2.892263
...,...,...,...,...,...,...,...,...,...
889,-0.925984,0.235917,-0.306331,-3.276257,-2.735177,-1.839910,-2.378179,2.535228,2.382208
890,-1.079859,0.108260,-0.226806,-2.857816,-2.460508,-1.807515,-2.223057,2.555153,2.189913
891,-0.961371,0.352449,0.261008,-3.488688,-2.221015,-1.568924,-1.952208,2.706716,2.520366
894,-1.186919,0.006564,0.236214,-3.333672,-2.262145,-1.246151,-1.801203,0.886538,0.701656


x_train_ssn


0      spring
1      summer
2      autumn
3      winter
4      spring
        ...  
889    summer
890    autumn
891    winter
894    autumn
896    spring
Name: season, Length: 631, dtype: object

y_train_imp


0      9.644028
1      7.246061
2      7.620085
3      8.411894
4      8.926884
         ...   
889    3.237347
890    2.679221
891    3.633928
894   -0.303802
896   -1.734754
Name: price_CHF, Length: 631, dtype: float64

In [3]:
#RationalQuadratic
#Initialize and fit GaussianProcessRegressor
gpr_RQ = GaussianProcessRegressor(kernel=RationalQuadratic()).fit(x_train_imp, y_train_imp)

# Predict on test data
y_pred = gpr_RQ.predict(x_test_imp)

display(y_pred)
dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results.csv', index=False)






























































































































































































































array([-2.83812159, -2.3297018 , -3.00604194, -2.38720209, -1.97271534,
       -1.20138338,  0.08544233, -0.77755659,  1.24157509,  1.86234514,
        1.74399109,  2.61723046,  2.52063281,  3.24485566,  2.78687894,
        1.7983837 ,  1.53846039,  1.44736081,  2.4726432 ,  1.13022407,
        3.88262078,  3.81913655,  3.3405451 ,  2.74501478,  3.14602268,
        4.44073379,  5.73141761,  8.38158465,  8.67216363,  9.13787733,
        8.57130413,  8.02085763,  7.86173861,  7.35971333,  8.47707985,
        7.5988337 ,  6.99384987,  7.37298415,  7.92928793,  7.37936677,
        7.96492893,  7.65241793,  7.14296837,  8.28156007,  8.08936688,
        7.42590672,  7.38387605,  7.95249766,  7.55036554,  7.89839862,
        8.41765813,  8.66308201,  8.35423772,  8.23315726,  7.61751837,
        9.19063207,  8.30572295,  8.04768442,  7.71540144,  7.08669526,
        6.93562477,  6.06117465,  5.70674019,  4.92276173,  5.20456482,
        5.07022742,  4.84285902,  4.50830796,  4.4546989 ,  4.65

In [4]:
#RBF
#Initialize and fit GaussianProcessRegressor
gpr_RBF = GaussianProcessRegressor(kernel=RBF(length_scale=0.1)).fit(x_train_imp, y_train_imp)

# Predict on test data
y_pred = gpr_RBF.predict(x_test_imp)

display(y_pred)

dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results_trial_RBF.csv', index=False)


















































































































































































array([-2.63104749e+00, -1.86655416e+00, -3.07817105e+00, -1.30683615e+00,
       -1.21564817e+00, -7.89556760e-01, -3.22829798e-03, -5.42199680e-01,
        6.41144446e-01,  8.52684635e-01,  1.31364193e+00,  1.26324704e+00,
        5.58511374e-01,  2.61832876e+00,  2.67447962e+00,  1.40571322e+00,
        1.49392215e+00,  1.47183612e+00,  2.21115907e+00,  4.98084981e-01,
        3.73227406e+00,  2.68261937e+00,  3.06483817e+00,  2.57587860e+00,
        8.76359391e-01,  4.66387486e+00,  6.82204607e+00,  8.38634090e+00,
        9.01662335e+00,  8.77225553e+00,  8.13811474e+00,  8.64487103e+00,
        8.16806718e+00,  7.25874285e+00,  8.37435894e+00,  7.04880685e+00,
        6.56001789e+00,  7.23507468e+00,  7.15190540e+00,  8.13094820e+00,
        8.00248654e+00,  7.75513716e+00,  6.24170868e+00,  8.41898651e+00,
        8.78673396e+00,  7.30342857e+00,  7.20763406e+00,  6.85359380e+00,
        7.94533570e+00,  7.70326574e+00,  7.32797961e+00,  8.94611150e+00,
        9.11640743e+00,  

In [5]:
#DotProduct
# Initialize and fit GaussianProcessRegressor
gpr_dot = GaussianProcessRegressor(kernel=DotProduct()).fit(x_train_imp, y_train_imp)

# Predict on test data
y_pred = gpr_dot.predict(x_test_imp)

display(y_pred)

dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results_trial_DotProduct.csv', index=False)























































array([-0.56201172, -0.47314453, -0.70947266, -0.65625   , -0.32421875,
        0.07080078,  1.50585938, -0.22363281,  0.92431641,  1.06835938,
        1.11425781,  1.63574219,  1.71044922,  2.74414062,  2.37011719,
        2.14599609,  2.06591797,  1.90478516,  2.10644531,  1.19433594,
        2.28955078,  3.08154297,  3.32910156,  4.54150391,  5.45214844,
        5.93017578,  6.48242188,  7.48583984,  7.76513672,  8.11230469,
        8.35595703,  8.01611328,  8.11279297,  7.94287109,  8.26513672,
        7.93994141,  8.11132812,  7.71923828,  7.63330078,  7.37255859,
        7.72949219,  7.17626953,  7.70019531,  7.87646484,  7.92041016,
        7.94384766,  7.97021484,  8.04980469,  7.96728516,  7.94628906,
        8.94726562,  8.33007812,  8.03808594,  8.1640625 ,  8.24511719,
        7.71533203,  7.77539062,  7.26367188,  6.85644531,  6.7265625 ,
        6.828125  ,  6.96337891,  6.76074219,  5.68359375,  6.09326172,
        6.17773438,  6.22949219,  6.19824219,  6.05712891,  5.93

In [6]:
#Matern
#Initialize and fit GaussianProcessRegressor
gpr_matern = GaussianProcessRegressor(kernel=Matern()).fit(x_train_imp, y_train_imp)

# Predict on test data
y_pred = gpr_matern.predict(x_test_imp)

display(y_pred)
dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results_trial_Matern.csv', index=False)







































































array([-2.94728269, -2.59136842, -3.16913179, -2.49765717, -2.12221286,
       -1.3545802 , -0.10826798, -0.84954904,  1.56938687,  2.14054892,
        2.09510578,  2.77882935,  2.14556236,  3.06382339,  2.75444471,
        1.72253885,  1.49677165,  1.41412462,  2.4339859 ,  1.31278959,
        4.14182827,  3.67876554,  3.27572436,  2.65854624,  2.27843493,
        4.47652003,  5.81508882,  8.67760921,  8.73712709,  9.32702679,
        8.50597553,  8.09684156,  7.86523195,  7.20074152,  8.5544517 ,
        7.57188762,  6.78191599,  7.29431737,  8.09193534,  7.48327792,
        8.12254443,  7.6616648 ,  7.02714892,  8.43889719,  8.35072682,
        7.36194873,  7.25184369,  7.67328353,  7.52100016,  7.84991639,
        8.25374712,  8.68121281,  8.48921304,  8.13124883,  7.23126369,
        9.28654286,  8.27878841,  7.97756018,  7.54915756,  7.37846109,
        7.30730299,  6.15935636,  5.37025042,  3.65732151,  4.69258717,
        5.22142923,  4.81331533,  4.76914447,  4.64283498,  4.54





# Remarks 
1) Between the optimization of these 4 kernels the Rational Quadratic provides the best outcome.
2) Interestingly, without the encoding of the categorical variable the result is higher, something that i couldn't figure out why it happens. Hence, even if in that code the encoding part is active, the uploaded results are with the encoding deactivated 