In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [14]:
def data_loading():
    """
    Load the training and test CSV files and turn them into numeric model inputs.

    Steps:
      1. Read "train.csv" and "test.csv" from the working directory and print a preview of each.
      2. Drop training rows whose 'price_CHF' label is missing (labels are never imputed).
      3. One-hot encode the first column of both files with an encoder fitted on the training rows only.
      4. Fill the remaining missing feature values with a KNN imputer fitted on the training
         features only; the same fitted imputer is applied to the test features.

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, training input with features (numeric columns followed by one-hot columns)
    y_train: array of floats, training output with labels ('price_CHF'), one per row of X_train
    X_test: matrix of floats: dim = (100, ?), test input with the same feature layout as X_train

    Raises
    ----------
    AssertionError: if the train/test feature counts differ, if X_train and y_train disagree
        in length, or if the test set does not have 100 rows
    """
    # Load training data
    train_df = pd.read_csv("train.csv")

    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(10))
    print('\n')

    # Load test data
    test_df = pd.read_csv("test.csv")

    print("Test data:")
    print(test_df.shape)
    print(test_df.head(10))

    target_col = 'price_CHF'

    # A row without a label cannot supervise the model; imputing the label would
    # invent training targets and let the target influence the imputed features.
    labeled_df = train_df.dropna(subset=[target_col])
    y_train = labeled_df[target_col].to_numpy(dtype=float)

    # The first column of both files is treated as the categorical column and every
    # other non-target column as numeric (same positional layout the files are read with).
    train_category = labeled_df.iloc[:, [0]].to_numpy()
    test_category = test_df.iloc[:, [0]].to_numpy()
    train_numeric = labeled_df.drop(columns=[target_col]).iloc[:, 1:].to_numpy(dtype=float)
    test_numeric = test_df.iloc[:, 1:].to_numpy(dtype=float)

    # Fit the encoder on the training rows only so both sets share one column layout;
    # a category that appears only in the test file becomes an all-zero row instead of
    # shifting or adding columns.
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_onehot = encoder.fit_transform(train_category).toarray()
    test_onehot = encoder.transform(test_category).toarray()

    train_features = np.hstack([train_numeric, train_onehot])
    test_features = np.hstack([test_numeric, test_onehot])

    # Fit the imputer on the training features and reuse it for the test features,
    # so test rows are filled from training neighbours rather than from each other.
    imputer = KNNImputer(n_neighbors=5, weights='distance')
    X_train = imputer.fit_transform(train_features)
    X_test = imputer.transform(test_features)

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test

In [17]:
def modeling_and_prediction(X_train, y_train, X_test):
    """
    Select a Gaussian-process regressor by cross-validated grid search, fit it on the
    training data and predict the targets of the test data.

    Parameters
    ----------
    X_train: matrix of floats, training input, one row per sample
    y_train: array of floats, training output, one value per row of X_train
    X_test: matrix of floats, test input with the same number of columns as X_train

    Returns
    ----------
    y_pred: array of floats: dim = (X_test.shape[0],), predictions on test set

    Raises
    ----------
    AssertionError: if the predictions are not a 1-D array with one entry per test row
    """
    # Candidate models: a single alpha value and RBF kernels built from two
    # length scales (np.logspace(-1, 1, 2) -> 0.1 and 10).
    length_scales = np.logspace(-1, 1, 2)
    param_grid = [{"alpha": [0.1], "kernel": [RBF(scale) for scale in length_scales]}]

    # NOTE(review): targets are passed in uncentred; GaussianProcessRegressor's
    # normalize_y option may be worth evaluating -- confirm against the CV score.
    gpr = GaussianProcessRegressor()

    # 9-fold cross-validation scored by R^2 picks the best candidate.
    best = GridSearchCV(gpr, param_grid=param_grid, scoring='r2', cv=9)
    best.fit(X_train, y_train)
    print(best.best_params_)

    y_pred = best.predict(X_test)

    # One prediction per test row, whatever the size of the test set.
    assert y_pred.shape == (X_test.shape[0],), "Invalid data shape"
    return y_pred

In [18]:
# Script entry point (assignment template): load data, fit, predict, save.
if __name__ == "__main__":
    # Build the model inputs from the CSV files
    X_train, y_train, X_test = data_loading()
    # Template leftover: defined here but not passed to any call in this cell
    seed = 123
    # Fit the regressor on the training set and predict the test targets
    y_pred = modeling_and_prediction(X_train, y_train, X_test)
    # Store the predictions as a single 'price_CHF' column in the required format
    dt = pd.DataFrame(y_pred, columns=['price_CHF'])
    dt.to_csv('results.csv', index=False)
    print("\nResults file successfully generated!")

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   
2  autumn  -2.101937   7.620085  -1.910282        NaN  -3.388777        NaN   
3  winter  -2.098475   8.411894  -1.903834        NaN  -3.588235        NaN   
4  spring  -1.969687   8.926884  -1.697257  -1.331049        NaN  -3.911096   
5  summer  -1.935209   8.104719  -1.488434        NaN  -3.878786  -3.831497   
6  autumn  -1.457232   7.002749        NaN  -1.005941  -4.217287  -3.853199   
7  winter        NaN   5.502236  -0.894221        NaN  -4.789651  -3.518414   
8  spring  -1.044688   4.574785        NaN  -0.532294  -4.858778  -2.997490   
9  summer  -0.578858        NaN  -0.119761  -0.149830  -5.180666  -3.097592   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1