In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, learning_curve, GridSearchCV, RandomizedSearchCV

In [3]:
# Upload the training CSV through the Colab file picker.
# The returned object is indexed by file name further down to obtain the raw
# file content that is handed to pandas.
from google.colab import files
uploaded = files.upload()

Saving ML-CUP23-TR.csv to ML-CUP23-TR.csv


In [4]:
# Upload the test CSV through the Colab file picker.
# Kept in a separate variable (uploaded1) so the training upload is not overwritten.
from google.colab import files
uploaded1 = files.upload()

Saving ML-CUP23-TS.csv to ML-CUP23-TS.csv


In [5]:
import io

# Column layout applied to the training CSV: id, col1..col10, then the three targets.
tr_column_names = ['id'] + [f'col{i}' for i in range(1, 11)] + ['target_x', 'target_y', 'target_z']

# The upload holds the file content in memory, so expose it to pandas as a file-like buffer.
tr_buffer = io.BytesIO(uploaded['ML-CUP23-TR.csv'])
df_traincup = pd.read_csv(tr_buffer, names=tr_column_names, delimiter=',')

In [6]:
import io

# Column layout applied to the test CSV: id, col1..col10, then the three targets.
ts_column_names = ['id'] + [f'col{i}' for i in range(1, 11)] + ['target_x', 'target_y', 'target_z']

# The upload holds the file content in memory, so expose it to pandas as a file-like buffer.
ts_buffer = io.BytesIO(uploaded1['ML-CUP23-TS.csv'])
df_testcup = pd.read_csv(ts_buffer, names=ts_column_names, delimiter=',')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_traincup.head()

In [8]:
def split_data(data: pd.DataFrame, cols_name_split: list, rows_split_perc=1):
    '''
    Split a DataFrame by columns (inputs / targets), by rows (training / validation), or both.

    The row split is positional and ordered: the first ``int(n_rows * rows_split_perc)``
    rows form the training part and the remaining rows form the validation part.
    No shuffling is performed.

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame. It is never modified.
    cols_name_split : list
        Names of the target columns. An empty list means "no column split".
    rows_split_perc : float, default 1
        Fraction of the rows assigned to the training part. ``1`` means "no row split".

    Returns
    -------
    list of two DataFrames ``[data_train, data_val]``
        When ``cols_name_split == []`` and ``rows_split_perc != 1``.
    tuple ``(x, y)``
        When only a column split is requested (``rows_split_perc == 1``):
        ``x`` is ``data`` without the target columns, ``y`` holds the target columns.
    tuple ``(x_train, y_train, x_val, y_val)``
        When both a column split and a row split are requested.

    Raises
    ------
    ValueError
        When neither a column split nor a row split is requested.
    '''
    no_column_split = isinstance(cols_name_split, list) and not cols_name_split
    no_row_split = rows_split_perc == 1

    if no_column_split:
        if no_row_split:
            raise ValueError(
                "Nothing to split: pass target columns in 'cols_name_split' "
                "and/or a 'rows_split_perc' different from 1."
            )
        # Rows-only split. Positional slicing is used instead of np.split, which
        # goes through the deprecated DataFrame.swapaxes on recent pandas.
        cut = int(data.shape[0] * rows_split_perc)
        return [data.iloc[:cut], data.iloc[cut:]]

    # Columns split (copies, so later edits never touch the caller's frame)
    y = data[cols_name_split].copy(deep=True)
    x = data.drop(columns=cols_name_split).copy(deep=True)

    # Case of only columns split
    if no_row_split:
        return x, y

    # Case of both splits: the same cut index is applied to inputs and targets
    cut = int(x.shape[0] * rows_split_perc)
    x_train, x_val = x.iloc[:cut], x.iloc[cut:]
    y_train, y_val = y.iloc[:cut], y.iloc[cut:]
    return x_train, y_train, x_val, y_val

## DATA SPLIT

In [9]:
# Saving the ID columns (selecting a single column yields a Series, not a DataFrame)
df_id_train: pd.Series = df_traincup['id']
df_id_test: pd.Series = df_testcup['id']

# Drop the ID columns
df_train = df_traincup.drop(columns=['id']).copy(deep=True)
df_test = df_testcup.drop(columns=['id']).copy(deep=True)

# Names of the three target columns, shared by both splits below
target_cols = ['target_x', 'target_y', 'target_z']

# Split of columns and rows (0.8/0.2) into: TR set and Internal TS set.
# split_data cuts by position, so the internal TS set is the last 20% of the rows.
x_train, y_train, x_internal_test, y_internal_test = split_data(
    data=df_train,
    cols_name_split=target_cols,
    rows_split_perc=0.8
)

# Split on columns only for the test frame
x_test, y_test = split_data(data=df_test, cols_name_split=target_cols)

# Print of the shapes
print(f"[IDs TR SET]: {df_id_train.shape}")
print(f"[IDs TS SET]: {df_id_test.shape}")
print(f"[TR SET - x]: {x_train.shape}")
print(f"[TR SET - y]: {y_train.shape}")
print(f"[Internal TS SET - x]: {x_internal_test.shape}")
print(f"[Internal TS SET - y]: {y_internal_test.shape}")
print(f"[TS SET - x]: {x_test.shape}")
print(f"[TS SET - y]: {y_test.shape}")

[IDs TR SET]: (1000,)
[IDs TS SET]: (900,)
[TR SET - x]: (800, 10)
[TR SET - y]: (800, 3)
[Internal TS SET - x]: (200, 10)
[Internal TS SET - y]: (200, 3)
[TS SET - x]: (900, 10)
[TS SET - y]: (900, 3)


In [10]:
# Define custom loss function
def euclidean_distance_loss(y_true, y_pred, per_sample=False):
    """Euclidean (L2) distance between predictions and targets.

    Inputs are converted to float arrays and compared by position, so pandas
    index labels play no role in the result.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth and predicted values; both must have the same shape.
    per_sample : bool, default False
        False: return a single scalar, the L2 norm of the whole residual array
        (sum over every sample and every output).
        True: return an array of shape (n_samples,) holding the Euclidean
        distance of each sample's residual vector.

    Returns
    -------
    float or np.ndarray
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    squared_residuals = np.square(pred_arr - true_arr)

    if not per_sample:
        return np.sqrt(np.sum(squared_residuals))

    # A 1-D input is a single-output problem: treat it as (n_samples, 1) so the
    # reduction runs over the outputs of each sample, never over the samples.
    if squared_residuals.ndim < 2:
        squared_residuals = squared_residuals.reshape(-1, 1)
    output_axes = tuple(range(1, squared_residuals.ndim))
    return np.sqrt(np.sum(squared_residuals, axis=output_axes))

# Define a custom scorer based on the custom loss function
def MEE(y_true, y_pred):
    """Mean Euclidean Error: the per-sample Euclidean distance averaged over samples.

    For single-output (1-D) inputs this equals the mean absolute error.
    """
    # The mean must be taken over per-sample distances; averaging the single
    # global norm would just return that norm unchanged.
    return np.mean(euclidean_distance_loss(y_true, y_pred, per_sample=True))

# Wrap MEE as a scikit-learn scorer for the model-selection utilities.
# MEE is an error (lower is better), hence greater_is_better=False.
scorer = make_scorer(score_func=MEE, greater_is_better=False)

### STANDARDIZATION


In regression problems, and for distance-based algorithms, it is customary to standardize the output as well, because the scales of the output and of the input features may differ.

If you normalize your data, the cost function is well behaved, which means that local minima can be found more easily. The reason is that, in regression problems, the output has to be constructed from the input features, and it is difficult to produce large values from small, normalized features. With a normalized output the target values are small as well, so the mapping is easier to fit and can be learned faster.



In [11]:
# Learn the per-feature statistics from the training inputs and standardize them in one step.
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)

# Wrap the standardized array back into a DataFrame (x_train is overwritten).
x_train = pd.DataFrame(data=x_train_std)

In [12]:
# Standardize the internal test inputs with the statistics learned from the
# TRAINING inputs. The scaler must not be re-fitted on held-out data: doing so
# would scale the test set with its own mean/std, so the model would see inputs
# on a different scale than it was trained on, and test information would leak
# into preprocessing.
# NOTE: relies on `scaler` still holding the x_train fit from the previous cell,
# so the cells must be run in order.
x_internal_test_std = scaler.transform(x_internal_test)
x_internal_test = pd.DataFrame(x_internal_test_std)

In [13]:
# Standardize the training targets. This re-fits the shared `scaler` object, so
# from this point on it holds the target statistics, not the input statistics.
y_train_std = scaler.fit_transform(y_train)

# Wrap the standardized array back into a DataFrame (y_train is overwritten).
y_train = pd.DataFrame(data=y_train_std)

In [14]:
# Standardize the internal test targets with the statistics learned from the
# TRAINING targets (the scaler was fitted on y_train in the previous cell).
# Re-fitting on the held-out targets would put them on a different scale than
# the one the models were trained to predict, making the test error meaningless.
# NOTE(review): errors computed on these targets are in standardized units;
# scaler.inverse_transform can map predictions back to the original units.
y_internal_test_std = scaler.transform(y_internal_test)
y_internal_test = pd.DataFrame(y_internal_test_std)

## TRAINING OF 3 UNIVARIATE SVR

In [16]:
from sklearn.svm import SVR

In [17]:
# Parameter grid for the Grid Search.
# The 'precomputed' kernel is left out because it needs a square (n, n) matrix.
# NOTE(review): 'degree' is presumably irrelevant for the 'rbf' kernel - confirm
# against the SVR documentation before widening the kernel list.
param_grid = {
    'C': [10],
    'epsilon': [0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
    'degree': [2],
    'gamma': ['scale', 'auto'],
    'max_iter': [340],
}

### label x

In [18]:
# Column 0 of the re-wrapped target frames is the first target (target_x).
y_train_x = y_train.loc[:, 0]
y_test_x = y_internal_test.loc[:, 0]

In [None]:
# Preview only the first values instead of rendering the whole series.
y_train_x.head()

In [20]:
# 5-fold grid search for label x, ranked by negated MSE (an error, so lower is better).
grid_SVR_x = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=5,
    verbose=0,
)

In [None]:
# Run the search on the standardized training inputs against label x.
grid_SVR_x.fit(X=x_train, y=y_train_x)

In [None]:
# The search object is a GridSearchCV, so the message names it accordingly.
print("Best parameters obtained by Grid Search - label x:", grid_SVR_x.best_params_)

In [23]:
# Keep the estimator that the grid search selected as best for label x.
best_regressor_x = grid_SVR_x.best_estimator_

In [24]:
# Predictions of the selected label-x model on the training inputs.
y_train_pred_x = best_regressor_x.predict(x_train)

In [None]:
# Training error for label x, computed with the notebook's MEE helper.
# Note the name: this is `train_loss` (no `_x` suffix), unlike the y/z counterparts.
train_loss = MEE(y_train_x, y_train_pred_x)
print("MEE - label x: ", train_loss)

In [None]:
# Evaluate the label-x model on the internal test split
# (inputs: x_internal_test, targets: y_test_x).
y_val_pred_x = best_regressor_x.predict(x_internal_test)
val_loss_x = MEE(y_test_x, y_val_pred_x)
print("val loss - label x: ", val_loss_x)

### label y

In [None]:
# Column 1 of the re-wrapped target frames is the second target (target_y).
y_train_y = y_train[1]
y_test_y = y_internal_test[1]

#training
# Rank candidates with the same negated-MSE scorer used by the label-x search.
# Without `scoring`, GridSearchCV falls back to the estimator's default score,
# so the three per-label models would be tuned against different criteria.
grid_SVR_y = GridSearchCV(
    SVR(),
    param_grid,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    refit=True,
    verbose=0,
    cv=5,
)
grid_SVR_y.fit(x_train,y_train_y)


In [None]:
# The search object is a GridSearchCV, so the message names it accordingly.
print("Best parameters obtained by Grid Search - label y:", grid_SVR_y.best_params_)
# Keep the selected estimator and predict on the training inputs.
best_regressor_y = grid_SVR_y.best_estimator_
y_train_pred_y = best_regressor_y.predict(x_train)

# Training error for label y
train_loss_y = MEE(y_train_y, y_train_pred_y)
print("MEE - label y: ", train_loss_y)

In [None]:
# Evaluate the label-y model on the internal test split
# (inputs: x_internal_test, targets: y_test_y).
y_val_pred_y = best_regressor_y.predict(x_internal_test)
val_loss_y = MEE(y_test_y, y_val_pred_y)
print("val loss - label y: ", val_loss_y)

### label z

In [None]:
# Column 2 of the re-wrapped target frames is the third target (target_z).
y_train_z = y_train[2]
y_test_z = y_internal_test[2]

#training
# Rank candidates with the same negated-MSE scorer used by the label-x search.
# Without `scoring`, GridSearchCV falls back to the estimator's default score,
# so the three per-label models would be tuned against different criteria.
grid_SVR_z = GridSearchCV(
    SVR(),
    param_grid,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    refit=True,
    verbose=0,
    cv=5,
)
grid_SVR_z.fit(x_train,y_train_z)

In [None]:
# The search object is a GridSearchCV, so the message names it accordingly.
print("Best parameters obtained by Grid Search - label z:", grid_SVR_z.best_params_)
# Keep the selected estimator and predict on the training inputs.
best_regressor_z = grid_SVR_z.best_estimator_
y_train_pred_z = best_regressor_z.predict(x_train)

# Training error for label z
train_loss_z = MEE(y_train_z, y_train_pred_z)
print("MEE - label z: ", train_loss_z)

In [None]:
# Evaluate the label-z model on the internal test split
# (inputs: x_internal_test, targets: y_test_z).
y_val_pred_z = best_regressor_z.predict(x_internal_test)
val_loss_z = MEE(y_test_z, y_val_pred_z)
print("val loss - label z: ", val_loss_z)

### MEE COMBINED

In [None]:
# Combined Mean Euclidean Error over the three targets.
# The MEE is the mean, over samples, of the Euclidean norm of each sample's
# 3-D residual vector. Averaging three per-label scalars does not give that
# quantity, so the per-label columns are stacked back into (n_samples, 3)
# arrays and the norm is taken row by row.
val_true = np.column_stack([y_test_x, y_test_y, y_test_z])
val_pred = np.column_stack([y_val_pred_x, y_val_pred_y, y_val_pred_z])
combined_error = np.mean(np.linalg.norm(val_pred - val_true, axis=1))

train_true = np.column_stack([y_train_x, y_train_y, y_train_z])
train_pred = np.column_stack([y_train_pred_x, y_train_pred_y, y_train_pred_z])
combined_error1 = np.mean(np.linalg.norm(train_pred - train_true, axis=1))

mean_euclidean_error = combined_error
print("Mean Euclidean Error combinato:", mean_euclidean_error)
print("mean euclidean error training combinato:", combined_error1)