In [24]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from libpysal.weights import lat2W
from esda.moran import Moran
import os 


In [25]:
# load data and compute rmse/mae
# load metadata

metadata = pd.read_csv('dataset_info.csv')
metadata.head()


Unnamed: 0,id,relationship,coeficients,trend,surface_level,spatial_autocorrelation,dataset_path,target_path,trend_path
0,0,linear,"[[1.5, 0.25], [1.5, -0.25], [-1.5, 0.25], [-1....",none,small,high,dataset-TR_none_rel-linear-lvl_small-sac-high.csv,target-TR_none_rel-linear-lvl_small-sac-high.csv,trend_none.csv
1,1,linear,"[[1.5, 0.25], [1.5, -0.25], [-1.5, 0.25], [-1....",none,small,low,dataset-TR_none_rel-linear-lvl_small-sac-low.csv,target-TR_none_rel-linear-lvl_small-sac-low.csv,trend_none.csv
2,2,linear,"[[1.5, 0.25], [1.5, -0.25], [-1.5, 0.25], [-1....",none,small,none,dataset-TR_none_rel-linear-lvl_small-sac-none.csv,target-TR_none_rel-linear-lvl_small-sac-none.csv,trend_none.csv
3,3,polynom2,"[[2, -1, 0.25], [-2, -1, -0.25], [-2, 1, -0.25...",none,small,high,dataset-TR_none_rel-polynom2-lvl_small-sac-hig...,target-TR_none_rel-polynom2-lvl_small-sac-high...,trend_none.csv
4,4,polynom2,"[[2, -1, 0.25], [-2, -1, -0.25], [-2, 1, -0.25...",none,small,low,dataset-TR_none_rel-polynom2-lvl_small-sac-low...,target-TR_none_rel-polynom2-lvl_small-sac-low.csv,trend_none.csv


In [26]:
spatial_autocorrelation_val = []

m = 160

# calculate value of spatial autocorrelation 
for row in tqdm(metadata.iterrows(), total=len(metadata)):

    # name of the file
    name = row[1]['target_path']

    # load values of dependent feature
    label = np.genfromtxt(f"data_labels\\{name}", delimiter=',')

    w = lat2W(m, m, rook=False, id_type="int")
    mi = Moran(label.reshape(m, m), w).I
    spatial_autocorrelation_val.append(mi)

HBox(children=(FloatProgress(value=0.0, max=135.0), HTML(value='')))




In [27]:
def difference_percent(old, new):
    old_arr = np.array(old)
    new_arr = np.array(new)
    return list(((old_arr-new_arr)/old_arr)*100)

### LINEAR REGRESSION

In [40]:
# LINEAR REGRESION
lr_coord_rmse = []
lr_no_coord_rmse = []
glr_rmse = []
msglr_rmse = []
lr_bandwidth = []

In [41]:
for row in tqdm(metadata.iterrows(), total=len(metadata)):

    id_dataset = row[1]['id']

    # load dataset
    features =  np.genfromtxt(f"data_features\\{row[1]['dataset_path']}", delimiter=',')
    labels = np.genfromtxt(f"data_labels\\{row[1]['target_path']}", delimiter=',')

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    c_test = X_test[:, -2:]

    limits_ind = [20,120,20,120]
    limits_indx = np.where((c_test[:,0] > limits_ind[0]) & (c_test[:,0] < limits_ind[1])
                            & (c_test[:,1] > limits_ind[2]) & (c_test[:,1] < limits_ind[3]))[0]
    
    y_test = y_test[limits_indx]

    # LINEAR REGRESSION
    # load prediction from coord LR
    pred_lr_coord = np.genfromtxt(f"testingLR\\LR_coord\\LR_coord_id_{id_dataset}.csv", delimiter=',')

    # load prediction from no coord LR
    pred_lr_no_coord = np.genfromtxt(f"testingLR\\LR\\LR_id_{id_dataset}.csv", delimiter=',')

    # load prediction from GLR
    pred_glr = np.genfromtxt(f"testingLR\\GLR\\GLR_id_{id_dataset}.csv", delimiter=',')

    # load prediction from MSGLR 
    pred_msglr = np.genfromtxt(f"testingLR\\MSGLR\\MSGLR_id_{id_dataset}.csv", delimiter=',')

    # calculate rmse
    rmse_lr_coord = mean_squared_error(y_test, pred_lr_coord, squared=False)
    rmse_lr_no_coord = mean_squared_error(y_test, pred_lr_no_coord, squared=False)
    rmse_glr = mean_squared_error(y_test, pred_glr, squared=False)
    rmse_msglr = mean_squared_error(y_test, pred_msglr, squared=False)

    # get single bandwidth
    bandwidth = np.genfromtxt(f"tuningLR\\single_bandwidth\\single_bandwidth_id_{id_dataset}.csv", delimiter=',')

    # append to list
    lr_coord_rmse.append(rmse_lr_coord)
    lr_no_coord_rmse.append(rmse_lr_no_coord)
    glr_rmse.append(rmse_glr)
    msglr_rmse.append(rmse_msglr)
    lr_bandwidth.append(bandwidth)

HBox(children=(FloatProgress(value=0.0, max=135.0), HTML(value='')))




In [53]:


result_dataLR = metadata[['relationship','trend', 'surface_level', 'spatial_autocorrelation', ]]
result_dataLR['algorithm'] = ['LinearRegression'] * len(result_dataLR)
result_dataLR['spatial_autocorrelation_val'] = spatial_autocorrelation_val
result_dataLR['bandwidth'] = lr_bandwidth

result_dataLR['Coordinates'] = lr_coord_rmse
result_dataLR['No_coordinates'] = lr_no_coord_rmse
result_dataLR['Add_coordinates_change'] = difference_percent(lr_no_coord_rmse, lr_coord_rmse)
result_dataLR['Geographical'] = glr_rmse
result_dataLR['Geographical_change'] = difference_percent(lr_no_coord_rmse, glr_rmse)
result_dataLR['Multiscale'] = msglr_rmse
result_dataLR['Multiscale_change'] = difference_percent(lr_no_coord_rmse, msglr_rmse)

result_dataLR.to_csv("resultLR.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataLR['algorithm'] = ['LinearRegression'] * len(result_dataLR)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataLR['spatial_autocorrelation_val'] = spatial_autocorrelation_val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataLR['bandwidth'] = lr_bandwidth
A value is trying

In [8]:
result_data = metadata[['relationship','trend', 'surface_level', 'spatial_autocorrelation', ]]
len(result_data)

135

### RANDOM FOREST

In [43]:
# RANDOM FOREST
rf_coord_rmse = []
rf_no_coord_rmse = []
grf_rmse = []
msgrf_rmse = []

rf_bandwidth = []

In [44]:
for row in tqdm(metadata.iterrows(), total=len(metadata)):

    id_dataset = row[1]['id']

    # load dataset
    features =  np.genfromtxt(f"data_features\\{row[1]['dataset_path']}", delimiter=',')
    labels = np.genfromtxt(f"data_labels\\{row[1]['target_path']}", delimiter=',')

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    c_test = X_test[:, -2:]

    limits_ind = [20,120,20,120]
    limits_indx = np.where((c_test[:,0] > limits_ind[0]) & (c_test[:,0] < limits_ind[1])
                            & (c_test[:,1] > limits_ind[2]) & (c_test[:,1] < limits_ind[3]))[0]
    
    y_test = y_test[limits_indx]

    # RANDOM FOREST
    # load prediction from coord RF
    pred_rf_coord = np.genfromtxt(f"testingRF\\RF_coord\\RF_coord_id_{id_dataset}.csv", delimiter=',')

    # load prediction from no coord RF
    pred_rf_no_coord = np.genfromtxt(f"testingRF\\RF\\RF_id_{id_dataset}.csv", delimiter=',')

    # load prediction from GRF
    pred_grf = np.genfromtxt(f"testingRF\\GRF\\GRF_id_{id_dataset}.csv", delimiter=',')

    # load prediction from MSGRF 
    pred_msgrf = np.genfromtxt(f"testingRF\\MSGRF\\MSGRF_id_{id_dataset}.csv", delimiter=',')

    # get single bandwidth
    bandwidth = np.genfromtxt(f"tuningRF\\single_bandwidth\\single_bandwidth_id_{id_dataset}.csv", delimiter=',')

    # calculate rmse
    rmse_rf_coord = mean_squared_error(y_test, pred_rf_coord, squared=False)
    rmse_rf_no_coord = mean_squared_error(y_test, pred_rf_no_coord, squared=False)
    rmse_grf = mean_squared_error(y_test, pred_grf, squared=False)
    rmse_msgrf = mean_squared_error(y_test, pred_msgrf, squared=False)

    rf_bandwidth.append(bandwidth)

    # append to list
    rf_coord_rmse.append(rmse_rf_coord)
    rf_no_coord_rmse.append(rmse_rf_no_coord)
    grf_rmse.append(rmse_grf)
    msgrf_rmse.append(rmse_msgrf)

HBox(children=(FloatProgress(value=0.0, max=135.0), HTML(value='')))




In [52]:
result_dataRF = metadata[['relationship','trend', 'surface_level', 'spatial_autocorrelation', ]]
result_dataRF['algorithm'] = ['RandomForest'] * len(result_data)
result_dataRF['spatial_autocorrelation_val'] = spatial_autocorrelation_val
result_dataRF['bandwidth'] = rf_bandwidth


result_dataRF['Coordinates'] = rf_coord_rmse
result_dataRF['No_coordinates'] = rf_no_coord_rmse
result_dataRF['Add_coordinates_change'] = difference_percent(rf_no_coord_rmse, rf_coord_rmse)
result_dataRF['Geographical'] = grf_rmse
result_dataRF['Geographical_change'] = difference_percent(rf_no_coord_rmse, grf_rmse)
result_dataRF['Multiscale'] = msgrf_rmse
result_dataRF['Multiscale_change'] = difference_percent(rf_no_coord_rmse, msgrf_rmse)

result_dataRF.to_csv("resultRF.csv")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataRF['algorithm'] = ['RandomForest'] * len(result_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataRF['spatial_autocorrelation_val'] = spatial_autocorrelation_val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataRF['bandwidth'] = rf_bandwidth
A value is trying to be

### SUPPORT VECTOR MACHINES

In [46]:
# SVM
svm_coord_rmse = []
svm_no_coord_rmse = []
gsvm_rmse = []
msgsvm_rmse = []
svm_bandwidth = []

In [47]:
for row in tqdm(metadata.iterrows(), total=len(metadata)):

    id_dataset = row[1]['id']

    # load dataset
    features =  np.genfromtxt(f"data_features\\{row[1]['dataset_path']}", delimiter=',')
    labels = np.genfromtxt(f"data_labels\\{row[1]['target_path']}", delimiter=',')

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    c_test = X_test[:, -2:]

    limits_ind = [20,120,20,120]
    limits_indx = np.where((c_test[:,0] > limits_ind[0]) & (c_test[:,0] < limits_ind[1])
                            & (c_test[:,1] > limits_ind[2]) & (c_test[:,1] < limits_ind[3]))[0]
    
    y_test = y_test[limits_indx]


    X_train_split, dummy_x, y_train_split, dummy_y = train_test_split(X_train, y_train, test_size=0.6, random_state=42)
    X_train_global, X_test_global, y_train_global, y_test_global = train_test_split(X_train_split, y_train_split, test_size=0.2, random_state=42)
    

    # SUPPORT VECTOR MACHINES
    # load prediction from coord SVM
    pred_svm_coord = np.genfromtxt(f"testingSVM\\SVM_coord\\SVM_coord_id_{id_dataset}.csv", delimiter=',')

    # load prediction from no coord SVM
    pred_svm_no_coord = np.genfromtxt(f"testingSVM\\SVM\\SVM_id_{id_dataset}.csv", delimiter=',')
    
    # load prediction from GSVM
    pred_gsvm = np.genfromtxt(f"testingSVM\\GSVM\\GSVM_id_{id_dataset}.csv", delimiter=',')

    # load prediction from MSGSVM 
    pred_msgsvm = np.genfromtxt(f"testingSVM\\MSGSVM\\MSGSVM_id_{id_dataset}.csv", delimiter=',')

    # get single bandwidth
    bandwidth = np.genfromtxt(f"tuningSVM\\single_bandwidth\\single_bandwidth_id_{id_dataset}.csv", delimiter=',')

    # calculate rmse
    rmse_svm_coord = mean_squared_error(y_test_global, pred_svm_coord, squared=False)
    rmse_svm_no_coord = mean_squared_error(y_test_global, pred_svm_no_coord, squared=False)
    rmse_gsvm = mean_squared_error(y_test, pred_gsvm, squared=False)
    rmse_msgsvm = mean_squared_error(y_test, pred_msgsvm, squared=False)


    # append to list
    svm_coord_rmse.append(rmse_svm_coord)
    svm_no_coord_rmse.append(rmse_svm_no_coord)
    gsvm_rmse.append(rmse_gsvm)
    msgsvm_rmse.append(rmse_msgsvm)
    svm_bandwidth.append(bandwidth)


HBox(children=(FloatProgress(value=0.0, max=135.0), HTML(value='')))




In [51]:
result_dataSVM = metadata[['relationship','trend', 'surface_level', 'spatial_autocorrelation', ]]
result_dataSVM['algorithm'] = ['SupportVectorMachines'] * len(result_data)
result_dataSVM['spatial_autocorrelation_val'] = spatial_autocorrelation_val
result_dataSVM['bandwidth'] = svm_bandwidth


result_dataSVM['Coordinates'] = svm_coord_rmse
result_dataSVM['No_coordinates'] = svm_no_coord_rmse
result_dataSVM['Add_coordinates_change'] = difference_percent(svm_no_coord_rmse, svm_coord_rmse)
result_dataSVM['Geographical'] = gsvm_rmse
result_dataSVM['Geographical_change'] = difference_percent(svm_no_coord_rmse, gsvm_rmse)
result_dataSVM['Multiscale'] = msgsvm_rmse
result_dataSVM['Multiscale_change'] = difference_percent(svm_no_coord_rmse, msgsvm_rmse)

result_dataSVM.to_csv("resultSVM.csv")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataSVM['algorithm'] = ['SupportVectorMachines'] * len(result_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataSVM['spatial_autocorrelation_val'] = spatial_autocorrelation_val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dataSVM['bandwidth'] = svm_bandwidth
A value is

In [50]:
['SupportVectorMachines'] * len(result_data)

['SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVectorMachines',
 'SupportVec