In [2]:
import ast
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

%run GeoModel.ipynb

In [3]:
def scaler(data, min_v, max_v):
    """Linearly rescale ``data`` so that its values span ``[min_v, max_v]``.

    The extremes are taken with ``data.min()`` and ``data.max()``; for a
    NumPy array that is one global minimum/maximum over all elements, so
    every column is mapped with the same transformation.

    Parameters
    ----------
    data : array-like exposing ``.min()`` / ``.max()`` (e.g. ``np.ndarray``)
        Values to rescale.
    min_v, max_v : number
        Lower and upper bound of the target interval.

    Returns
    -------
    Rescaled values with the same shape as ``data``. When every element of
    ``data`` is identical, the result is filled with ``min_v``.
    """
    lowest = data.min()
    span = data.max() - lowest

    # Constant input has a zero span; dividing by it would fill the result
    # with NaN, so map every element onto the lower bound instead.
    if np.ndim(span) == 0 and span == 0:
        return np.full(np.shape(data), float(min_v))

    return min_v + (((data - lowest) * (max_v - min_v)) / span)


### Support vector machines

#### Tuning

In [4]:
from sklearn.svm import SVR

# Catalogue of the datasets: one row per dataset, including the file names
# of its feature table ('dataset_path') and target table ('target_path').
metadata = pd.read_csv(filepath_or_buffer='dataset_info.csv')

# Preview the first five rows.
metadata.head(n=5)

Unnamed: 0,id,relationship,coeficients,trend,surface_level,spatial_autocorrelation,dataset_path,target_path,trend_path
0,0,linear,"[[1.5, 0.25], [1.5, -0.25], [-1.5, 0.25], [-1....",none,small,high,dataset-TR_none_rel-linear-lvl_small-sac-high.csv,target-TR_none_rel-linear-lvl_small-sac-high.csv,trend_none.csv
1,1,linear,"[[1.5, 0.25], [1.5, -0.25], [-1.5, 0.25], [-1....",none,small,low,dataset-TR_none_rel-linear-lvl_small-sac-low.csv,target-TR_none_rel-linear-lvl_small-sac-low.csv,trend_none.csv
2,2,linear,"[[1.5, 0.25], [1.5, -0.25], [-1.5, 0.25], [-1....",none,small,none,dataset-TR_none_rel-linear-lvl_small-sac-none.csv,target-TR_none_rel-linear-lvl_small-sac-none.csv,trend_none.csv
3,3,polynom2,"[[2, -1, 0.25], [-2, -1, -0.25], [-2, 1, -0.25...",none,small,high,dataset-TR_none_rel-polynom2-lvl_small-sac-hig...,target-TR_none_rel-polynom2-lvl_small-sac-high...,trend_none.csv
4,4,polynom2,"[[2, -1, 0.25], [-2, -1, -0.25], [-2, 1, -0.25...",none,small,low,dataset-TR_none_rel-polynom2-lvl_small-sac-low...,target-TR_none_rel-polynom2-lvl_small-sac-low.csv,trend_none.csv


In [5]:

from sklearn.svm import SVR

# Hyper-parameter grid handed to GridSearchCV when tuning the SVR models:
# two kernel types crossed with five values of the regularisation constant C.
SVM_param_to_test = dict(
    kernel=['linear', 'rbf'],
    C=[0.01, 0.1, 1, 10, 100],
)

In [5]:
%run GeoModel.ipynb

# Tune the SVR hyper-parameters and the GSVM bandwidths for every dataset
# listed in `metadata` from positional row 126 onward, and store the results
# under the tuningSVM directory.
for _row_label, row in metadata.iloc[126:].iterrows():

    id_dataset = row['id']

    # os.path.join keeps these paths valid on every operating system; the
    # previously hard-coded "\\" separators only resolved on Windows.
    features = np.genfromtxt(os.path.join("data_features", row['dataset_path']), delimiter=',')
    labels = np.genfromtxt(os.path.join("data_labels", row['target_path']), delimiter=',')

    # 80/20 split; only the training part is needed in this cell.
    X_train, _X_holdout, y_train, _y_holdout = train_test_split(features, labels, test_size=0.2, random_state=42)

    # The last two feature columns are split off as coordinates.
    c_train = X_train[:, -2:]
    X_train = X_train[:, :-2]

    # smaller sampling set (20 % of the rows) for tuning of SVR parameters
    # NOTE(review): this subset is drawn from the full dataset rather than
    # from X_train - confirm it cannot overlap the rows held out for testing.
    X_tune, _X_rest, y_train_t, _y_rest = train_test_split(features, labels, test_size=0.8, random_state=42)

    c_train_t = X_tune[:, -2:]
    X_train_t = X_tune[:, :-2]

    # without coordinates
    SVM = SVR()
    clf = GridSearchCV(SVM, SVM_param_to_test)
    clf.fit(X_train_t, y_train_t)
    best_param_nc = clf.best_params_

    # with coordinates (rescaled to [-1, 1] and appended as extra features)
    c_train_t_scaled = scaler(c_train_t, -1, 1)
    X_train_t_coord = np.hstack((X_train_t, c_train_t_scaled))

    SVM = SVR()
    clf = GridSearchCV(SVM, SVM_param_to_test)
    clf.fit(X_train_t_coord, y_train_t)
    best_param_c = clf.best_params_

    # Bandwidth tuning of the geographical model, using the SVR parameters
    # found without coordinates.
    GSVM = GeographicalModel('SVR', best_param_nc, 'fixed', 'gaussian', [200])
    single_bandwidth, best_bandwidth_arr = GSVM.tune(c_train, X_train, y_train, -2, [5,10,15,20,25,30, 35, 40], step=3, limits=True, limits_ind=[20,120,20,120])

    # np.savetxt needs an array, so the single bandwidth is wrapped in one.
    single_bandwidth_arr = np.array([single_bandwidth])

    np.savetxt(os.path.join("tuningSVM", "single_bandwidth", f"single_bandwidth_id_{id_dataset}.csv"), single_bandwidth_arr, delimiter=",")
    np.savetxt(os.path.join("tuningSVM", "bandwidth", f"bandwidth_id_{id_dataset}.csv"), best_bandwidth_arr, delimiter=",")

    # The parameter dictionaries are stored as their Python repr (one line).
    with open(os.path.join("tuningSVM", "paramSVM", f"params_id_{id_dataset}.txt"), 'w') as f:
        print(best_param_nc, file=f)

    with open(os.path.join("tuningSVM", "paramSVM_coord", f"params_id_{id_dataset}.txt"), 'w') as f:
        print(best_param_c, file=f)




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




#### Without tuning of SVM parameters

In [1]:
# Fixed SVR settings used when the SVM hyper-parameters are not tuned.
params = dict(kernel='rbf', C=1)

In [7]:
%run GeoModel.ipynb

# Tune only the GSVM bandwidths (fixed SVR parameters) for the datasets in
# positional rows 24-53 of `metadata`, and store them under tuningSVM.
for _row_label, row in metadata.iloc[24:54].iterrows():

    id_dataset = row['id']

    # os.path.join keeps these paths valid on every operating system; the
    # previously hard-coded "\\" separators only resolved on Windows.
    features = np.genfromtxt(os.path.join("data_features", row['dataset_path']), delimiter=',')
    labels = np.genfromtxt(os.path.join("data_labels", row['target_path']), delimiter=',')

    # 80/20 split; only the training part is needed in this cell.
    X_train, _X_holdout, y_train, _y_holdout = train_test_split(features, labels, test_size=0.2, random_state=42)

    # The last two feature columns are split off as coordinates.
    c_train = X_train[:, -2:]
    X_train = X_train[:, :-2]

    # Fixed SVR settings; rebuilt on every iteration so each model receives
    # its own dictionary.
    params = {'kernel':'rbf', 'C':1}

    GSVM = GeographicalModel('SVR', params, 'fixed', 'gaussian', [200])
    single_bandwidth, best_bandwidth_arr = GSVM.tune(c_train, X_train, y_train, -2, [5,10,15,20,25,30, 35, 40], step=2, limits=True, limits_ind=[20,120,20,120])

    # np.savetxt needs an array, so the single bandwidth is wrapped in one.
    single_bandwidth_arr = np.array([single_bandwidth])

    np.savetxt(os.path.join("tuningSVM", "single_bandwidth", f"single_bandwidth_id_{id_dataset}.csv"), single_bandwidth_arr, delimiter=",")
    np.savetxt(os.path.join("tuningSVM", "bandwidth", f"bandwidth_id_{id_dataset}.csv"), best_bandwidth_arr, delimiter=",")



HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




#### Testing

%run GeoModel.ipynb



In [8]:
%run GeoModel.ipynb

# Evaluate four models on every dataset from positional row 52 onward:
# global SVR with and without coordinates, and the geographical SVR with a
# single bandwidth (GSVM) and with the tuned bandwidth file (MSGSVM).
# Predictions are written under the testingSVM directory.

# Bounds of the evaluation window: (low, high) for the first coordinate
# column followed by (low, high) for the second. Loop-invariant.
limits_ind = [20,120,20,120]

datasets_to_test = metadata.iloc[52:]

for _row_label, row in tqdm(datasets_to_test.iterrows(), total=len(datasets_to_test)):

    # load data
    id_dataset = row['id']

    # os.path.join keeps these paths valid on every operating system; the
    # previously hard-coded "\\" separators only resolved on Windows.
    features = np.genfromtxt(os.path.join("data_features", row['dataset_path']), delimiter=',')
    labels = np.genfromtxt(os.path.join("data_labels", row['target_path']), delimiter=',')

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # The last two feature columns are split off as coordinates.
    c_train = X_train[:, -2:]
    c_test = X_test[:, -2:]

    X_train_nocoord = X_train[:, :-2]
    X_test_nocoord = X_test[:, :-2]

    # Keep only the test points lying strictly inside the evaluation window.
    inside_window = ((c_test[:,0] > limits_ind[0]) & (c_test[:,0] < limits_ind[1])
                     & (c_test[:,1] > limits_ind[2]) & (c_test[:,1] < limits_ind[3]))
    limits_indx = np.where(inside_window)[0]

    c_test_limits = c_test[limits_indx]
    X_test_nocoord_limits = X_test_nocoord[limits_indx]

    bandwidth = np.genfromtxt(os.path.join("tuningSVM", "bandwidth", f"bandwidth_id_{id_dataset}.csv"), delimiter=',')
    # get single bandwidth
    single_bandwidth = np.genfromtxt(os.path.join("tuningSVM", "single_bandwidth", f"single_bandwidth_id_{id_dataset}.csv"), delimiter=',')
    single_bandwidth = int(single_bandwidth)

    # The parameter files hold the repr of a plain dict; ast.literal_eval
    # parses it without executing arbitrary code the way eval() would.
    with open(os.path.join("tuningSVM", "paramSVM_coord", f"params_id_{id_dataset}.txt")) as file:
        param_coord = ast.literal_eval(file.read())

    with open(os.path.join("tuningSVM", "paramSVM", f"params_id_{id_dataset}.txt")) as file:
        param = ast.literal_eval(file.read())

    # -------------------------- split for global models
    # Keep 40 % of the training rows, then split those 80/20 into the
    # train/test sets of the global models.
    X_train_split, dummy_x, y_train_split, dummy_y = train_test_split(X_train, y_train, test_size=0.6, random_state=42)
    X_train_global, X_test_global, y_train_global, y_test_global = train_test_split(X_train_split, y_train_split, test_size=0.2, random_state=42)

    c_train_global = X_train_global[:, -2:]
    c_test_global = X_test_global[:, -2:]

    X_train_global_nocoord = X_train_global[:, :-2]
    X_test_global_nocoord = X_test_global[:, :-2]

    # Rescale the coordinates to [-1, 1]. The test coordinates reuse the
    # minimum and range of the training coordinates so that both sets go
    # through the same transformation (scaling the test set with its own
    # extremes would shift it relative to what the model was fitted on).
    c_train_global_scaled = scaler(c_train_global, -1, 1)
    coord_min = c_train_global.min()
    coord_span = c_train_global.max() - coord_min
    c_test_global_scaled = -1 + ((c_test_global - coord_min) * 2) / coord_span

    X_train_global_coord = np.hstack((X_train_global_nocoord, c_train_global_scaled))
    X_test_global_coord = np.hstack((X_test_global_nocoord, c_test_global_scaled))

    # train and fit global model
    # with coordinates
    SVM_coord = SVR(**param_coord)
    SVM_coord.fit(X_train_global_coord, y_train_global)
    pred_SVM_coord = SVM_coord.predict(X_test_global_coord)

    # without coordinates
    SVM_g = SVR(**param)
    SVM_g.fit(X_train_global_nocoord, y_train_global)
    pred_SVM = SVM_g.predict(X_test_global_nocoord)

    # local models
    # NOTE(review): the constructor receives 'gaussian' while predict() is
    # called with 'linear' - confirm which kernel is intended.
    GSVM = GeographicalModel('SVR', param, 'fixed', 'gaussian', [200])

    # GSVM: prediction with the single tuned bandwidth
    GSVM_pred = GSVM.predict(single_bandwidth, 'fixed', 'linear', c_train, X_train_nocoord, y_train, c_test_limits, X_test_nocoord_limits, -2)

    # MSGSVM: same call with the contents of the tuned bandwidth file
    MSGSVM_pred = GSVM.predict(bandwidth, 'fixed', 'linear', c_train, X_train_nocoord, y_train, c_test_limits, X_test_nocoord_limits, -2)

    # save results to csv
    np.savetxt(os.path.join("testingSVM", "SVM_coord", f"SVM_coord_id_{id_dataset}.csv"), pred_SVM_coord, delimiter=",")
    np.savetxt(os.path.join("testingSVM", "SVM", f"SVM_id_{id_dataset}.csv"), pred_SVM, delimiter=",")
    np.savetxt(os.path.join("testingSVM", "GSVM", f"GSVM_id_{id_dataset}.csv"), GSVM_pred, delimiter=",")
    np.savetxt(os.path.join("testingSVM", "MSGSVM", f"MSGSVM_id_{id_dataset}.csv"), MSGSVM_pred, delimiter=",")

HBox(children=(FloatProgress(value=0.0, max=83.0), HTML(value='')))


