In [357]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MaxAbsScaler, StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from core.input_doubling_method import InputDoublingMethod, InputData
from sklearn.svm import SVR

from core.helpers import collect_cluster_center_target_coordinates
from core.errors import get_errors

# Допоміжні функції

In [358]:
def preprocess_features(x1, x2, reversed=False):
    """Build pairwise samples from every (x1 row, x2 row) combination.

    Rows of ``x1`` are visited in the outer loop and rows of ``x2`` in the
    inner loop. The last element of each row is treated as that row's target
    and is left out of the feature vector.

    Args:
        x1: Indexable collection of rows; the last element of a row is its target.
        x2: Indexable collection of rows laid out like ``x1``.
        reversed: When false, a pair yields the ``x1`` features followed by the
            ``x2`` features and the label ``x1_target - x2_target``. When true,
            both the concatenation order and the sign of the label are flipped.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with one entry per pair.
    """
    pair_features = []
    pair_labels = []
    second_count = len(x2)

    for first_index in range(len(x1)):
        first = x1[first_index]
        for second_index in range(second_count):
            second = x2[second_index]
            if reversed:
                pair_features.append(np.concatenate((second[:-1], first[:-1])))
                pair_labels.append(second[-1] - first[-1])
            else:
                pair_features.append(np.concatenate((first[:-1], second[:-1])))
                pair_labels.append(first[-1] - second[-1])

    return np.array(pair_features), np.array(pair_labels)

In [359]:
def find_yn(z, y_sum, N):
    """Combine consecutive chunks of ``z`` with ``y_sum``.

    ``z`` is walked in consecutive chunks of ``N`` elements (the final chunk
    may be shorter). For each chunk the result is
    ``(y_sum + sum(chunk)) / N``.

    Args:
        z: Sliceable sequence of numbers.
        y_sum: Constant added to every chunk sum.
        N: Chunk length, also used as the divisor.

    Returns:
        NumPy array with one value per chunk.
    """
    chunk_values = []
    for start in range(0, len(z), N):
        chunk_total = sum(z[start : start + N])
        chunk_values.append((y_sum + chunk_total) / N)
    return np.array(chunk_values)

In [360]:
def calculate_cluster_centers(k, data, best_labels):
    """Return the mean of ``data`` for each cluster id in ``range(k)``.

    Args:
        k: Number of clusters; the result has exactly the keys ``0 .. k-1``.
        data: Indexable sequence of numeric values, aligned with ``best_labels``.
        best_labels: Iterable of cluster ids (one per entry of ``data``), each
            expected to be in ``range(k)``.

    Returns:
        Dict mapping every cluster id to the mean of the values assigned to
        it. A cluster without any member maps to NaN.

    Raises:
        KeyError: If a label outside ``range(k)`` is encountered.
    """
    totals = {cluster_id: 0 for cluster_id in range(k)}
    sizes = {cluster_id: 0 for cluster_id in range(k)}

    for index, cluster_id in enumerate(best_labels):
        totals[cluster_id] += data[index]
        sizes[cluster_id] += 1

    # Sizes are tracked per label rather than taken positionally from
    # np.unique(..., return_counts=True): with an empty cluster the positional
    # counts no longer line up with the cluster ids.
    return {
        cluster_id: (
            totals[cluster_id] / sizes[cluster_id]
            if sizes[cluster_id]
            else float("nan")
        )
        for cluster_id in range(k)
    }


def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) norm of ``vector1 - vector2``."""
    return np.linalg.norm(vector1 - vector2)


def find_closest_cluster(vector, cluster_centers):
    """Return the index of the cluster center nearest to ``vector``.

    Args:
        vector: Point to assign; must support subtraction with each center.
        cluster_centers: Iterable of center vectors.

    Returns:
        Integer position of the closest center. On ties the first one wins.

    Raises:
        ValueError: If ``cluster_centers`` is empty or no center lies at a
            distance that compares below infinity (e.g. all distances are NaN).
    """
    closest_index = None
    closest_distance = float("inf")

    for index, center in enumerate(cluster_centers):
        distance = euclidean_distance(vector, center)

        # Strict '<' keeps the first of several equally distant centers.
        if distance < closest_distance:
            closest_distance = distance
            closest_index = index

    if closest_index is None:
        # Fail loudly instead of handing back a non-index sentinel that would
        # only blow up later as an obscure lookup error.
        raise ValueError("no cluster center at a finite distance from the vector")

    return closest_index

# Вичитка даних

In [361]:
# Seeds for the two KMeans fits: `kmeans_random_state` is used when clustering
# the features only, `kmeans_random_state_out` when clustering features + target.
kmeans_random_state = kmeans_random_state_out = 42

In [362]:
# Alternative loader kept for reference: pre-split Fracture train/test text files.
# train_dataset_path = "./datasets/FractureTrain.txt"
# test_dataset_path = "./datasets/FractureTest.txt"

# train_data = np.loadtxt(train_dataset_path, delimiter=",")
# test_data = np.loadtxt(test_dataset_path, delimiter=",")
# X_train, y_train = train_data[:, :-1], train_data[:, -1:]
# X_test, y_test = test_data[:, :-1], test_data[:, -1:]

# NOTE: the default below is applied only when `dataset` is not defined yet, so
# a value left over from an earlier run persists. Clear the kernel state to
# fall back to the default.
if "dataset" not in locals():
    dataset = "franke_function"
print("Running on dataset: ", dataset)

Running on dataset:  franke_function


In [363]:
# Read ./datasets/<dataset>.csv as a plain array; the last column is the target.
data = pd.read_csv(f"./datasets/{dataset}.csv").to_numpy()

# 80/20 split with a fixed seed; the target is sliced as a 2-D column.
X_train, X_test, y_train, y_test = train_test_split(
    data[:, :-1],
    data[:, -1:],
    test_size=0.2,
    random_state=42,
)

# Re-attach the target as the last column of each split.
train_data = np.hstack((X_train, y_train))
test_data = np.hstack((X_test, y_test))

In [364]:
# Sum of the training target column (original note: the target is the last
# column here, e.g. 20.5, 13.3, 19.6, 24.4 ...).
# NOTE: `y_sum` and `N` are both reassigned from `enriched_train_data` in a
# later cell before they are passed to InputDoublingMethod.
y_sum = sum(y_train[:, 0])
N = train_data.shape[0]

# Базові опції

In [365]:
# Base regressors. Only `svr` is passed to InputDoublingMethod.apply in the
# cells below; `gbr` and `rfg` are constructed but not used there.
svr = SVR(
    kernel="rbf",
    gamma="scale",
    coef0=0.0,
    epsilon=0.001,
    max_iter=-1,
)
gbr = GradientBoostingRegressor(random_state=42)
rfg = RandomForestRegressor(random_state=42, max_depth=5)

# Experiment label -> list of error values; filled per experiment and written
# to CSV in the last cell.
output_errors_train = {}
output_errors_test = {}

# Без виходу

In [366]:
# Number of KMeans clusters. The default (2) is applied only when N_CLUSTERS is
# not defined yet, so a value supplied before this cell runs is respected.
# An unconditional assignment ahead of this check would make it unreachable.
# NOTE: as with `dataset`, a value left over from an earlier run persists until
# the kernel state is cleared.
if "N_CLUSTERS" not in locals():
    N_CLUSTERS = 2
print("Num of clusters: ", N_CLUSTERS)

# Cluster the training features only (no target column).
kmeans = KMeans(
    n_clusters=N_CLUSTERS, random_state=kmeans_random_state, n_init="auto"
).fit(X_train)

Num of clusters:  2


In [367]:
# Mean training target per feature-space cluster.
cluster_centers_y = calculate_cluster_centers(
    k=N_CLUSTERS,
    data=y_train.flatten(),
    best_labels=kmeans.labels_,
)
# Disabled variant (refers to `kmeans_train`, which is not defined in the visible cells):
# cluster_centers = np.hstack((kmeans_train.cluster_centers_,np.array(list(cluster_centers_y.values())).reshape(-1,1)))

# Each training row receives the mean target of the cluster it was assigned to.
new_y_train = [cluster_centers_y[cluster_id] for cluster_id in kmeans.labels_]

In [368]:
# Assign every test row to its nearest feature-space cluster center.
# A dedicated name is used for the cluster ids: `test_labels` is the name a
# later cell gives to the pairwise label differences produced by
# preprocess_features, and sharing it would let an out-of-order re-run of this
# cell feed cluster ids into the model input.
test_cluster_labels = [
    find_closest_cluster(vector, kmeans.cluster_centers_) for vector in X_test
]

# Each test row receives the mean training target of its assigned cluster.
new_y_test = [cluster_centers_y[label] for label in test_cluster_labels]

In [369]:
# Column layout of the enriched matrices:
#   [original features..., cluster mean target, true target]
enriched_train_data = np.column_stack(
    (train_data[:, :-1], np.asarray(new_y_train), train_data[:, -1])
)
enriched_test_data = np.column_stack(
    (test_data[:, :-1], np.asarray(new_y_test), test_data[:, -1])
)

In [370]:
# Target column (the last one) of the enriched train / test matrices as 1-D vectors.
y_target_train = enriched_train_data[:, -1].copy()
y_target_test = enriched_test_data[:, -1].copy()

# Sum of the training targets (original note: e.g. 20.5, 13.3, 19.6, 24.4 ...)
# and the number of training rows.
y_sum = sum(y_target_train)
N = enriched_train_data.shape[0]

In [371]:
# Pair every train row with every train row, and every test row with every
# train row. preprocess_features drops each row's last column (the true target)
# from the features, so the cluster-derived column stays in as a feature.
train_features, train_labels = preprocess_features(
    x1=enriched_train_data, x2=enriched_train_data
)
test_features, test_labels = preprocess_features(
    x1=enriched_test_data, x2=enriched_train_data
)

In [372]:
# Fit the scaler on the training pairs only, then apply it to both sets.
scaler = MaxAbsScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Результати без виходу

In [373]:
# Run the input-doubling method with SVR on the "without output" features.
idm = InputDoublingMethod(y_sum=y_sum, N=N)
input_data = InputData(
    train_features,
    train_labels,
    test_features,
    test_labels,
    y_target_train,
    y_target_test,
)
yn_train, yn_test = idm.apply(svr, input_data)

# get_errors yields (name, value) pairs; only the values are kept.
train_erros = [
    metric_value for _metric_name, metric_value in get_errors(y_target_train, yn_train)
]
test_errors = [
    metric_value for _metric_name, metric_value in get_errors(y_target_test, yn_test)
]

output_errors_train[f"без виходу SVR - {N_CLUSTERS}"] = train_erros
output_errors_test[f"без виходу SVR - {N_CLUSTERS}"] = test_errors

--- 2.786806583404541 seconds ---


# З виходом

In [374]:
# Cluster the full training matrix, i.e. the features together with the target column.
kmeans_out = KMeans(
    n_clusters=N_CLUSTERS,
    random_state=kmeans_random_state_out,
    n_init="auto",
)
kmeans_out.fit(train_data)

In [375]:
# Test rows have no target to cluster on, so they are matched against the
# centers with the target coordinate (last column) removed.
cluster_centers_without_y = kmeans_out.cluster_centers_[:, :-1]

# A dedicated name is used for the cluster ids: `test_labels_out` is the name a
# later cell gives to the pairwise label differences produced by
# preprocess_features, and sharing it would let an out-of-order re-run of this
# cell feed cluster ids into the model input.
test_cluster_labels_out = [
    find_closest_cluster(vector, cluster_centers_without_y) for vector in X_test
]

new_y_train_out = collect_cluster_center_target_coordinates(
    kmeans_out.cluster_centers_, kmeans_out.labels_
)
# Target (last) coordinate of the center each test row was assigned to.
new_y_test_out = [
    kmeans_out.cluster_centers_[label, -1] for label in test_cluster_labels_out
]

In [376]:
# Column layout of the enriched matrices:
#   [original features..., cluster-derived target value, true target]
enriched_train_data_out = np.column_stack(
    (train_data[:, :-1], np.asarray(new_y_train_out), train_data[:, -1])
)
enriched_test_data_out = np.column_stack(
    (test_data[:, :-1], np.asarray(new_y_test_out), test_data[:, -1])
)

In [377]:
# Augmentation step: one vector is appended to the end of another for every
# pair of rows; the resulting labels are the pairwise differences
# (original note: "our z_1, z_2, z_3, z_4").
train_features_out, train_labels_out = preprocess_features(
    x1=enriched_train_data_out, x2=enriched_train_data_out
)
# NOTE(review): the original comments here also described a reversed pass
# (first vector moved to the end, labels `train_labels2` equal to
# `train_labels` with the opposite sign). No such pass is made in this cell;
# the notes appear to be left over from an earlier version.
test_features_out, test_labels_out = preprocess_features(
    x1=enriched_test_data_out, x2=enriched_train_data_out
)

In [378]:
# Fit a fresh scaler on the "with output" training pairs only, then apply it to both sets.
scaler = MaxAbsScaler().fit(train_features_out)
train_features_out = scaler.transform(train_features_out)
test_features_out = scaler.transform(test_features_out)

# Результати з виходом

In [379]:
# Run the input-doubling method with SVR on the "with output" features.
idm_out = InputDoublingMethod(y_sum=y_sum, N=N)
input_data_out = InputData(
    train_features_out,
    train_labels_out,
    test_features_out,
    test_labels_out,
    y_target_train,
    y_target_test,
)
yn_train, yn_test = idm_out.apply(svr, input_data_out)

# get_errors yields (name, value) pairs; only the values are kept.
train_erros = [
    metric_value for _metric_name, metric_value in get_errors(y_target_train, yn_train)
]
test_errors = [
    metric_value for _metric_name, metric_value in get_errors(y_target_test, yn_test)
]

output_errors_train[f"З виходом SVR - {N_CLUSTERS}"] = train_erros
output_errors_test[f"З виходом SVR - {N_CLUSTERS}"] = test_errors

--- 2.516742467880249 seconds ---


In [384]:
# Write the collected train / test error tables to CSV; the file names carry
# the dataset name and the number of clusters used for this run.
# NOTE(review): presumably the `results/` directory must already exist — confirm
# against InputDoublingMethod.save_errors_to_csv.
InputDoublingMethod.save_errors_to_csv(output_errors_train, f"results/{dataset}_errors_train_n{N_CLUSTERS}.csv")
InputDoublingMethod.save_errors_to_csv(output_errors_test, f"results/{dataset}_errors_test_n{N_CLUSTERS}.csv")