In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
import time
from sklearn.cluster import KMeans
from scipy.spatial import distance

In [49]:
from helpers import (
    collect_cluster_center_target_coordinates,
    insert_column,
    display_errors,
    flat_errors
)


In [50]:
def preprocess_features(x1, x2, reversed=False):
    """Build pairwise features and target differences for all (x1[i], x2[j]) pairs.

    Every row of ``x1`` and ``x2`` is read as a vector whose last element is
    the target value. Pairs are enumerated with ``i`` (over ``x1``) as the
    outer index and ``j`` (over ``x2``) as the inner index, so the result
    holds ``len(x1) * len(x2)`` entries.

    Parameters
    ----------
    x1, x2 : sequence of 1-D arrays
        Rows to pair up; the last element of each row is excluded from the
        features and used only for the label.
    reversed : bool, default False
        When False the pair is ordered ``(x1[i], x2[j])``; when True it is
        ordered ``(x2[j], x1[i])``, which swaps the concatenation order and
        flips the sign of the label.

    Returns
    -------
    features : np.ndarray
        One row per pair: the leading row without its last element followed
        by the trailing row without its last element.
    labels : np.ndarray
        One value per pair: last element of the leading row minus last
        element of the trailing row.
    """
    pair_features = []
    pair_labels = []
    for i in range(len(x1)):
        for j in range(len(x2)):
            # The same ordering drives both the feature layout and the label sign.
            lead, trail = (x2[j], x1[i]) if reversed else (x1[i], x2[j])
            pair_features.append(np.concatenate((lead[:-1], trail[:-1])))
            pair_labels.append(lead[-1] - trail[-1])
    return np.array(pair_features), np.array(pair_labels)

In [51]:
def find_yn(z, y_sum, N):
    """Recover one estimate per consecutive chunk of ``N`` predicted differences.

    ``z`` is walked in consecutive slices of length ``N``; for each slice the
    result is ``(y_sum + sum(slice)) / N``. A trailing slice shorter than
    ``N`` is still divided by ``N``.

    Parameters
    ----------
    z : sequence of numbers
        Flat sequence of predicted differences.
    y_sum : number
        Constant added to every slice total before dividing.
    N : int
        Slice length and divisor.

    Returns
    -------
    np.ndarray
        One value per slice of ``z``.
    """
    estimates = []
    for start in range(0, len(z), N):
        chunk_total = sum(z[start: start + N])
        estimates.append((y_sum + chunk_total) / N)
    return np.array(estimates)

In [52]:
# Relative locations of the train and test splits read by the loading cell.
train_dataset_path, test_dataset_path = (
    "./datasets/body_fat_train.txt",
    "./datasets/body_fat_test.txt",
)

In [53]:
# Both splits are parsed as comma-delimited numeric text, train first, then test.
train_data, test_data = (
    np.loadtxt(path, delimiter=',')
    for path in (train_dataset_path, test_dataset_path)
)

In [56]:
# All columns but the last are inputs; the last column is kept as a
# single-column 2-D slice (`-1:`) rather than a flat vector.
X_train = train_data[:, :-1]
y_train = train_data[:, -1:]
X_test = test_data[:, :-1]
y_test = test_data[:, -1:]

In [57]:
# Train rows followed by test rows, with every column (including the last) kept.
X = np.concatenate([train_data, test_data], axis=0)

In [58]:
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

def sscore(k, X):
    """Return the Euclidean silhouette score of a ``k``-cluster KMeans fit on ``X``.

    Parameters
    ----------
    k : int
        Number of clusters passed to ``KMeans`` as ``n_clusters``.
    X : array-like
        Samples that are clustered and then scored.

    Returns
    -------
    float
        ``silhouette_score`` of ``X`` under the fitted cluster labels.
    """
    model = KMeans(n_clusters=k, random_state=42, n_init='auto')
    cluster_labels = model.fit(X).labels_
    return silhouette_score(X, cluster_labels, metric='euclidean')
    
k_min, k_max = 1, 12
ss = []
# range() stops before k_max, so k runs from k_min up to k_max - 1.
for k in range(k_min, k_max):
    # sscore is not called for k == 1; a constant 1 is recorded in its place.
    ss.append(sscore(k, X) if k > 1 else 1)

# fig, ax = plt.subplots(figsize=(12, 4))  # Create a single plot

# kx = range(k_min, k_max)

# ax.set_ylim((0, 1.05))
# ax.set_xticks(kx)
# ax.axhline(y=0.8, color='r', linestyle='-')
# ax.plot(kx, ss, c='green')
# ax.scatter(kx, ss, c='green', marker='o')
# ax.set_xlabel('X Values')
# ax.set_ylabel('Y Values')
# ax.set_title('Combined Line and Scatter Plot')
# ax.legend()

# plt.show()

In [59]:
n_clusters = 4
# Without the output: the clusters are fitted on the input features only (X_train).
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
kmeans.fit(X_train)

In [60]:
def calculate_cluster_centers(k, data, best_labels):
    """Average the entries of ``data`` within each cluster.

    Parameters
    ----------
    k : int
        Number of clusters; the result has one entry for each id in
        ``range(k)``.
    data : sequence
        Values to average; ``data[i]`` belongs to the sample whose cluster id
        is ``best_labels[i]``.
    best_labels : sequence of int
        Cluster id of every sample; each id must lie in ``range(k)``.

    Returns
    -------
    dict
        Maps every cluster id in ``range(k)`` to the mean of its members'
        values. A cluster without members maps to ``nan``.

    Raises
    ------
    KeyError
        If a label falls outside ``range(k)``.
    """
    sums = {cluster_id: 0 for cluster_id in range(k)}
    # Counts are keyed by cluster id (not by position in a list of the labels
    # that happen to occur), so an empty cluster cannot shift the divisor used
    # for the other clusters.
    counts = {cluster_id: 0 for cluster_id in range(k)}

    for index, label in enumerate(best_labels):
        sums[label] += data[index]
        counts[label] += 1

    return {
        cluster_id: (sums[cluster_id] / counts[cluster_id]) if counts[cluster_id] else float("nan")
        for cluster_id in range(k)
    }

        
# Per-cluster mean of the training target, keyed by KMeans cluster id.
cluster_centers_y = calculate_cluster_centers(
    n_clusters,
    y_train.flatten(),
    kmeans.labels_,
)
# cluster_centers = np.hstack((kmeans_train.cluster_centers_,np.array(list(cluster_centers_y.values())).reshape(-1,1)))
# One value per training sample: the mean target of the cluster it was assigned to.
new_y_train = [cluster_centers_y[cluster_id] for cluster_id in kmeans.labels_]

In [61]:
def euclidean_distance(vector1, vector2):
    """Return the norm (``np.linalg.norm``) of ``vector1 - vector2``.

    Both arguments must support elementwise subtraction (e.g. NumPy arrays).
    """
    difference = vector1 - vector2
    return np.linalg.norm(difference)

def find_closest_cluster(vector, cluster_centers):
    """Return the index of the cluster center nearest to ``vector``.

    Parameters
    ----------
    vector : np.ndarray
        Sample to assign.
    cluster_centers : iterable of np.ndarray
        Candidate centers, each subtractable from ``vector``.

    Returns
    -------
    int
        Position (in iteration order) of the center with the smallest
        Euclidean distance to ``vector``. On ties the earliest center wins.

    Raises
    ------
    ValueError
        If ``cluster_centers`` is empty or no center has a distance that
        compares below infinity (e.g. all distances are NaN). Previously
        ``float("inf")`` was returned in these cases and surfaced later as an
        invalid index.
    """
    closest_index = None
    closest_distance = float("inf")

    for index, center in enumerate(cluster_centers):
        # Same computation as euclidean_distance(); the local is not named
        # `distance`, so it does not shadow the imported scipy.spatial.distance.
        current_distance = np.linalg.norm(vector - center)

        # Strict comparison keeps the first center when distances are equal.
        if current_distance < closest_distance:
            closest_distance = current_distance
            closest_index = index

    if closest_index is None:
        raise ValueError(
            "cannot pick a closest cluster: no cluster center with a finite distance"
        )

    return closest_index

# Assign every test sample to the nearest center of the KMeans model fitted on
# the training features, then look up that cluster's mean training target.
test_labels = [
    find_closest_cluster(sample, kmeans.cluster_centers_) for sample in X_test
]
new_y_test = [cluster_centers_y[cluster_id] for cluster_id in test_labels]

In [62]:
# Row layout after enrichment: original inputs | cluster-mean target | true target.
enriched_train_data = np.concatenate(
    (
        train_data[:, :-1],
        np.array(new_y_train).reshape(-1, 1),
        train_data[:, -1:],
    ),
    axis=1,
)
enriched_test_data = np.concatenate(
    (
        test_data[:, :-1],
        np.array(new_y_test).reshape(-1, 1),
        test_data[:, -1:],
    ),
    axis=1,
)

In [63]:
# Target column of the training data (it is the last column here), as a vector:
# 20.5, 13.3, 19.6, 24.4 ...
y_target_train = np.array(enriched_train_data[:, -1])
# Target column of the test data, as a vector.
y_target_test = np.array(enriched_test_data[:, -1])
# Sum of the training target column and the number of training rows.
y_sum = sum(y_target_train)
N = len(enriched_train_data)

In [64]:
# Training pairs: every training row against every training row.
(train_features, train_labels) = preprocess_features(
    x1=enriched_train_data, x2=enriched_train_data
)
# Test pairs: every test row against every training row.
(test_features, test_labels) = preprocess_features(
    x1=enriched_test_data, x2=enriched_train_data
)

In [65]:
# The scaler is fitted on the training pairs only and then applied to both sets.
scaler = MaxAbsScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Without the output: clusters fitted on the input features only

In [66]:
from sklearn.svm import SVR

# RBF-kernel SVR trained to predict the pairwise target differences.
svr = SVR(kernel="rbf", gamma="scale", coef0=0.0, epsilon=0.001, max_iter=-1)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is the wall clock and can jump or tick coarsely.
start_time = time.perf_counter()
svr.fit(train_features, train_labels)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# Predicted differences for the train/train and test/train pairs.
train_pred_z = svr.predict(train_features)

pred_z = svr.predict(test_features)

# Apply the doubled-outputs method: every run of N predicted differences is
# folded back into one target estimate per sample.
yn_train = find_yn(train_pred_z, y_sum, N)
yn_test = find_yn(pred_z, y_sum, N)

train_errors = flat_errors(y_target_train, yn_train)
test_errors = flat_errors(y_target_test, yn_test)

print("Training erros func:")
display_errors(train_errors)
print("Testing errors func:")
display_errors(test_errors)

--- 0.006999492645263672 seconds ---
Training erros func:

MAPE: 0.08056362977666354
RMSE: 1.913441155032006
MAE: 1.2914140762029118
Max error: 6.035184725081873
Median absolute error: 0.9445928905687477
Mean Squared error: 3.661257053770217
R2: 0.8712879818862524

Testing errors func:

MAPE: 0.0867173091430487
RMSE: 1.5639869312087828
MAE: 1.1864605266629826
Max error: 2.919693218260079
Median absolute error: 1.089719028040541
Mean Squared error: 2.446055120991866
R2: 0.6926744691456143



# With the output: clusters fitted on the input features together with the target column

In [67]:
# With the output: the clusters are fitted on full training rows, target column included.
kmeans_out = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
kmeans_out.fit(train_data)

In [68]:

# NOTE(review): presumably yields, per training sample, the target coordinate
# of its cluster center — confirm against helpers.collect_cluster_center_target_coordinates.
new_y_train_out = collect_cluster_center_target_coordinates(
    kmeans_out.cluster_centers_,
    kmeans_out.labels_,
)

# Drop the last coordinate of every center so the centers have the same width
# as the X_test rows they are compared with.
cluster_centers_without_y = kmeans_out.cluster_centers_[:, :-1]
test_labels_out = [
    find_closest_cluster(sample, cluster_centers_without_y) for sample in X_test
]
# Last coordinate of the center each test sample was assigned to.
new_y_test_out = [
    kmeans_out.cluster_centers_[cluster_id, -1] for cluster_id in test_labels_out
]

In [69]:
# Row layout after enrichment: original inputs | cluster-center target | true target.
enriched_train_data_out = np.concatenate(
    (
        train_data[:, :-1],
        np.array(new_y_train_out).reshape(-1, 1),
        train_data[:, -1:],
    ),
    axis=1,
)
enriched_test_data_out = np.concatenate(
    (
        test_data[:, :-1],
        np.array(new_y_test_out).reshape(-1, 1),
        test_data[:, -1:],
    ),
    axis=1,
)

In [70]:
#train_data = np.concatenate((train_data[:,:-1],ys_train,train_data[:,-1][:,None]),axis=1)
#test_data = np.concatenate((test_data[:,:-1],ys_test,test_data[:,-1][:,None]),axis=1)

In [71]:
# Augmentation step: each training row is paired with every training row by
# appending the second row's inputs to the first row's inputs.
# train_labels_out holds the pairwise differences z_1, z_2, z_3, z_4, ...
# Calling preprocess_features with reversed=True would swap the order of the
# two vectors in each pair and flip the sign of the labels; that variant is
# not used in this cell.
(train_features_out, train_labels_out) = preprocess_features(
    x1=enriched_train_data_out, x2=enriched_train_data_out
)
# Test pairs: every test row against every training row.
(test_features_out, test_labels_out) = preprocess_features(
    x1=enriched_test_data_out, x2=enriched_train_data_out
)

In [72]:
# The scaler is fitted on the training pairs only and then applied to both sets.
scaler = MaxAbsScaler().fit(train_features_out)
train_features_out = scaler.transform(train_features_out)
test_features_out = scaler.transform(test_features_out)

In [73]:
from sklearn.svm import SVR
# RBF-kernel SVR trained to predict the pairwise target differences.
svr = SVR(kernel='rbf', gamma='scale', coef0=0.0, epsilon=0.001, max_iter=-1)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is the wall clock and can jump or tick coarsely.
start_time = time.perf_counter()
svr.fit(train_features_out, train_labels_out)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# Predicted differences for the train/train and test/train pairs.
train_pred_z = svr.predict(train_features_out)

pred_z = svr.predict(test_features_out)

# Apply the doubled-outputs method: every run of N predicted differences is
# folded back into one target estimate per sample.
yn_train_out = find_yn(train_pred_z, y_sum, N)
yn_test_out = find_yn(pred_z, y_sum, N)

train_errors = flat_errors(y_target_train, yn_train_out)
test_errors = flat_errors(y_target_test, yn_test_out)


print('Training erros func:')
display_errors(train_errors)
print('Testing errors func:')
display_errors(test_errors)

--- 0.006997585296630859 seconds ---
Training erros func:

MAPE: 0.08056362977666354
RMSE: 1.913441155032006
MAE: 1.2914140762029118
Max error: 6.035184725081873
Median absolute error: 0.9445928905687477
Mean Squared error: 3.661257053770217
R2: 0.8712879818862524

Testing errors func:

MAPE: 0.0867173091430487
RMSE: 1.5639869312087828
MAE: 1.1864605266629826
Max error: 2.919693218260079
Median absolute error: 1.089719028040541
Mean Squared error: 2.446055120991866
R2: 0.6926744691456143

