In [1]:
# Raw datasets as loaded from the .rda files, keyed by dataset name.
dfs = {}
# The data actually fed to the experiments (for some datasets a column subset of the raw frame).
preprocessed = {}

In [2]:
import rdata

# Load the Tarragona dataset from its R data file.
# The Windows path is a raw string so that the backslashes are not parsed as
# (invalid) escape sequences; the resulting path is unchanged.
converted = rdata.read_rda(r"E:\projects\microagg1d\datasets\Tarragona.rda")
dfs["Tarragona"] = converted["Tarragona"]
# All columns of this dataset are used, so the same object is registered for the experiments.
preprocessed["Tarragona"] = converted["Tarragona"]

In [3]:
# Load the EIA dataset from its R data file.
# The Windows path is a raw string so that the backslashes are not parsed as
# (invalid) escape sequences; the resulting path is unchanged.
converted = rdata.read_rda(r"E:\projects\microagg1d\datasets\EIA.rda")
dfs["eia"] = converted["EIA"]
# Only these revenue/sales columns are used in the experiments.
preprocessed["eia"] = dfs["eia"][
    [
        'RESREVENUE', 'RESSALES',
        'COMREVENUE', 'COMSALES',
        'INDREVENUE', 'INDSALES',
        'OTHREVENUE', 'OTHRSALES',
        'TOTREVENUE', 'TOTSALES',
    ]
]

In [4]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [5]:
from microagg1d import univariate_microaggregation
from microagg1d.common import compute_cluster_cost_sorted
from microagg1d.cost import compute_cost
from microagg1d.main import undo_argsort
import numpy as np

In [6]:
def square_sum(x):
    """Return the sum of the squared entries of ``x``."""
    squared = np.square(x)
    return np.sum(squared)

def absolute_sum(x):
    """Return the sum of the absolute values of the entries of ``x``."""
    magnitudes = np.absolute(x)
    return np.sum(magnitudes)

def multi_cost(df_in, clusters, cost_name="sse", preprocess=True):
    """Total within-cluster cost of a clustering of multivariate data.

    For every cluster label ``0 .. clusters.max()`` the rows carrying that
    label are compared against the cluster centroid and the deviations are
    accumulated over all clusters.

    Parameters
    ----------
    df_in : array-like, 2-D
        One row per record. Anything ``np.asarray`` can turn into a 2-D
        array is accepted.
    clusters : array-like of int
        Cluster label of every row of ``df_in``.
    cost_name : {"sse", "sae"}
        ``"sse"``: centroid is the column-wise mean, cost is the sum of
        squared deviations. ``"sae"``: centroid is the column-wise median,
        cost is the sum of absolute deviations.
    preprocess : bool
        Forwarded to ``do_preprocessing`` before the cost is computed.

    Returns
    -------
    The summed cost over all clusters (``0`` when ``clusters`` is empty).

    Raises
    ------
    KeyError
        If ``cost_name`` is neither ``"sse"`` nor ``"sae"``.
    """
    # np.asarray makes the boolean row selection below work for lists and
    # DataFrames as well, not only for ndarrays.
    df = np.asarray(do_preprocessing(df_in, preprocess))
    clusters = np.asarray(clusters)
    average = {
        "sse": np.mean,
        "sae": np.median,
    }[cost_name]
    cost_func = {
        "sse": square_sum,
        "sae": absolute_sum,
    }[cost_name]
    if clusters.size == 0:
        # No records: nothing to aggregate (clusters.max() would raise).
        return 0
    num_clusters = clusters.max() + 1
    total_cost = 0
    for i in range(num_clusters):
        select = clusters == i
        if not select.any():
            # Unused label id: contributes nothing, and averaging an empty
            # slice would only emit NaN plus a RuntimeWarning.
            continue
        df_select = df[select, :]
        centroid = average(df_select, axis=0)
        total_cost += cost_func(df_select - centroid)
    return total_cost

In [7]:
def do_preprocessing(df_in, preprocess):
    """Optionally run the data through scikit-learn's ``StandardScaler``.

    When ``preprocess`` is falsy the input object is handed back untouched;
    otherwise the result of ``StandardScaler().fit_transform(df_in)`` is
    returned.
    """
    if not preprocess:
        return df_in
    scaler = preprocessing.StandardScaler()
    return scaler.fit_transform(df_in)

In [8]:
def compute_multi_pca(df_in, k, cost_name="sse", preprocess=True):
    """Microaggregate multivariate data along its first principal component.

    The (optionally preprocessed) data is projected onto the first PCA
    component, the projected values are sorted and handed to
    ``univariate_microaggregation``, and the resulting labels are mapped back
    through ``undo_argsort``.

    Returns
    -------
    tuple
        ``(clusters, vec, order)``: the labels after ``undo_argsort``, the
        first principal component, and the argsort of the projection.
    """
    data = do_preprocessing(df_in, preprocess)
    first_component = PCA(n_components=1).fit(data).components_[0, :]
    projection = data @ first_component
    order = np.argsort(projection)
    # The 1-D solver is fed the projection in ascending order as float64.
    sorted_projection = np.array(projection[order], dtype=np.float64)
    sorted_labels = univariate_microaggregation(sorted_projection, cost=cost_name, k=k)
    return undo_argsort(sorted_labels, order), first_component, order

In [9]:
def compute_multi_random(df_in, k, num_tries=10, cost_name="sse", seed=None, preprocess=True):
    """Microaggregate along random projections and keep the cheapest result.

    ``num_tries`` direction vectors are drawn with ``np.random.rand``. For
    each one the data is projected, sorted, clustered with
    ``univariate_microaggregation`` and scored with ``multi_cost`` on the
    (optionally preprocessed) multivariate data.

    Parameters
    ----------
    df_in : 2-D data, one row per record.
    k : int
        Forwarded to ``univariate_microaggregation`` as ``k``.
    num_tries : int
        Number of random projection vectors to evaluate.
    cost_name : str
        Cost identifier forwarded to the solver and to ``multi_cost``.
    seed : int or None
        If given, the global NumPy RNG is re-seeded with it first.
    preprocess : bool
        Forwarded to ``do_preprocessing``.

    Returns
    -------
    tuple
        ``(best_clusters, best_vec, best_order, best_cost)`` of the cheapest
        try: its labels after ``undo_argsort``, its projection vector, the
        argsort of its projection (same convention as ``compute_multi_pca``)
        and its cost. If no try improves on ``inf`` (e.g. ``num_tries == 0``)
        the first three entries are ``None`` and the cost is ``inf``.
    """
    df = do_preprocessing(df_in, preprocess)
    if seed is not None:
        np.random.seed(seed)
    best_clusters = None
    best_vec = None
    best_order = None
    best_cost = np.inf
    for _ in range(num_tries):
        vec = np.random.rand(df.shape[1])
        arr = df @ vec
        order = np.argsort(arr)
        arr = np.array(arr[order], dtype=np.float64)
        clusters = univariate_microaggregation(arr, cost=cost_name, k=k)
        clusters = undo_argsort(clusters, order)
        # df is already preprocessed at this point, so do not scale it again.
        cost = multi_cost(df, clusters, cost_name=cost_name, preprocess=False)
        if cost < best_cost:
            best_clusters = clusters
            best_cost = cost
            # Keep the projection vector and sort order of the winning try.
            # Previously best_vec was never assigned and best_order was set to
            # undo_argsort(order, order) rather than the sort order itself.
            best_vec = vec
            best_order = order

    return best_clusters, best_vec, best_order, best_cost

In [10]:
def k_means_heuristic(df_in, k, n_clusters, seed=None, preprocess=True):
    """Cluster with k-means, then merge undersized groups until all have >= k members.

    After fitting ``KMeans`` with ``n_clusters`` clusters, every group with
    fewer than ``k`` members is merged into the non-empty group whose
    centroid is closest (squared Euclidean distance); the receiving centroid
    becomes the size-weighted average of the two. Groups are visited in a
    random order drawn from the global NumPy RNG.

    Parameters
    ----------
    df_in : 2-D data, one row per record.
    k : int
        Minimum number of members every returned group must have.
    n_clusters : int
        Number of clusters requested from ``KMeans``.
    seed : int or None
        If given, seeds the global NumPy RNG and is passed to ``KMeans`` as
        ``random_state``.
    preprocess : bool
        Forwarded to ``do_preprocessing``.

    Returns
    -------
    numpy.ndarray
        One label per record, renumbered to ``0 .. n_groups - 1``.

    Raises
    ------
    ValueError
        If an undersized group has no other non-empty group to merge into
        (i.e. fewer than ``k`` records in total), or if the internal size
        bookkeeping becomes inconsistent.
    """
    if seed is not None:
        np.random.seed(seed)
    df = do_preprocessing(df_in, preprocess)
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(df)
    all_labels = set(kmeans.labels_)
    labels = kmeans.labels_.copy()
    centroids = kmeans.cluster_centers_.copy()
    while True:
        # minlength keeps group_sizes indexable by every centroid row, even
        # when the highest-numbered groups have no members.
        group_sizes = np.bincount(labels, minlength=len(centroids))
        labels_to_iter = np.random.permutation(
            np.fromiter(all_labels, count=len(all_labels), dtype=np.int64)
        )
        if np.all(group_sizes[labels_to_iter] >= k):
            break

        for group_label in labels_to_iter:
            # Read the size at visit time: earlier merges may have grown this group.
            group_size = group_sizes[group_label]
            if group_size >= k:
                continue

            # Find the closest other group that still has members.
            this_centroid = centroids[group_label]
            min_dist = np.inf
            best_neigh = None
            for i, centroid in enumerate(centroids):
                if i == group_label or group_sizes[i] == 0:
                    continue
                distance = np.sum(np.square(centroid - this_centroid))
                if distance < min_dist:
                    best_neigh = i
                    min_dist = distance
            if best_neigh is None:
                raise ValueError(
                    f"cannot form groups of at least k={k} records: group {group_label} "
                    f"has {group_size} member(s) and there is no other non-empty group "
                    "to merge it into"
                )

            # The merged centroid is the size-weighted mean of both centroids.
            centroids[best_neigh] = (
                centroids[best_neigh] * group_sizes[best_neigh]
                + centroids[group_label] * group_sizes[group_label]
            ) / (group_sizes[best_neigh] + group_sizes[group_label])

            labels[labels == group_label] = best_neigh
            group_sizes[best_neigh] += group_sizes[group_label]
            group_sizes[group_label] = 0
            all_labels.remove(group_label)

            # Sanity check: the incrementally updated sizes must match a recount.
            recount = np.bincount(labels, minlength=len(group_sizes))
            if not np.array_equal(group_sizes, recount):
                raise ValueError(
                    "inconsistent group sizes after merge: "
                    f"tracked={group_sizes}, recounted={recount}"
                )

    # Renumber the surviving labels to a contiguous range starting at 0.
    d = {l: i for i, l in enumerate(all_labels)}
    final_labels = labels.copy()
    for i, l in enumerate(labels):
        final_labels[i] = d[l]
    return final_labels

In [11]:
from sklearn.cluster import KMeans

In [12]:
# Cost used by the experiments below; multi_cost accepts "sse" or "sae".
cost_name = "sse"

In [13]:
from collections import defaultdict
# NOTE(review): not referenced by any other cell visible here; the PCA results
# are stored under results["pca"] instead. Confirm whether this is still needed.
results_pca = defaultdict(list)

In [14]:
# Minimum group sizes k evaluated for every dataset and method.
the_ks = [2,3,4,5,7,10,20,30,50]
# Number of repetitions per (dataset, k) for the randomised methods.
num_repeats=10
# Values passed as num_tries to compute_multi_random (random projections per run).
list_num_tries = [10, 50, 100]

In [15]:
# Maps method label -> {dataset name -> list of per-k result tuples}.
results = {}

In [17]:
# PCA baseline: one deterministic run per (dataset, k); stores (k, cost).
results["pca"] = defaultdict(list)
for name, df in preprocessed.items():
    # Standardise once here; the calls below are told not to scale again.
    df = preprocessing.StandardScaler().fit_transform(df)
    for k in the_ks:
        # Forward cost_name so the clustering optimises the same cost that is reported.
        clusters, _vec, _order = compute_multi_pca(df, k=k, cost_name=cost_name, preprocess=False)
        cost = multi_cost(df, clusters, cost_name=cost_name, preprocess=False)

        results["pca"][name].append((k, cost))

In [18]:
from tqdm.notebook import tqdm

In [None]:
# Seed the global NumPy RNG once so the random projections drawn below are reproducible.
np.random.seed(0)

# Random-projection baseline: for each num_tries setting, dataset and k, repeat
# the search num_repeats times and record the mean and spread of the best cost.
for num_tries in tqdm(list_num_tries):
    label = f"random_{num_tries}"
    results[label] = defaultdict(list)
    for name, df in preprocessed.items():
        # Standardise once here; compute_multi_random is called with preprocess=False.
        df = preprocessing.StandardScaler().fit_transform(df)
        for k in tqdm(the_ks, leave=False):
            costs = []
            # NOTE(review): `seed` is only a repeat counter; it is not passed to
            # compute_multi_random, so runs differ through the advancing global RNG state.
            for seed in tqdm(range(num_repeats), leave=False):
                clusters, arr, order, cost = compute_multi_random(df, k=k, num_tries=num_tries, cost_name=cost_name, preprocess=False)
                costs.append(cost)
            # Store (k, mean cost, standard deviation of cost) over the repeats.
            results[label][name].append((k, np.mean(costs), np.std(costs)))

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Seed the global NumPy RNG once so the merge order (and KMeans, which is
# called without an explicit random_state here) is reproducible.
np.random.seed(0)

# k-means + merge heuristic: repeat num_repeats times per (dataset, k) and
# record (k, mean cost, standard deviation of cost).
label = "merged"
results[label] = defaultdict(list)
for name, df in preprocessed.items():
    # Standardise once here; the calls below are told not to scale again.
    df = preprocessing.StandardScaler().fit_transform(df)
    # Use the shared list of k values so all methods are evaluated on the same grid.
    for k in tqdm(the_ks, leave=False):
        costs = []
        for repeat in tqdm(range(num_repeats), leave=False):
            # Start from as many clusters as groups of size k could fit.
            clusters = k_means_heuristic(df, k=k, n_clusters=len(df) // k, preprocess=False)
            cost = multi_cost(df, clusters, cost_name=cost_name, preprocess=False)
            costs.append(cost)
        results[label][name].append((k, np.mean(costs), np.std(costs)))

In [None]:
# Print each dataset's name together with len(df) (its number of records).
for name, df in preprocessed.items():
    print(name, len(df))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One figure per dataset: cost versus minimum group size k, one curve per method.
for dataset in ["eia", "Tarragona"]:
    plt.figure()
    for method in ["pca", "random_10", "random_50", "random_100", "merged"]:
        res = results[method][dataset]
        arr = np.array(res)
        # Columns: k, (mean) cost and, for the randomised methods, the std of the cost.
        x, y = arr[:, 0], arr[:, 1]
        if arr.shape[1] == 3:
            yerr = arr[:, 2]
            plt.errorbar(x, y, yerr=yerr, label=method)
        else:
            plt.plot(x, y, label=method)
    plt.legend()
    plt.xlabel("minimum group size k")
    plt.ylabel("Reconstruction Error")
    plt.title(dataset)