# Pruning the metrics 

###### Imports 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import FactorAnalysis, PCA
from sklearn.cluster import KMeans, MeanShift
import collections
%matplotlib inline
plt.rcParams['figure.figsize'] = [9, 9]

## Utilities 

### Helper class to approximate optimal K

In [2]:
class DetK():

    """DetK:
    Approximates the optimal number of clusters (K) with the evaluation
    function f(K) described in the reference below; the K with the smallest
    f(K) is reported as optimal.
    References
    ----------
    https://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
    Attributes
    ----------
    optimal_num_clusters_ : int
                            An estimation of the optimal number of clusters K for
                            KMeans models fit to X
    clusters_ : array, [n_clusters]
                The sizes of the clusters
    fs_ : array, [n_clusters]
          The computed evaluation functions F(K) for each cluster size K
    """

    def __init__(self):
        self.optimal_num_clusters_ = None
        self.clusters_ = None
        self.fs_ = None

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.optimal_num_clusters_ = None
        self.clusters_ = None
        self.fs_ = None

    @staticmethod
    def _alpha(K, nd):
        """Returns the weight alpha(K, nd) for K >= 2.

        alpha(2, nd) = 1 - 3 / (4 * nd) and, for K > 2,
        alpha(K, nd) = alpha(K - 1, nd) + (1 - alpha(K - 1, nd)) / 6.
        Computed from scratch so it does not depend on which values of K
        happen to be present in the cluster map.
        """
        alpha = 1 - 3.0 / (4 * nd)
        for _ in range(K - 2):
            alpha += (1 - alpha) / 6.0
        return alpha

    def fit(self, X, cluster_map):
        """Estimates the optimal number of clusters (K) for a
           KMeans model trained on X.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data. Only ``X.shape[1]`` (the number of features)
            is read.
        cluster_map : dict
                      A dictionary mapping each cluster size (K) to the KMeans
                      model fitted to X with K clusters. Only the
                      ``inertia_`` attribute of each model is read.
        Returns
        -------
        self
        Raises
        ------
        ValueError
            If cluster_map is empty.
        """
        self._reset()
        if not cluster_map:
            raise ValueError(
                "cluster_map must contain at least one fitted model")
        nd = X.shape[1]
        ordered = sorted(cluster_map.items())
        n_clusters = len(ordered)
        fs = np.empty(n_clusters)
        sks = np.empty(n_clusters)
        # Walk the models in increasing K; each f(K) compares the distortion
        # of this model with that of the previous entry in the map.
        for i, (K, model) in enumerate(ordered):
            sks[i] = model.inertia_

            if K == 1 or i == 0:
                # No predecessor to compare against (previously this read
                # uninitialised memory through sks[-1] when K started above 1).
                fs[i] = 1
            elif sks[i - 1] == 0:
                # Previous distortion is already zero; avoid dividing by it.
                fs[i] = 1
            else:
                fs[i] = sks[i] / (self._alpha(K, nd) * sks[i - 1])
        self.clusters_ = np.array([K for K, _ in ordered])
        self.optimal_num_clusters_ = self.clusters_[np.argmin(fs)]
        self.fs_ = fs
        return self

### Obtain Cluster Map Needed for DetK 

In [3]:
def get_cluster_map(X, min_cluster, max_cluster):
    """Fit one KMeans model per cluster count and return them keyed by K.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``KMeans.fit``.
    min_cluster : int
        Smallest number of clusters to fit (inclusive).
    max_cluster : int
        Largest number of clusters to fit (inclusive).

    Returns
    -------
    dict
        Maps each K in ``range(min_cluster, max_cluster + 1)`` to the KMeans
        model fitted with ``n_clusters=K``. Empty when
        ``min_cluster > max_cluster``.
    """
    # KMeans.fit returns the fitted estimator itself, never None, so the old
    # "tmp is None" fallback (which also referenced the undefined names `K`
    # and `self`) could never run and has been removed.
    return {
        k: KMeans(n_clusters=k).fit(X)
        for k in range(min_cluster, max_cluster + 1)
    }

#### Read the CSV 

In [4]:
# Load the offline workload table.
off = pd.read_csv('offline_workload.CSV')
# Drop the first 14 columns and transpose, so that every remaining column of
# the CSV becomes one row of off_metrics.
# A plain slice replaces the old `off.columns[[range(14)]]`, which wrapped a
# range in a list (deprecated multidimensional indexing that emits a warning
# or fails, depending on the NumPy/pandas version).
off_metrics = off.drop(off.columns[:14], axis=1).transpose()

  result = getitem(key)


#### Perform Factor Analysis with (n=2) components

In [5]:
# Fit a two-component factor-analysis model on the transposed metric table
# and keep the transformed coordinates (one row of `metrics` per row of
# off_metrics, two columns).
fa = FactorAnalysis(n_components = 2)
metrics = fa.fit_transform(off_metrics)


In [None]:
# Pull the fitted model's covariance estimate and its component matrix for
# inspection; neither variable is read again in the cells shown here.
cov_matrix = fa.get_covariance()
comp = fa.components_


#### Obtain Cluster map for the metrics 

In [None]:
# Fit one KMeans model for every K from 1 to 10 (inclusive) on the
# factor-analysis coordinates.
cluster_map = get_cluster_map(metrics, 1, 10)

#### Calculate the optimum value of K 

In [None]:
# Score every fitted model and record the K with the smallest evaluation
# value in opt_k.optimal_num_clusters_.
opt_k = DetK()
opt_k.fit(metrics, cluster_map)

In [None]:
# Number of clusters chosen by DetK; reused below as n_clusters for KMeans.
k = opt_k.optimal_num_clusters_
print(f"# Optimal Clusters for K Means = {k}")

#### Fit KMeans on the metrics 

- **centroids :** Cluster Centroids
- **y_km      :** Cluster label of each point
- **label_set      :** The set of labels

In [None]:
# Fit a fresh KMeans model with the chosen k. No random_state is passed, so
# the clustering may differ between runs.
kmeans = KMeans(n_clusters=k).fit(metrics)
labs = kmeans.labels_
centroids = kmeans.cluster_centers_
# Labels obtained by predicting on the training data itself.
# NOTE(review): presumably identical to kmeans.labels_ — confirm.
y_km = kmeans.predict(metrics)
# Show how many points fell into each cluster.
print(collections.Counter(labs))

# Distinct cluster labels, handed to plot_cluster below.
label_set = list(set(labs))

#### Plot the clustered points 

In [None]:
###################################
#          TRY ME
###################################

def plot_cluster(X,Y,all_labels,cluster_centers_):
    """Scatter-plot 2-D points coloured by cluster, plus the cluster centres.

    Parameters
    ----------
    X : array, shape (n_samples, >=2)
        Point coordinates; columns 0 and 1 are plotted.
    Y : array, shape (n_samples,)
        Cluster label of each row of X, used to select the points of a cluster.
    all_labels : iterable
        Labels to draw (duplicates are ignored). The label -1 is treated as
        "outlier" and skipped.
    cluster_centers_ : array, shape (n_clusters, >=2)
        Centre coordinates; columns 0 and 1 are plotted as red stars.
    """
    palette = ['lightgreen', 'orange', 'lightblue', 'pink', 'yellow']

    # Iterate in sorted order and key both colour and legend text on the label
    # value itself, so the result does not depend on set iteration order.
    for l in sorted(set(all_labels)):
        if l == -1:  # outlier
            continue
        members = Y == l
        plt.scatter(
            X[members, 0], X[members, 1],
            s=100,
            # Cycle through the palette so more than five clusters no longer
            # raise KeyError.
            c=palette[l % len(palette)],
            edgecolor='black',
            label=f'cluster {l + 1}'
        )

    # plot the centroids
    plt.scatter(
        cluster_centers_[:, 0], cluster_centers_[:, 1],
        s=75, marker='*',
        c='red', edgecolor='black',
        label='centroids'
    )

    plt.legend(scatterpoints=1)
    plt.grid()
    plt.show()

In [None]:
###################################
#          TRY ME
###################################

# Draw the KMeans clustering: points coloured by y_km, centroids as stars.
plot_cluster(metrics,y_km,label_set,kmeans.cluster_centers_)

In [None]:
# Hand-written version of the cluster plot: draws exactly the clusters
# labelled 0, 1 and 2, one colour each, and then the centroids.
for cluster_id, colour in enumerate(['lightgreen', 'orange', 'lightblue']):
    in_cluster = y_km == cluster_id
    plt.scatter(
        metrics[in_cluster, 0], metrics[in_cluster, 1],
        s=100, c=colour,
        edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# plot the centroids
centres = kmeans.cluster_centers_
plt.scatter(
    centres[:, 0], centres[:, 1],
    s=75, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
plt.legend(scatterpoints=1)
plt.grid()
plt.show()

#### Calculate the closest point to centroid in each cluster 

In [None]:
# Group the row positions of `metrics` by the cluster label they received:
# inds[label] is the list of positions assigned to that label.
inds = collections.defaultdict(list)

for position, cluster in enumerate(labs):
    inds[cluster].append(position)

In [None]:
# For every cluster, find the member point nearest to the cluster centroid
# and remember its row position in `metrics`.
closests = []

for i, centroid in enumerate(centroids):
    inds_i = inds[i]
    # Squared Euclidean distance of each member to the centroid. The sum over
    # axis=1 is essential: without it np.argmin runs over the flattened
    # (n_members, n_features) array and returns an index that does not
    # correspond to a member (wrong point, or IndexError).
    dists = np.sum((metrics[inds_i] - centroid) ** 2, axis=1)
    closests.append(inds_i[np.argmin(dists)])

In [None]:
# Row positions (into `metrics`) of the representative point of each cluster.
print(closests)

#### Select pruned metrics 

In [None]:

# Columns to retain in the pruned table: the first 14 columns of the CSV plus
# the representative metric of each cluster. Despite its name, this list is
# used further down to decide which columns are kept, not dropped.
drop_list = list(off.columns[:14]) + [off_metrics.index[i] for i in closests]
print(drop_list)

#### Save to file 

In [None]:
# off.columns.difference(drop_list) is every column NOT named in drop_list;
# dropping those leaves only the columns listed in drop_list.
pruned = off.drop(off.columns.difference(drop_list), axis=1)

In [None]:
# Write the KMeans-pruned table to disk. to_csv is called with its defaults,
# so the DataFrame index is written as an extra first column.
pruned.to_csv('kmeans_pruned.CSV')

## Extension: Pruning Using Mean Shift Clustering 

#### Clustering 

In [None]:
# Cluster the same coordinates with Mean Shift. From here on labs, y_km,
# centroids and label_set refer to the Mean Shift result and overwrite the
# KMeans values of the same names.
# NOTE(review): with cluster_all=False scikit-learn is documented to label
# orphan points -1 in labels_, whereas predict() assigns the nearest centre;
# labs and y_km may therefore disagree for orphans — confirm.
ms = MeanShift(cluster_all = False).fit(metrics)
labs = ms.labels_
y_km = ms.predict(metrics)
centroids = ms.cluster_centers_

label_set = list(set(labs))

#### Plot Clusters 

In [None]:
# Draw the Mean Shift clustering. The full label array `labs` is passed as
# all_labels here; plot_cluster de-duplicates it with set().
plot_cluster(metrics,y_km,labs,ms.cluster_centers_)

#### Calculate the closest point to centroid in each cluster 

In [None]:
# Group the row positions of `metrics` by their Mean Shift label:
# inds[label] is the list of positions that received that label.
inds = collections.defaultdict(list)
for position, label in enumerate(labs):
    inds[label].append(position)


In [None]:
# For every Mean Shift centre, find the member point nearest to it and
# remember its row position in `metrics`. Only labels 0..len(centroids)-1 are
# visited, so any points grouped under label -1 are ignored.
closests = []
for i, centroid in enumerate(centroids):
    inds_i = inds[i]
    # Squared Euclidean distance of each member to the centre. The sum over
    # axis=1 is essential: without it np.argmin runs over the flattened
    # (n_members, n_features) array and returns an index that does not
    # correspond to a member (wrong point, or IndexError).
    dists = np.sum((metrics[inds_i] - centroid) ** 2, axis=1)
    closests.append(inds_i[np.argmin(dists)])

In [None]:
# Row positions (into `metrics`) of the representative point of each cluster.
print(closests)

#### Select Pruned Metrics 

In [None]:
# Columns to retain in the pruned table: the first 14 columns of the CSV plus
# the representative metric of each cluster, taken in ascending row position.
# Despite its name, this list decides which columns are kept, not dropped.
drop_list = list(off.columns[:14]) + [off_metrics.index[i] for i in sorted(closests)]
print(drop_list)

In [None]:
# off.columns.difference(drop_list) is every column NOT named in drop_list;
# dropping those leaves only the columns listed in drop_list.
pruned = off.drop(off.columns.difference(drop_list), axis=1)

#### Save to File 

In [None]:
# Write the Mean-Shift-pruned table to disk. to_csv is called with its
# defaults, so the DataFrame index is written as an extra first column.
pruned.to_csv('mean_shift_pruned.CSV')