# Agglomerative Clustering

In [None]:
%matplotlib notebook

In [None]:
import os
from tqdm import tqdm
import numpy as np
from sklearn import cluster
import scipy.cluster.hierarchy as sch
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

In [None]:
from extract_func import process_file

In [None]:
import json
# Read the function counter that was dumped to JSON
# (presumably function name -> count; confirm against the producer of the file).
with open('./func_counter.json', 'r') as f:
    func_counter = json.loads(f.read())

# Restrict the counter to entries whose key lives under the `sklearn.` namespace.
sklearn_counter = {
    name: count
    for name, count in func_counter.items()
    if name.startswith('sklearn.')
}

But what is the point of clustering before choosing representative functions?
Why can't we just sort all the functions and choose the short ones?

In [None]:
# Directory that holds the notebooks to analyse.
nb_path = '/projects/bdata/jupyter/target'
# os.listdir returns entries in arbitrary order; sort them so that notebook
# indices (and therefore the order in which functions are first seen when the
# occurrence vectors are built) are reproducible from run to run.
notebooks = sorted(os.listdir(nb_path))

Filter the notebooks, keeping only those that import sklearn.

In [None]:
# Collect the file names of the notebooks that reference sklearn.
sklearn_notebooks = []
for nb in tqdm(notebooks):
    with open(os.path.join(nb_path, nb), 'r') as f:
        content = f.read()
    # Substring test rather than an exact whitespace-delimited token match:
    # `from sklearn.cluster import KMeans` yields the token `sklearn.cluster`,
    # never the bare token `sklearn`, so a token match skips such notebooks.
    # Notebooks that mention sklearn without calling it only add all-zero
    # columns later on, which do not affect the co-occurrence counts.
    if 'sklearn' in content:
        sklearn_notebooks.append(nb)
    

In [None]:
# Build one binary occurrence vector per sklearn function:
# func2vector[func][i] == 1 when notebook number i uses `func`.
func2vector = {}
vector_size = len(sklearn_notebooks)
err_files = []  # notebooks that process_file could not handle
for i, nb in enumerate(sklearn_notebooks):
    if i % 10000 == 0:
        print('Log: {} notebooks processed'.format(i))
    try:
        funcs, linenos = process_file(os.path.join(nb_path, nb))
    except Exception:
        # Best effort: remember the failing notebook and treat it as empty.
        err_files.append(nb)
        funcs = []
    # Only names under the `sklearn.` namespace are of interest.
    funcs = list(filter(lambda name: name.startswith('sklearn.'), funcs))
    for func in funcs:
        vector = func2vector.get(func)
        if vector is None:
            # First sighting of this function: allocate its vector.
            vector = func2vector[func] = np.zeros(vector_size)
        vector[i] = 1

In [None]:
# Fix an ordering of the functions and keep both directions of the mapping.
idx2func = list(func2vector)
func2idx = dict(zip(idx2func, range(len(idx2func))))

# Row r of occur_matrix is the occurrence vector of idx2func[r].
vectors = list(func2vector.values())
occur_matrix = np.stack(vectors, axis=0)

# Entry (r, c) is the dot product of the occurrence vectors of functions r and c.
cooccur_matrix = occur_matrix @ occur_matrix.T

In [None]:
# Persist the co-occurrence matrix to the working directory in NumPy's .npy format.
np.save('sklearn_cooccur_mat.npy', cooccur_matrix)

In [None]:
# Average-linkage agglomerative clustering on the precomputed matrix `a`
# (built in the last cell of this notebook, so that cell has to run first).
# With n_clusters=None the merging is governed by distance_threshold instead.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` -
# confirm against the installed version.
clusterer = cluster.AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.1,
    affinity="precomputed",
    linkage="average",
)
model = clusterer.fit(a)

In [None]:
# Expand every merge recorded in model.children_ into the list of function
# names it contains. Ids below n_samples index idx2func directly; larger ids
# refer to the cluster produced by merge number (id - n_samples).
n_samples = len(idx2func)
clusters = []
for child in model.children_:
    members = []
    for branch in child:
        if branch >= n_samples:
            members.extend(clusters[branch - n_samples])
        else:
            members.append(idx2func[branch])
    clusters.append(members)

In [None]:
# Replace the in-memory `clusters` with the content of cluster.json; the
# cells below read its 'clusters' entry.
with open('./cluster.json', 'r') as f:
    clusters = json.loads(f.read())

In [None]:
# Start with no decision points and an empty "last point" marker.
decision_points, last_point = [], ''


In [None]:
def find_representatives_in_cluster(cluster, min_size=3, max_size=10):
    """Return the top-level dotted names that represent a cluster.

    The names are sorted, and a name is dropped when it is a duplicate of, or
    a dotted descendant of, the most recently kept name. For example
    "sklearn.linear_model.LogisticRegression.fit" is dropped when
    "sklearn.linear_model.LogisticRegression" has been kept.

    Descent is decided on dotted-name boundaries, so siblings that merely
    share a textual prefix ("...Lasso" and "...LassoCV") are both kept.

    Args:
        cluster: iterable of dotted function names, e.g.
            ["sklearn.linear_model.LogisticRegression",
             "sklearn.linear_model.LogisticRegression.fit"]
        min_size: clusters with fewer members yield no representatives.
        max_size: clusters with more members yield no representatives.

    Returns:
        Sorted list of representative names; empty when the cluster size is
        outside [min_size, max_size]. The input is not modified.
    """
    if not min_size <= len(cluster) <= max_size:
        return []

    representatives = []
    current = None  # most recently kept representative
    for func in sorted(cluster):
        # In sorted order every "current.xxx" name directly follows `current`,
        # so comparing against the latest representative is sufficient.
        if current is not None and (
            func == current or func.startswith(current + ".")
        ):
            continue
        current = func
        representatives.append(func)
    return representatives
            

In [None]:
# Gather the representatives of every stored cluster.
decision_points = []
for i, c in enumerate(clusters['clusters']):
    points = find_representatives_in_cluster(c)
    decision_points.extend(points)
    # Debug aid: show the cluster that yields this particular representative.
    if 'sklearn.cluster.Birch.fit' in points:
        print(i)
        print(sorted(c))
# Drop duplicates (the resulting order is unspecified).
decision_points = list(set(decision_points))

In [None]:
# Counter entries for the representatives, inserted in sorted name order;
# representatives without a counter entry are skipped.
rp_counter = {
    name: sklearn_counter[name]
    for name in sorted(decision_points)
    if name in sklearn_counter
}

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy linkage matrix from the model and forwards it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: fitted estimator exposing ``children_`` (one row per merge, as
            scikit-learn's AgglomerativeClustering does) and optionally
            ``distances_``.
        **kwargs: passed through unchanged to ``dendrogram``.

    Returns:
        The dictionary returned by ``dendrogram``.
    """
    # Children of hierarchical clustering
    children = np.asarray(model.children_)
    n_merges = children.shape[0]
    # A full merge tree over n samples has n - 1 merges.
    n_samples = n_merges + 1

    # True number of observations under each merge: a leaf counts as one,
    # an earlier merge contributes the count already computed for it.
    counts = np.zeros(n_merges)
    for i, merge in enumerate(children):
        for child in merge:
            if child < n_samples:
                counts[i] += 1
            else:
                counts[i] += counts[child - n_samples]

    # Use the real merge distances when the model recorded them; otherwise
    # fall back to evenly spaced heights so the tree can still be drawn.
    distance = getattr(model, "distances_", None)
    if distance is None:
        distance = np.arange(n_merges)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, counts]).astype(float)
    return dendrogram(linkage_matrix, **kwargs)


In [None]:
# Leftover example of fitting a fresh model; the plot below uses the `model`
# fitted earlier in this notebook instead.
# model = AgglomerativeClustering(n_clusters=3)

# model = model.fit(x)
# Title the current figure, draw the dendrogram with the fitted cluster labels
# passed as leaf labels, then display it.
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(model, labels=model.labels_)
plt.show()

In [None]:
# Turn co-occurrence counts into a dissimilarity: scale by the largest entry of
# the matrix and subtract from 1, so a higher co-occurrence gives a smaller value.
# The clustering cell earlier in this notebook passes `a` to fit(), so this cell
# must be executed before that one.
a = 1- cooccur_matrix/cooccur_matrix.max()