# What does thid notebook do?  
using agglomerative clustering to find posible decision points in all notebooks  

# File mapping 
sklearn | clustering | ---: `./sklearn_dp.txt`

# Agglomerative Clustering

In [1]:
%matplotlib notebook

In [2]:
import os
from tqdm import tqdm
import numpy as np
from sklearn import cluster
import scipy.cluster.hierarchy as sch
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

In [3]:
from importlib import import_module

In [4]:
from extract_func import process_file

In [111]:
class Tree(object):
    """tree sturcture to record currently found decision points"""
    def __init__(self, lib, funcs = []):
        """init a tree with a list of functions or []"""
        super(Tree, self).__init__()
        self.lib = lib
        self.root = {}
        for f in funcs:
            self.add_func(f)
    
    def add_func(self, func):
        """add function to tree"""
        tokens = func.split('.')
        cur_root = self.root
        for i, t in enumerate(tokens):
            if t not in cur_root:
                cur_root[t] = {}
            cur_root = cur_root[t]
            
    def add_funcs(self, funcs):
        """add functions to tree"""
        for f in funcs:
            self.add_func(f)

    def check_node(self, node):
        """delete leaves if very deep but not wide"""
        if len(node)<3:
            return {}
        else:
            node = {k: self.check_node(node[k]) for k in node}
            return node
    
    def cut_tree(self):
        self.cut_root = {self.lib: self.check_node(self.root[self.lib])}
        
    def tolist(self, prefix, tree):
        results = []
        for k in tree:
            if tree[k] =={}:
                results.append('{}.{}'.format(prefix,k) if prefix!='' else k)
            else:
                results+=self.tolist('{}.{}'.format(prefix,k) if prefix!='' else k, tree[k])
        return results
        
        

In [7]:
import json
with open('./func_counter.json','r') as f:
    func_counter = json.load(f)["func_counter"]
sklearn_counter = {k: func_counter[k] for k in func_counter if k.startswith('sklearn.')}

In [61]:
sklearn_counter

{'sklearn.datasets.load_diabetes': 1061,
 'sklearn.linear_model.Lasso': 5232,
 'sklearn.cross_validation.cross_val_score': 19251,
 'sklearn.linear_model.LassoCV': 1119,
 'sklearn.cross_validation.KFold': 6901,
 'sklearn.linear_model.LassoCV.fit': 1338,
 'sklearn.datasets.load_digits': 3636,
 'sklearn.datasets.load_digits.images.reshape': 317,
 'sklearn.model_selection.train_test_split': 15010,
 'sklearn.model_selection.GridSearchCV': 6546,
 'sklearn.model_selection.GridSearchCV.fit': 5496,
 'sklearn.model_selection.GridSearchCV.predict': 1221,
 'sklearn.feature_extraction.text.CountVectorizer': 9520,
 'sklearn.feature_extraction.text.CountVectorizer.fit': 1828,
 'sklearn.feature_extraction.text.CountVectorizer.transform': 3773,
 'sklearn.feature_extraction.text.CountVectorizer.get_feature_names': 2270,
 'sklearn.preprocessing.LabelEncoder': 8892,
 'sklearn.preprocessing.LabelEncoder.fit': 5148,
 'sklearn.preprocessing.LabelEncoder.transform': 5966,
 'sklearn.preprocessing.LabelEncoder.

But what is the point to do clustering before choosing representative functions?
Why can't we just sort all the functions and choose short ones? 

In [9]:
# nb_path = '/projects/bdata/jupyter/target'
nb_path = '/home/gezhang/data/jupyter/target'
notebooks = os.listdir(nb_path)

filter notebooks (which import sklearn)

In [12]:
sklearn_notebooks = []
for i, nb in enumerate(notebooks):
    if i%10000==0:
        print(i)
    with open(os.path.join(nb_path, nb),'r') as f:
#         print('--')
        content = f.read()
    tokens = content.split()
    if 'sklearn' in tokens:
        sklearn_notebooks.append(nb)
    

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000


# delete loops:
eg  
**sklearn.tree.DecisionTreeClassifier.fit.fit.fit**  
is equal to   
**sklearn.tree.DecisionTreeClassifier.fit**  
  
the only difference is that the first one is called multiple times  

In [13]:
def delete_loop(func):
    """
    delete continue loops in a function
    simplify **sklearn.tree.DecisionTreeClassifier.fit.fit.fit** 
          to **sklearn.tree.DecisionTreeClassifier.fit**
    """
    tokens = func.split('.')
    new_tokens = []
    for t in tokens:
        if new_tokens == [] or t!=new_tokens[-1]:
            new_tokens.append(t)
    return '.'.join(new_tokens)

In [14]:
no_loop_sklearn_funcs = []
for k in sklearn_counter:
    t = delete_loop(k)
    if t not in no_loop_sklearn_funcs:
        no_loop_sklearn_funcs.append(t)
no_loop_sklearn_funcs

['sklearn.datasets.load_diabetes',
 'sklearn.linear_model.Lasso',
 'sklearn.cross_validation.cross_val_score',
 'sklearn.linear_model.LassoCV',
 'sklearn.cross_validation.KFold',
 'sklearn.linear_model.LassoCV.fit',
 'sklearn.datasets.load_digits',
 'sklearn.datasets.load_digits.images.reshape',
 'sklearn.model_selection.train_test_split',
 'sklearn.model_selection.GridSearchCV',
 'sklearn.model_selection.GridSearchCV.fit',
 'sklearn.model_selection.GridSearchCV.predict',
 'sklearn.feature_extraction.text.CountVectorizer',
 'sklearn.feature_extraction.text.CountVectorizer.fit',
 'sklearn.feature_extraction.text.CountVectorizer.transform',
 'sklearn.feature_extraction.text.CountVectorizer.get_feature_names',
 'sklearn.preprocessing.LabelEncoder',
 'sklearn.preprocessing.LabelEncoder.fit',
 'sklearn.preprocessing.LabelEncoder.transform',
 'sklearn.preprocessing.LabelEncoder.fit_transform',
 'sklearn.cross_validation.StratifiedKFold',
 'sklearn.feature_extraction.text.HashingVectorizer',


In [15]:
func2vector = {}
vector_size = len(sklearn_notebooks)
err_files = []
for i, nb in enumerate(sklearn_notebooks):
    if i%10000 == 0:
        print('Log: {} notebooks processed'.format(i))
    funcs = []
    try:
        funcs, linenos = process_file(os.path.join(nb_path, nb))
    except Exception as e:
        err_files.append(nb)
    funcs = [func for func in funcs if func in no_loop_sklearn_funcs]
    for func in funcs:
#         if not func.startswith('sklearn'):
#             continue
        if func not in func2vector:
            func2vector[func] = np.zeros(vector_size)
        func2vector[func][i] = 1

Log: 0 notebooks processed
Log: 10000 notebooks processed
Log: 20000 notebooks processed
Log: 30000 notebooks processed
Log: 40000 notebooks processed
Log: 50000 notebooks processed
Log: 60000 notebooks processed
Log: 70000 notebooks processed
Log: 80000 notebooks processed
Log: 90000 notebooks processed


In [132]:
idx2func = list(func2vector.keys())
func2idx = {f: i for i, f in enumerate(idx2func)}
vectors = [func2vector[f] for f in idx2func]
occur_matrix = np.stack(vectors)
cooccur_matrix = np.dot(occur_matrix, occur_matrix.T)

In [133]:
a = 1- cooccur_matrix/cooccur_matrix.max()

In [134]:
model = cluster.AgglomerativeClustering(n_clusters = None, distance_threshold=0.996, affinity="precomputed", linkage="average").fit(a)

In [135]:
model.n_clusters_

2910

In [136]:
new_clusters = {}
# new_clusters = [[]]*model.n_clusters_
for i, l in enumerate(model.labels_):
    if l not in new_clusters:
        new_clusters[l] = []
    new_clusters[l].append(idx2func[i])
#     print(i)
clusters = list(new_clusters.values())

In [137]:
clusters

[['sklearn.feature_extraction.text.TfidfVectorizer',
  'sklearn.feature_extraction.text.TfidfVectorizer.transform',
  'sklearn.feature_extraction.text.CountVectorizer',
  'sklearn.feature_extraction.text.CountVectorizer.fit',
  'sklearn.feature_extraction.text.CountVectorizer.transform',
  'sklearn.naive_bayes.MultinomialNB',
  'sklearn.naive_bayes.MultinomialNB.fit',
  'sklearn.feature_extraction.text.CountVectorizer.get_feature_names',
  'sklearn.naive_bayes.MultinomialNB.predict',
  'sklearn.feature_extraction.text.CountVectorizer.fit_transform',
  'sklearn.pipeline.Pipeline',
  'sklearn.naive_bayes.MultinomialNB.fit.predict',
  'sklearn.feature_extraction.text.TfidfVectorizer.fit_transform',
  'sklearn.pipeline.Pipeline.fit',
  'sklearn.feature_extraction.text.TfidfTransformer',
  'sklearn.feature_extraction.text.TfidfTransformer.fit_transform',
  'sklearn.feature_extraction.text.TfidfTransformer.transform',
  'sklearn.pipeline.Pipeline.predict',
  'sklearn.datasets.fetch_20newsgro

In [139]:
def is_parent_child(parent_func, child_func):
    if len(parent_func)>=len(child_func):
        return False
    if child_func.startswith(parent_func):
        if len(child_func.split('.'))>len(parent_func.split('.')):
            return True
        else:
            return False
    else:
        return False

In [140]:
not_root = []
sets = []
has_root = []

def find_root_in_cluster(cluster):
    """
    find a representative function in a cluster like:
     ['sklearn.preprocessing.MinMaxScaler',
      'sklearn.preprocessing.MinMaxScaler.fit_transform',
      'sklearn.preprocessing.MinMaxScaler.transform']
    return a list []
    """
#     if len(cluster)>3:
#         return []
    cluster = sorted(cluster)
    last_root = '*'
    cluster_roots = []
    for f in cluster:
        if not f.startswith(last_root):
            cluster_roots.append(f)
            last_root = f
        elif f not in has_root:
            has_root.append(f)
    cluster_roots = [r for r in cluster_roots if not any([is_parent_child(root, r) for root in roots])]
    return cluster_roots
            

    

In [142]:
roots = []
tree = Tree('sklearn')
for c in tqdm(clusters):
#     roots+=find_root_in_cluster(c)
#     print(c)
    funcs = find_root_in_cluster(c)
#     if "sklearn.cluster.SpectralClustering" in funcs or "sklearn.cluster.SpectralClustering.fit" in funcs:
#         print(c)
#     if 
    roots+=funcs
#     print(funcs)
    tree.add_funcs(funcs)
# print(json.dumps(tree.root, ensure_ascii=False, indent=2))
    

100%|██████████| 2910/2910 [00:00<00:00, 7039.03it/s]


In [143]:
sorted(roots)

['sklearn.GradientBoostingClassifier',
 'sklearn.GridSearchCV',
 'sklearn.MLPClassifier',
 'sklearn.__version__.split',
 'sklearn.base.BaseEstimator.get_params',
 'sklearn.base.BaseEstimator.set_params',
 'sklearn.base.clone',
 'sklearn.calibration.CalibratedClassifierCV',
 'sklearn.calibration.calibration_curve',
 'sklearn.clone',
 'sklearn.cluster.AffinityPropagation',
 'sklearn.cluster.AgglomerativeClustering',
 'sklearn.cluster.Birch',
 'sklearn.cluster.DBSCAN',
 'sklearn.cluster.FeatureAgglomeration',
 'sklearn.cluster.KMeans',
 'sklearn.cluster.KMedoids',
 'sklearn.cluster.MeanShift',
 'sklearn.cluster.Meanshift',
 'sklearn.cluster.MiniBatchKMeans',
 'sklearn.cluster.SpectralClustering',
 'sklearn.cluster.SpectralClustering.fit',
 'sklearn.cluster.Ward',
 'sklearn.cluster.WardAgglomeration',
 'sklearn.cluster.affinity_propagation',
 'sklearn.cluster.append',
 'sklearn.cluster.bicluster.SpectralBiclustering',
 'sklearn.cluster.bicluster.SpectralCoclustering',
 'sklearn.cluster.bir

In [144]:
len(roots)

579

In [145]:
# 删去representatives 中还有sub 关系的函数（只相差一级的话）
dp = []
for f in sorted(roots):
    if dp==[] or len(dp[-1].split('.'))<3:
        dp.append(f)
#     elif not f.startswith(dp[-1]):

#         dp.append(f)
#     elif len(f.split('.'))-len(dp[-1].split('.'))!=1:
#         dp.append(f)
#     else:
#         continue
    elif not is_parent_child(dp[-1], f):
        dp.append(f)
    else:
        continuenue

In [146]:
len(dp)

557

In [147]:
dp

['sklearn.GradientBoostingClassifier',
 'sklearn.GridSearchCV',
 'sklearn.MLPClassifier',
 'sklearn.__version__.split',
 'sklearn.base.BaseEstimator.get_params',
 'sklearn.base.BaseEstimator.set_params',
 'sklearn.base.clone',
 'sklearn.calibration.CalibratedClassifierCV',
 'sklearn.calibration.calibration_curve',
 'sklearn.clone',
 'sklearn.cluster.AffinityPropagation',
 'sklearn.cluster.AgglomerativeClustering',
 'sklearn.cluster.Birch',
 'sklearn.cluster.DBSCAN',
 'sklearn.cluster.FeatureAgglomeration',
 'sklearn.cluster.KMeans',
 'sklearn.cluster.KMedoids',
 'sklearn.cluster.MeanShift',
 'sklearn.cluster.Meanshift',
 'sklearn.cluster.MiniBatchKMeans',
 'sklearn.cluster.SpectralClustering',
 'sklearn.cluster.Ward',
 'sklearn.cluster.WardAgglomeration',
 'sklearn.cluster.affinity_propagation',
 'sklearn.cluster.append',
 'sklearn.cluster.bicluster.SpectralBiclustering',
 'sklearn.cluster.bicluster.SpectralCoclustering',
 'sklearn.cluster.birch.Birch',
 'sklearn.cluster.data.argsort',

In [127]:
sorted(roots)

['sklearn.base.clone',
 'sklearn.base.clone.fit',
 'sklearn.calibration.CalibratedClassifierCV',
 'sklearn.calibration.CalibratedClassifierCV.fit',
 'sklearn.calibration.calibration_curve',
 'sklearn.clone',
 'sklearn.cluster.AffinityPropagation',
 'sklearn.cluster.AffinityPropagation.fit',
 'sklearn.cluster.AgglomerativeClustering',
 'sklearn.cluster.AgglomerativeClustering.fit',
 'sklearn.cluster.Birch',
 'sklearn.cluster.DBSCAN',
 'sklearn.cluster.DBSCAN.fit',
 'sklearn.cluster.FeatureAgglomeration',
 'sklearn.cluster.KMeans',
 'sklearn.cluster.KMeans.fit',
 'sklearn.cluster.MeanShift',
 'sklearn.cluster.MiniBatchKMeans',
 'sklearn.cluster.MiniBatchKMeans.fit',
 'sklearn.cluster.MiniBatchKMeans.predict',
 'sklearn.cluster.SpectralClustering',
 'sklearn.cluster.SpectralClustering.fit_predict',
 'sklearn.cluster.Ward',
 'sklearn.cluster.Ward.fit',
 'sklearn.cluster.affinity_propagation',
 'sklearn.cluster.bicluster.SpectralBiclustering',
 'sklearn.cluster.bicluster.SpectralBiclusterin

In [148]:
print(len(roots))
print(len(dp))

579
557


# Expansion

In [None]:
# extract structure from statsmodels_roots
statsmodels_roots = ['.'.join(f.split('.')[:-1]) for f in sorted(tree.tolist('',tree.cut_root))]

In [None]:
roots_counter = {}
for r in statsmodels_roots:
    if r not in roots_counter:
        roots_counter[r]=0
    roots_counter[r]+=1
    
    
frequent_roots = []
for r in roots_counter:
    try:
        mod = import_module(r)
        if  roots_counter[r]/len(dir(mod))>0.1:
            frequent_roots.append(r)
    except:
        pass
    
for r in frequent_roots:
    try:
        mod = import_module(r)
        try:
            dps = ['{}.{}'.format(r, obj) for obj in mod.__all__] 
        except:
            dps =  ['{}.{}'.format(r, obj) for obj in dir(mod) if not obj.startswith('_')]
        statsmodels_dp+=dps
    except Exception as e:
        print(e)
statsmodels_dp = sorted(list(set(statsmodels_dp)))

In [None]:
len(statsmodels_dp)

In [None]:
for f in statsmodels_dp:
    print(f)

In [None]:
v1 = func2vector["sklearn.cluster.Birch.fit"]
v2 = func2vector["sklearn.cluster.Birch"]
v3 = func2vector["sklearn.cluster.AgglomerativeClustering.labels_.astype"]
np.linalg.norm(v1- v3)