# Calculate top n instead of clustering 

In [3]:
from utils import split_func_name
import numpy as np
import random
import scipy
from scipy import *
import scipy.spatial
import sklearn.metrics
import xgboost
from sklearn import cluster
import statsmodels
import keras
import gensim

In [9]:
# Library under analysis. Its module name (LIB.__name__) selects which
# decision-point file is loaded and which function-name prefix is kept.
LIB = gensim

In [10]:
# Load the decision-point names for the selected library: one name per line,
# blank lines dropped.
dp_path = '/projects/bdata/jupyter/decision_points/{}.txt'.format(LIB.__name__)
with open(dp_path, 'r') as f:
    raw_lines = f.read().split('\n')
dp = [line for line in raw_lines if line]

In [13]:
# Build the token vocabulary over all decision-point names.
#   w2i   : token -> index, assigned in first-seen order
#   vocab : list of tokens, where vocab[w2i[t]] == t
# Membership is tested against the dict rather than a growing list, so the
# cost no longer grows quadratically with the number of distinct tokens.
w2i = {}
for f in dp:
    for t in split_func_name(f):
        if t not in w2i:
            w2i[t] = len(w2i)
vocab = list(w2i)

In [16]:
# Encode every decision-point name as a vector over the token vocabulary.
# A token at position p (0-based) in a name with k tokens gets weight p / k,
# so the first token always contributes 0 and later tokens weigh more.
# If a token repeats within a name, its last position wins.
func2vector = {}
vocab_size = len(vocab)
for name in dp:
    vec = np.zeros(vocab_size)
    func2vector[name] = vec
    parts = split_func_name(name)
    for pos, tok in enumerate(parts):
        vec[w2i[tok]] = 1 * pos / len(parts)

In [17]:
# Stack the per-name vectors into one 2-D array; row i corresponds to dp[i].
vectors = np.stack([func2vector[f] for f in dp])

In [18]:
# Pairwise cosine similarity between all rows of `vectors`.
sim_mat = sklearn.metrics.pairwise.cosine_similarity(vectors)

In [20]:
# For each row, column indices ordered by ascending similarity
# (the most similar entries end up in the last columns).
sorted_mat = np.argsort(sim_mat, axis = 1)

In [21]:
# Display the indices of the 10 most similar entries per row (ascending, so the
# right-most column is the closest match; presumably the row itself).
sorted_mat[:,-10:]

array([[108,   3,   4, ...,   5,  12,   0],
       [ 10,   8,   6, ...,   2,  13,   1],
       [ 10,   8,   6, ...,   1,  13,   2],
       ...,
       [106, 108, 107, ..., 112, 118, 117],
       [106, 108, 107, ..., 112, 117, 118],
       [106, 108, 107, ..., 118, 117, 119]])

In [22]:
# Convert similarity into a distance matrix for the clustering step that follows.
dist = 1-sim_mat

In [24]:
# Average-linkage agglomerative clustering on the precomputed distance matrix.
# With n_clusters=None the number of clusters is decided by distance_threshold.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before re-running.
model = cluster.AgglomerativeClustering(n_clusters = None, distance_threshold=0.85, affinity="precomputed", linkage="average").fit(dist)

In [25]:
# Display how many clusters the fitted model produced at this threshold.
model.n_clusters_

24

In [26]:
# Group decision-point names by their assigned cluster label.
# `clusters` ends up as a list of name lists, one per label, in the order the
# labels are first encountered.
new_clusters = {}
for idx, label in enumerate(model.labels_):
    new_clusters.setdefault(label, []).append(dp[idx])
clusters = list(new_clusters.values())

In [28]:
# Print each cluster's members, one per line, with blank lines between clusters.
for members in clusters:
    print('\n')
    for name in members:
        print(name)



gensim.corpora.BleiCorpus
gensim.corpora.IndexedCorpus
gensim.corpora.LowCorpus
gensim.corpora.MalletCorpus
gensim.corpora.MmCorpus
gensim.corpora.SvmLightCorpus
gensim.corpora.TextCorpus
gensim.corpora.TextDirectoryCorpus
gensim.corpora.UciCorpus
gensim.corpora.WikiCorpus
gensim.corpora.bleicorpus.BleiCorpus
gensim.corpora.mmcorpus.MmCorpus
gensim.corpora.ucicorpus.UciCorpus
gensim.corpora.wikicorpus.WikiCorpus
gensim.matutils.Dense2Corpus
gensim.matutils.Scipy2Corpus
gensim.matutils.Sparse2Corpus
gensim.matutils.corpus2csc
gensim.matutils.corpus2dense
gensim.models.word2vec.Text8Corpus
gensim.utils.ClippedCorpus
gensim.utils.SlicedCorpus


gensim.corpora.Dictionary
gensim.corpora.HashDictionary
gensim.corpora.dictionary.Dictionary


gensim.matutils.any2sparse
gensim.matutils.sparse2full
gensim.models.Doc2Vec
gensim.models.Word2Vec
gensim.models.atmodel.construct_author2doc
gensim.models.atmodel.construct_doc2author
gensim.models.doc2vec.Doc2Vec
gensim.models.doc2vec.TaggedDocument


# Clustering based on co-occurrence (ineffective)

In [49]:
# Load the per-file function-usage records. Later cells read each entry's
# 'funcs' and 'linenos' keys.
# `json` is not imported in the notebook's import cell, so import it here to
# keep this cell runnable on a fresh kernel.
import json

with open('./file_funcs.json','r') as f:
    data = json.load(f)

In [50]:
# Map every observed function name of the selected library to the first
# decision point (in dp order) that it starts with; names with no match are
# left out of f_map.
# NOTE(review): this is a plain string-prefix test, so a decision point such
# as `pkg.fmin` would also capture `pkg.fmin_bfgs`; confirm this is intended.
lib_prefix = '{}.'.format(LIB.__name__)
f_map = {}
for record in data.values():
    for func in record['funcs']:
        if not func.startswith(lib_prefix):
            continue
        matched = next((r for r in dp if func.startswith(r)), None)
        if matched is not None:
            f_map[func] = matched

In [53]:
# Build one occurrence vector per decision point, with one slot per entry of
# `data` (presumably one notebook each, going by the log message).
#   func2vector[func][i] : 0.01 baseline + number of mapped calls in files[i]
#   sklearn_counter[func]: total number of mapped calls across all files
# Changes from the earlier version of this cell: the `if nb in data` guard was
# always true (files comes from data.keys()) and the 'linenos' value was read
# but never used, so both are gone.
func2vector = {}
vector_size = len(data)
files = list(data.keys())
err_files = []  # never filled in this cell; kept in case other cells read it
sklearn_counter = {}
for i, nb in enumerate(files):
    if i%10000 == 0:
        print('Log: {} notebooks processed'.format(i))
    # Translate raw calls to their decision point; unmapped calls are dropped.
    funcs = [f_map[func] for func in data[nb]["funcs"] if func in f_map]
    for func in funcs:
        if func not in func2vector:
            # Start from a small constant rather than 0 in every slot.
            func2vector[func] = np.full((vector_size), 0.01)
        func2vector[func][i] += 1
        sklearn_counter[func] = sklearn_counter.get(func, 0) + 1

Log: 0 notebooks processed
--------------------
--------------------
--------------------
--------------------
--------------------
scipy.stats.rv_discrete
scipy.stats.rv_discrete
scipy.stats.rv_discrete
scipy.stats.rv_discrete
--------------------
--------------------
--------------------
scipy.stats.skew
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
scipy.optimize.brent
--------------------
--------------------
--------------------
--------------------
scipy.optimize.fmin
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------


In [54]:
# Give every decision point that was never observed a baseline-only vector so
# that all of dp is represented in func2vector.
unseen = [name for name in dp if name not in func2vector]
for name in unseen:
    func2vector[name] = np.full((vector_size), 0.01)

In [55]:
# Fix a row order for the functions and derive the matrices built on it.
#   idx2func / func2idx : row index <-> function name
#   occur_matrix        : one row per function, one column per file
#   cooccur_matrix      : dot product between every pair of rows
idx2func = list(func2vector)
func2idx = dict(zip(idx2func, range(len(idx2func))))
vectors = [func2vector[name] for name in idx2func]
occur_matrix = np.stack(vectors)
cooccur_matrix = occur_matrix.dot(occur_matrix.T)

In [57]:
# Scale co-occurrence by its maximum and invert it, so stronger co-occurrence
# gives a smaller value. `a` is reassigned from sim_mat in a later cell.
a = 1-cooccur_matrix/cooccur_matrix.max()

In [59]:
# Pairwise cosine similarity between the occurrence vectors (rows of occur_matrix).
sim_mat = sklearn.metrics.pairwise.cosine_similarity(occur_matrix)

In [63]:
# For each function, the other functions' indices ordered by ascending
# similarity (the most similar entries end up in the last columns).
sorted_mat = np.argsort(sim_mat, axis = 1)

In [85]:
# Find the most similar functions.
# Optional diagnostic; it does not affect the rest of the pipeline.
# For every function, walk its 10 nearest neighbours from most to least
# similar and print those whose cosine similarity is at least 0.44.
for row, nearest in enumerate(sorted_mat[:, -10:]):
    print('-'*20)
    for col in reversed(nearest):
        if sim_mat[row, col] >= 0.44:
            print(idx2func[col])

--------------------
sklearn.feature_extraction.text.TfidfVectorizer
--------------------
sklearn.feature_extraction.text.CountVectorizer
sklearn.naive_bayes.MultinomialNB
sklearn.feature_extraction.text.TfidfTransformer
--------------------
sklearn.linear_model.Log
--------------------
sklearn.svm.SVC
--------------------
sklearn.preprocessing.LabelEncoder
--------------------
sklearn.linear_model.Ridge
sklearn.linear_model.Lasso
--------------------
sklearn.metrics.classification_report
sklearn.metrics.confusion_matrix
--------------------
sklearn.linear_model.Lasso
sklearn.linear_model.Ridge
--------------------
sklearn.metrics.confusion_matrix
sklearn.metrics.classification_report
--------------------
sklearn.metrics.mean_squared_error
--------------------
sklearn.preprocessing.StandardScaler
--------------------
sklearn.neural_network.MLPClassifier
--------------------
sklearn.preprocessing.LabelBinarizer
--------------------
sklearn.cluster.spectral_clustering
sklearn.feature_ext

In [64]:
# Cosine distance between occurrence vectors; this is the matrix handed to
# the clustering step that follows.
a = 1- sim_mat

In [66]:
# Average-linkage agglomerative clustering on the precomputed distance matrix.
# With n_clusters=None the number of clusters is decided by distance_threshold.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before re-running.
model = cluster.AgglomerativeClustering(n_clusters = None, distance_threshold=0.8, affinity="precomputed", linkage="average").fit(a)





In [67]:
# Display how many clusters the fitted model produced at this threshold.
model.n_clusters_

80

In [68]:
# Group function names by cluster label, then keep only the member lists.
# `clusters` ends up as a list of name lists in first-seen label order.
groups = {}
for idx, label in enumerate(model.labels_):
    groups.setdefault(label, []).append(idx2func[idx])
clusters = list(groups.values())

In [70]:
# Print every cluster as a block of member names, each block preceded by a rule.
# To inspect a single cluster, add a membership filter on `members` here
# (e.g. on 'sklearn.linear_model.Ridge').
for members in clusters:
    print('-'*20)
    for name in members:
        print(name)

--------------------
scipy.stats.rv_discrete
--------------------
scipy.stats.skew
scipy.stats.kurtosis
--------------------
scipy.optimize.brent
scipy.optimize.minimize
scipy.optimize.newton
scipy.optimize.root
scipy.optimize.fixed_point
scipy.optimize.rosen
--------------------
scipy.optimize.fmin
scipy.misc.face
scipy.linalg.det
scipy.optimize.fsolve
scipy.optimize.brute
scipy.optimize.basinhopping
scipy.stats.scoreatpercentile
--------------------
scipy.stats.probplot
--------------------
scipy.stats.chisquare
--------------------
scipy.stats.linregress
scipy.stats.spearmanr
--------------------
scipy.integrate.ode
--------------------
scipy.stats.mannwhitneyu
--------------------
scipy.integrate.quad
--------------------
scipy.stats.pearsonr
--------------------
scipy.sparse.diags
--------------------
scipy.sparse.hstack
scipy.sparse.csr_matrix
--------------------
scipy.stats.describe
--------------------
scipy.optimize.curve_fit
--------------------
scipy.stats.mode
------------