## Import Library

In [1]:
import json
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this 
import matplotlib.pyplot as plt
%matplotlib inline

  from collections import Iterable
  @attr.s(cmp=False, hash=False)


## Load Data 

Variable `data` has the format `{repo1: [...], repo2: [...]}`: each repository maps to the list of its dependency names.


In [2]:
# Read the repo -> dependency-list mapping from the JSON dump.
input_file ='../data/result_13k.json'
with open(input_file) as f:
    data = json.load(f)
print(f'repos nums: {len(data)}')

# Repos whose entry contains the 'No dependency' marker carry no usable
# dependency information: collect them first, then drop them from `data`.
need_to_remove = [repo for repo, deps in data.items() if 'No dependency' in deps]
print(f'repos which have no dependency files: {len(need_to_remove)}')

for repo in need_to_remove:
    data.pop(repo)
print(f'repos with dependency files: {len(data)}')

repos nums: 3012
repos which have no dependency files: 0
repos with dependency files: 3012


## PreProcessing
1. Count the occurrences of each distinct dependency -> `dep_dict`
2. Sort the dependencies by frequency, most frequent first -> `sort_dep`

In [3]:
# Repo names and their dependency lists, both in the iteration order of `data`.
rep_list = list(data.keys())
dep_list = list(data.values())

# Tally how many times each dependency name occurs across all repos.
dep_dict = {}
for repo_deps in dep_list:
    for dep_name in repo_deps:
        if dep_name in dep_dict:
            dep_dict[dep_name] += 1
        else:
            dep_dict[dep_name] = 1

print(f'Distinct dependency file: {len(dep_dict)}',end='\n\n')

# Most frequent first; the sort is stable, so ties keep first-seen order.
sort_dep = sorted(dep_dict.items(), key=lambda item: item[1], reverse=True)
print(f'Frequent dependency files:  {sort_dep[:20]}')

Distinct dependency file: 15663

Frequent dependency files:  [('numpy', 1536), ('scipy', 982), ('matplotlib', 881), ('pandas', 840), ('pytest', 598), ('sphinx', 433), ('requests', 389), ('six', 339), ('scikit-learn', 296), ('pyyaml', 284), ('pytest-cov', 270), ('tqdm', 268), ('sphinx-rtd-theme', 251), ('ipython', 236), ('coverage', 224), ('python-dateutil', 224), ('seaborn', 205), ('h5py', 197), ('pytz', 197), ('jinja2', 191)]


## Word Embedding
1. Build the dependency-name index table -> `id2word`, format: `{0: 'ipython', 1: 'jupyter-sphinx', ...}`
2. Build the corpus -> `corpus`, format: `[repo1_info, repo2_info, ...]`, where each `repo_info` is a list of `(dep_id, frequency)` pairs
3. Build an LDA model

In [4]:
# Each repo is one "document" whose tokens are its dependency names.
texts = list(data.values())

# Index table mapping every distinct dependency name to an integer id.
id2word = corpora.Dictionary(texts)   # e.g. {0: 'ipython', 1: 'jupyter-sphinx', ...}

# Bag-of-words per repo: a list of (dependency_id, count) pairs.
corpus = []
for text in texts:
    corpus.append(id2word.doc2bow(text))   # e.g. [(0, 1), (1, 1), ...]

print(f'repos info: {corpus[0]}')    #(0, 1)

# Human-readable view of the first repo: ids replaced by dependency names.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

repos info: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


[[('ipython', 1),
  ('jupyter-sphinx', 1),
  ('nbformat', 1),
  ('nbsphinx', 1),
  ('path-py', 1),
  ('six', 1),
  ('sphinx', 1),
  ('sphinx-hoverxref', 1),
  ('sphinx-rtd-theme', 1)]]

## Build LDA model

In [12]:
%%time
clusters = list(range(2,30))
for k in clusters:   
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=k, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    # pprint(lda_model.print_topics())
    # Compute Perplexity and Coherence Score
    Perplexity = lda_model.log_perplexity(corpus)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=list(data.values()), dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('cluster number:',k, 'Coherence Score',coherence_lda,'Perplexity:',Perplexity)
    print('='*55)

cluster number: 2 Coherence Score 0.7168172832060531 Perplexity: -8.237998902231151
cluster number: 3 Coherence Score 0.615690246805626 Perplexity: -8.071388752843317
cluster number: 4 Coherence Score 0.6636795587837757 Perplexity: -8.034621326643972
cluster number: 5 Coherence Score 0.7663406466626557 Perplexity: -8.028612683211241
cluster number: 6 Coherence Score 0.7030085159612155 Perplexity: -8.131020966388451
cluster number: 7 Coherence Score 0.6827045068329411 Perplexity: -8.209558886377186
cluster number: 8 Coherence Score 0.6953147529879 Perplexity: -8.36912075711036
cluster number: 9 Coherence Score 0.6804980018623574 Perplexity: -8.561414175700984
cluster number: 10 Coherence Score 0.6355667452627967 Perplexity: -8.880673253546929
cluster number: 11 Coherence Score 0.5789351789045056 Perplexity: -9.16700800509858
cluster number: 12 Coherence Score 0.5924426651940251 Perplexity: -9.806881016067035
cluster number: 13 Coherence Score 0.6877296377135378 Perplexity: -10.577434744

=> result: cluster number: 15 Coherence Score 0.7210851908924468 Perplexity: -12.796768596728718

In [13]:
# Train the LDA model that the following cells inspect, using k = 15 topics.
final_lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**final_lda_params)

In [14]:
# Print every topic as a (topic_id, 'weight*"word" + ...') tuple, limited to
# the 5 highest-weighted words of that topic.
top_words_per_topic = lda_model.print_topics(num_words=5)
for topic_id, description in top_words_per_topic:
    print((topic_id, description))

(0, '0.033*"style-loader" + 0.030*"url-loader" + 0.026*"webpack" + 0.026*"css-loader" + 0.023*"webpack-cli"')
(1, '0.011*"@babel/core" + 0.010*"@babel/preset-env" + 0.007*"@babel/helper-module-imports" + 0.007*"@babel/parser" + 0.007*"@babel/helper-function-name"')
(2, '0.064*"kiwisolver" + 0.033*"selenium" + 0.028*"pycodestyle" + 0.027*"recommonmark" + 0.026*"babel"')
(3, '0.026*"pyqt5" + 0.017*"sass" + 0.014*"coffee-script" + 0.013*"i18n" + 0.012*"rake"')
(4, '0.073*"coverage" + 0.072*"pytest" + 0.071*"pytest-cov" + 0.063*"nose" + 0.046*"sphinx"')
(5, '0.157*"numpy" + 0.106*"scipy" + 0.096*"matplotlib" + 0.091*"pandas" + 0.037*"pytest"')
(6, '0.041*"org.apache.maven.plugins:maven-compiler-plugin" + 0.037*"junit:junit" + 0.026*"org.apache.maven.plugins:maven-jar-plugin" + 0.023*"org.apache.maven.plugins:maven-surefire-plugin" + 0.023*"org.apache.maven.plugins:maven-javadoc-plugin"')
(7, '0.039*"setuptools" + 0.023*"decorator" + 0.019*"jinja2" + 0.018*"coveralls" + 0.018*"cycler"')
(8,

In [15]:
# Per-repo topic weights (the first element returned by inference), one row
# per repo and one column per topic. Inference is run once and reused, instead
# of once for the shape and once more for the values.
# NOTE(review): the rows are not normalised (entries can exceed 1), so they
# are weights rather than probabilities -- presumably dividing each row by its
# sum gives the topic distribution; confirm against the gensim docs.
doc_topic_gamma = lda_model.inference(corpus)[0]
doc_topic_gamma.shape, doc_topic_gamma

((3012, 15),
 array([[0.04461605, 0.02198247, 1.0795138 , ..., 0.0294043 , 0.03900352,
         0.24037395],
        [0.04461605, 0.02198247, 0.06773023, ..., 0.0294043 , 0.03900352,
         1.1452159 ],
        [0.04461605, 0.02198247, 0.06773023, ..., 0.0294043 , 4.0388503 ,
         2.4690583 ],
        ...,
        [0.04461605, 0.02198247, 0.06773023, ..., 0.0294043 , 0.03900352,
         0.22609688],
        [0.04461605, 0.02198247, 0.06773023, ..., 0.0294043 , 0.03900352,
         0.22609688],
        [0.04461605, 0.02198247, 0.06773023, ..., 0.0294043 , 0.03900352,
         0.22609733]], dtype=float32))

In [16]:
# Assign every repo to its dominant topic (the column with the largest weight)
# and group the repos by that topic id.
print('inference ：')
topic_dict = {}   # topic_id -> list of repos assigned to that topic
doc_topic_gamma = lda_model.inference(corpus)[0]   # one row of topic weights per repo
for repo, values in zip(rep_list, doc_topic_gamma):
    # argmax returns the first index of the maximum, matching a strict ">"
    # scan; int() keeps the dict keys plain Python ints.
    topic_id = int(values.argmax())
    topic_dict.setdefault(topic_id, []).append(repo)
    print(topic_id, '--->', repo)

inference ：
7 ---> https://github.com/AgriculturalModelExchangeInitiative/Crop2ML
14 ---> https://github.com/natcap/natgeo-dams
5 ---> https://github.com/hlgirard/CrystalML
5 ---> https://github.com/houghb/ligpy
7 ---> https://github.com/oschwengers/referenceseeker
5 ---> https://github.com/LPDI-EPFL/trivalent_cocktail
5 ---> https://github.com/CIRA-Pulsars-and-Transients-Group/vcstools
4 ---> https://github.com/usc-isi-i2/kgtk
5 ---> https://github.com/garciagenrique/template_project_escape
2 ---> https://github.com/gwu-libraries/sfm-docker
6 ---> https://github.com/javaparser/javaparser
5 ---> https://github.com/williamjameshandley/anesthetic
14 ---> https://github.com/yardencsGitHub/tweetynet
5 ---> https://github.com/jagalindo/A-Python-QX-implementation
5 ---> https://github.com/jbkinney/mavenn
5 ---> https://github.com/GeoCode-polymtl/Seis_float16
5 ---> https://github.com/sebp/scikit-survival
13 ---> https://github.com/similitude/sumo-simmer
12 ---> https://github.com/GlobalNames

5 ---> https://github.com/nano-sippe/dispersion
5 ---> https://github.com/hallamlab/pathway2vec
5 ---> https://github.com/SCM-NV/nano-qmflows
5 ---> https://github.com/bburan/NeuroBehavior
3 ---> https://github.com/PhenixCollaboration/web
14 ---> https://github.com/Bubblbu/crawling-framework
14 ---> https://github.com/smarr/ReBench
5 ---> https://github.com/Alerovere/Paleo-SL-utilities
5 ---> https://github.com/TUDelft-CITG/OpenCLSim
5 ---> https://github.com/RMeli/spyrmsd
14 ---> https://github.com/dejac001/RealGas
6 ---> https://github.com/SciCrunch/resource_disambiguator
5 ---> https://github.com/rraadd88/htsimaging
5 ---> https://github.com/etiennebresciani/wellradpy
4 ---> https://github.com/blsqr/paramspace
5 ---> https://github.com/FInAT/FInAT
5 ---> https://github.com/hovo1990/GROM
5 ---> https://github.com/scikit-learn/scikit-learn
5 ---> https://github.com/covid-lncc/pydemic
5 ---> https://github.com/pvlib/pvlib-python
6 ---> https://github.com/phenoscape/phenoday-reasoning-p

5 ---> https://github.com/DLR-SC/gitlab2prov
4 ---> https://github.com/openearth/aeolis-python
9 ---> https://github.com/Lonero-Team/Decentralized-Internet
14 ---> https://github.com/LegoStormtroopr/django-spaghetti-and-meatballs
5 ---> https://github.com/smarie/python-azureml-client
5 ---> https://github.com/NREL/floris
14 ---> https://github.com/ADicksonLab/wepy
5 ---> https://github.com/M4I-nanoscopy/tpx3-event-localisation
3 ---> https://github.com/myGrid/ruby-ucf
5 ---> https://github.com/andsor/pyfssa
4 ---> https://github.com/arkottke/pykoom
5 ---> https://github.com/tardis-sn/tardis
5 ---> https://github.com/IaPCS/gmsh-exodus-converter
5 ---> https://github.com/atait/lytest
5 ---> https://github.com/ocsmit/rindcalc
8 ---> https://github.com/mpostol/TP
8 ---> https://github.com/ElektraInitiative/libelektra
7 ---> https://github.com/will-rowe/groot
10 ---> https://github.com/bexis/Module_LUI
5 ---> https://github.com/benjaminrose/MC-Age
5 ---> https://github.com/SSW-DataLab/rando

14 ---> https://github.com/boutiques/boutiques
5 ---> https://github.com/nansencenter/nansat
4 ---> https://github.com/cdanielmachado/reframed
7 ---> https://github.com/LightForm-group/matflow
14 ---> https://github.com/Capitains/Hook
5 ---> https://github.com/arokem/ISBI2015
5 ---> https://github.com/a-slide/NanoSnake
5 ---> https://github.com/laplizard/infoplot
5 ---> https://github.com/hugadams/PAME
5 ---> https://github.com/torressa/cspy
5 ---> https://github.com/ConservationInternational/trends.earth
4 ---> https://github.com/OpenChemistry/avogadrolibs
5 ---> https://github.com/ondrolexa/pywerami
1 ---> https://github.com/Sulstice/datacity
7 ---> https://github.com/bootphon/phonemizer
5 ---> https://github.com/chrisgorgo/alleninf
12 ---> https://github.com/opentox/lazar-rest
6 ---> https://github.com/SeqWare/seqware
5 ---> https://github.com/msmbuilder/osprey
14 ---> https://github.com/urschrei/CDP
7 ---> https://github.com/comic/grand-challenge.org
5 ---> https://github.com/rolan

5 ---> https://github.com/MicroPasts/EgyptExplorationSocBuhenPottery
5 ---> https://github.com/luispedro/imread
14 ---> https://github.com/PCMSolver/pcmsolver
6 ---> https://github.com/klout/brickhouse
8 ---> https://github.com/Dash-Industry-Forum/dash.js
5 ---> https://github.com/ProjectDrawdown/spatial-aez
8 ---> https://github.com/cytoscape/cytoscape.js-popper
5 ---> https://github.com/ecohealthalliance/pubcrawler
5 ---> https://github.com/underworldcode/stripy
5 ---> https://github.com/pycroscopy/pyUSID
5 ---> https://github.com/jjnp/dss20-ue1
5 ---> https://github.com/pytroll/satpy
6 ---> https://github.com/psambit9791/jDSP
5 ---> https://github.com/pypeit/PypeIt
5 ---> https://github.com/MPI-Dortmund/sphire_classes_autoselect
5 ---> https://github.com/PaulScotti/educortex
2 ---> https://github.com/adaerr/pendent-drop
5 ---> https://github.com/simpeg-research/Astic-2020-JointInversion
6 ---> https://github.com/cgvwzq/polca
6 ---> https://github.com/davidenunes/jnetwork
14 ---> htt

6 ---> https://github.com/VIDA-NYU/domain-discovery-d4
14 ---> https://github.com/SUNCAT-Center/CatLearn
5 ---> https://github.com/fastread/src
7 ---> https://github.com/lanecodes/cymod
5 ---> https://github.com/mancellin/capytaine
6 ---> https://github.com/tferr/Scripts
5 ---> https://github.com/ostwalprasad/LGNpy
5 ---> https://github.com/AstraZeneca-NGS/simple_sv_annotation
10 ---> https://github.com/sept08/WebToys
5 ---> https://github.com/JiaweiZhuang/ipm_util
5 ---> https://github.com/SynthSys/pyOmeroUpload
5 ---> https://github.com/adamewing/tldr
9 ---> https://github.com/JustinGOSSES/wellio.js
5 ---> https://github.com/purdue-cap/DryadSynth
8 ---> https://github.com/ph463/Gygax
6 ---> https://github.com/mayconbordin/cdr-gen
5 ---> https://github.com/Axelrod-Python/Axelrod
4 ---> https://github.com/MIPT-Oulu/solt
5 ---> https://github.com/hibernator11/notebook-texts-metadata
6 ---> https://github.com/nismod/transport
5 ---> https://github.com/scikit-hep/uproot
7 ---> https://git

In [18]:
# Enable pyLDAvis notebook rendering, build the visualisation data for the
# fitted model, and write it to an HTML file.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
html_path = 'lda1.html'
pyLDAvis.save_html(vis, html_path)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
