In [1]:
import json
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this 
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
input_file ='result_spark_2.json'
with open(input_file) as f:
    data = json.load(f)
print(f'repos nums: {len(data)}')

need_to_remove = []
for k,v in data.items():
    if 'No dependency' in v:
        need_to_remove.append(k)
print(f'repos which have no dependency files: {len(need_to_remove)}')


for k in need_to_remove:
    del data[k]
print(f'repos with dependency files: {len(data)}')

repos nums: 13796
repos which have no dependency files: 12297
repos with dependency files: 1499


In [4]:
rep_list,dep_list = [],[]
for k,v in data.items():
    rep_list.append(k)
    dep_list.append(v)

In [5]:
dep_dict = {}
for deps in data.values():
    for i in deps:
        dep_dict[i] = dep_dict.get(i,0)+1

print(f'distinct dependency file name: {len(dep_dict)}')
        
sort_dep = sorted(dep_dict.items(),key=lambda x: x[-1], reverse=True)
print(f'frequent dependency files:  {sort_dep[:20]}')


distinct dependency file name: 11238
frequent dependency files:  [('numpy', 764), ('scipy', 494), ('matplotlib', 453), ('pandas', 416), ('pytest', 271), ('requests', 218), ('sphinx', 203), ('six', 167), ('scikit-learn', 159), ('pyyaml', 158), ('tqdm', 147), ('pytest-cov', 123), ('python-dateutil', 112), ('ipython', 111), ('sphinx-rtd-theme', 107), ('pytz', 101), ('pillow', 101), ('seaborn', 101), ('coverage', 100), ('h5py', 100)]


In [6]:
# based on dep file names , build dep name dictionary
id2word  = corpora.Dictionary(list(data.values()))   # {0: 'emd-signal',1: 'numpy', 2: 'SQLAlchemy', 3: 'aiofiles' ....}

# based on dep name dict and dep names, build corpus
corpus = [id2word.doc2bow(text) for text in list(data.values())] # [[(0, 1), (1, 1)],.....]

# Human readable format of corpus (term-frequency)
# [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]
corpus[0],[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

([(0, 1), (1, 1)], [[('emd-signal', 1), ('numpy', 1)]])

In [7]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

pprint(lda_model.print_topics())

[(0,
  '0.061*"junit:junit" + '
  '0.044*"org.apache.maven.plugins:maven-compiler-plugin" + '
  '0.027*"org.apache.maven.plugins:maven-jar-plugin" + '
  '0.023*"org.apache.maven.plugins:maven-surefire-plugin" + '
  '0.021*"org.apache.maven.plugins:maven-javadoc-plugin" + '
  '0.020*"org.apache.maven.plugins:maven-source-plugin" + '
  '0.019*"org.slf4j:slf4j-api" + 0.018*"com.google.guava:guava" + '
  '0.017*"commons-io:commons-io" + 0.016*"ch.qos.logback:logback-classic"'),
 (1,
  '0.061*"nose" + 0.035*"netcdf4" + 0.009*"babel-plugin-transform-strict-mode" '
  '+ 0.009*"babel-plugin-transform-es2015-modules-commonjs" + '
  '0.009*"babel-plugin-transform-es2015-modules-amd" + '
  '0.009*"babel-preset-env" + 0.008*"babel-register" + '
  '0.008*"babel-plugin-transform-es2015-classes" + '
  '0.008*"babel-plugin-transform-es2015-block-scoping" + '
  '0.008*"babel-plugin-transform-es2015-block-scoped-functions"'),
 (2,
  '0.013*"obspy" + 0.011*"System.ValueTuple" + 0.000*"MSTest.TestFramewor

In [8]:
# Compute Perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=list(data.values()), dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -14.8664302118617
Coherence Score:  0.6390475117702155


In [9]:
# Show the top 5 words of each topic
for topic in lda_model.print_topics(num_words=5):
    print(topic)

(0, '0.061*"junit:junit" + 0.044*"org.apache.maven.plugins:maven-compiler-plugin" + 0.027*"org.apache.maven.plugins:maven-jar-plugin" + 0.023*"org.apache.maven.plugins:maven-surefire-plugin" + 0.021*"org.apache.maven.plugins:maven-javadoc-plugin"')
(1, '0.061*"nose" + 0.035*"netcdf4" + 0.009*"babel-plugin-transform-strict-mode" + 0.009*"babel-plugin-transform-es2015-modules-commonjs" + 0.009*"babel-plugin-transform-es2015-modules-amd"')
(2, '0.013*"obspy" + 0.011*"System.ValueTuple" + 0.000*"MSTest.TestAdapter" + 0.000*"MSTest.TestFramework" + 0.000*"System.Collections"')
(3, '0.011*"fs.realpath" + 0.011*"has-flag" + 0.011*"glob" + 0.010*"graceful-fs" + 0.010*"inflight"')
(4, '0.009*"@babel/preset-env" + 0.009*"react" + 0.009*"@babel/core" + 0.008*"react-dom" + 0.008*"@babel/plugin-transform-modules-commonjs"')
(5, '0.046*"future" + 0.027*"pyproj" + 0.024*"sqlalchemy" + 0.020*"nltk" + 0.017*"django"')
(6, '0.031*"pyqt5" + 0.025*"jekyll" + 0.019*"log-symbols" + 0.017*"nibabel" + 0.016*"

In [10]:
# make inference (cluster) for each repo
print('inference ：')
for e, values in enumerate(lda_model.inference(corpus)[0]):
    topic_val = 0
    topic_id = 0
    for tid, val in enumerate(values):
        if val > topic_val:
            topic_val = val
            topic_id = tid
    print(topic_id, '--->', rep_list[e])

inference ：
7 ---> https://github.com/LRydin/MFDFA
5 ---> https://github.com/Flowminder/FlowKit
7 ---> https://github.com/linhd-postdata/desir
4 ---> https://github.com/DLR-SC/gitlab2prov
7 ---> https://github.com/openearth/aeolis-python
8 ---> https://github.com/Lonero-Team/Decentralized-Internet
5 ---> https://github.com/LegoStormtroopr/django-spaghetti-and-meatballs
7 ---> https://github.com/smarie/python-azureml-client
7 ---> https://github.com/NREL/floris
7 ---> https://github.com/ADicksonLab/wepy
7 ---> https://github.com/M4I-nanoscopy/tpx3-event-localisation
17 ---> https://github.com/myGrid/ruby-ucf
7 ---> https://github.com/andsor/pyfssa
7 ---> https://github.com/arkottke/pykoom
10 ---> https://github.com/tardis-sn/tardis
7 ---> https://github.com/IaPCS/gmsh-exodus-converter
7 ---> https://github.com/atait/lytest
7 ---> https://github.com/ocsmit/rindcalc
5 ---> https://github.com/mpostol/TP
8 ---> https://github.com/ElektraInitiative/libelektra
7 ---> https://github.com/will-r

7 ---> https://github.com/feelpp/feelpp
7 ---> https://github.com/seclab-ucr/UBITect
7 ---> https://github.com/SuLab/scheduled-bots
7 ---> https://github.com/mjlaine/eppes
7 ---> https://github.com/alpha-xone/xbbg
7 ---> https://github.com/IA-Cardiologia-husa/VHD_NLP
10 ---> https://github.com/ahamilton144/hamilton-2020-managing-financial-risk-tradeoffs-for-hydropower
8 ---> https://github.com/MonashBioinformaticsPlatform/laxy
7 ---> https://github.com/Ptrskay3/PySprint
7 ---> https://github.com/gaojun0816/code_access_finder
8 ---> https://github.com/LinkedDataFragments/Server.js
8 ---> https://github.com/lgsvl/simulator
7 ---> https://github.com/foerstner-lab/GRADitude
14 ---> https://github.com/BritishMuseumDH/britishMuseumFacesDetection
7 ---> https://github.com/mattpitkin/psrqpy
7 ---> https://github.com/cbirdferrer/collatrix
7 ---> https://github.com/spacetelescope/pysiaf
7 ---> https://github.com/jag1g13/pycgtool
4 ---> https://github.com/jtmccr1/figtreejs-react
7 ---> https://gi

7 ---> https://github.com/fpavogt/fcmaker
8 ---> https://github.com/falafeljan/recogito-user-testing
7 ---> https://github.com/CalebBell/fluids
7 ---> https://github.com/ImperialCollegeLondon/acoustics-db
19 ---> https://github.com/mfroeling/QMRITools
7 ---> https://github.com/tqbl/ood_audio
4 ---> https://github.com/delphi-hub/delphi-webapp
7 ---> https://github.com/taishi-i/nagisa
7 ---> https://github.com/slimgroup/Azure2019
8 ---> https://github.com/afilipanog/afilipanog.github.io
7 ---> https://github.com/samapriya/Sat-Pipeline-CLI
0 ---> https://github.com/shah314/graphcoloring
7 ---> https://github.com/YeoLab/gscripts
17 ---> https://github.com/FitzwilliamMuseum/ahrc-linking-islands
7 ---> https://github.com/cern-fts/webfts
10 ---> https://github.com/brinkmanlab/feature_merge
19 ---> https://github.com/dohalloran/phylo-node
7 ---> https://github.com/IMTtugraz/PyQMRI
7 ---> https://github.com/openearth/aeolis
10 ---> https://github.com/obreitwi/py-veer
14 ---> https://github.com/

7 ---> https://github.com/ngcrawford/CloudForest
8 ---> https://github.com/sdrammis/Friedman-Hueske-2020
17 ---> https://github.com/trustedci/OSCRP
7 ---> https://github.com/pyxem/orix
0 ---> https://github.com/thorstenwagner/ij-nl-means
7 ---> https://github.com/BartoszBartmanski/StoSpa2
7 ---> https://github.com/bartongroup/yanosim
10 ---> https://github.com/ebmdatalab/covid_trials_tracker-covid
7 ---> https://github.com/pescadores/pescador
7 ---> https://github.com/annayqho/TheCannon
0 ---> https://github.com/jyrkioraskari/IFCtoLBD
7 ---> https://github.com/jdiasn/McRadar
0 ---> https://github.com/lifs-tools/jgoslin
7 ---> https://github.com/cmbant/getdist
7 ---> https://github.com/CESNET/perun-simplesamlphp-module
8 ---> https://github.com/gasparl/citapp_mobile
10 ---> https://github.com/TobiasWeigel/lapis
8 ---> https://github.com/devopsbase/devopsbase
17 ---> https://github.com/hdcaicyt/Relacion-de-un-viaje
7 ---> https://github.com/epcarraway/capstone
14 ---> https://github.com/

In [11]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'lda.html')