## Import Library

In [1]:
import json
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this 
import matplotlib.pyplot as plt
%matplotlib inline

## Load Data 

variable:  data  format: {repo1: [...] , repo2: [...]  }


In [2]:
input_file ='../data/result_13k.json'
with open(input_file) as f:
    data = json.load(f)
print(f'repos nums: {len(data)}')

need_to_remove = []
for k,v in data.items():
    if 'No dependency' in v:
        need_to_remove.append(k)
print(f'repos which have no dependency files: {len(need_to_remove)}')

for k in need_to_remove:
    del data[k]
print(f'repos with dependency files: {len(data)}')

repos nums: 3012
repos which have no dependency files: 0
repos with dependency files: 3012


## PreProcessing
1. Get distinct dependency file dictionary -> dep_dict
2. Sort dependency file  by frequency.     -> sort_dep

In [3]:
rep_list,dep_list = [],[]
for k,v in data.items():
    rep_list.append(k)
    dep_list.append(v)
    
dep_dict = {}
for deps in data.values():
    for i in deps:
        dep_dict[i] = dep_dict.get(i,0)+1

print(f'Distinct dependency file: {len(dep_dict)}',end='\n\n')
        
sort_dep = sorted(dep_dict.items(),key=lambda x: x[-1], reverse=True)
print(f'Frequent dependency files:  {sort_dep[:20]}')

Distinct dependency file: 15663

Frequent dependency files:  [('numpy', 1536), ('scipy', 982), ('matplotlib', 881), ('pandas', 840), ('pytest', 598), ('sphinx', 433), ('requests', 389), ('six', 339), ('scikit-learn', 296), ('pyyaml', 284), ('pytest-cov', 270), ('tqdm', 268), ('sphinx-rtd-theme', 251), ('ipython', 236), ('coverage', 224), ('python-dateutil', 224), ('seaborn', 205), ('h5py', 197), ('pytz', 197), ('jinja2', 191)]


## Word Embedding
1. Build dependency file index table ->  id2word  format: {0: 'ipython',1: 'jupyter-sphinx',....}
2. Build Corpus   -> corpus    format: [[repo1 info],   [repo2 info]   ]  repo1_info:  [ (dep1_id:fre), xxx  ]
3. Build a LDA model

In [4]:
# based on dep file names , build dep name dictionary
id2word  = corpora.Dictionary(list(data.values()))   # {0: 'emd-signal',1: 'numpy', 2: 'SQLAlchemy', 3: 'aiofiles' ....}

# based on dep name dict and dep names, build corpus
corpus = [id2word.doc2bow(text) for text in list(data.values())] # [[(0, 1), (1, 1)],.....]


print(f'repos info: {corpus[0]}')    #(0, 1)

[[(id2word[id_], freq) for id_, freq in cp] for cp in corpus[:1]]

repos info: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


[[('ipython', 1),
  ('jupyter-sphinx', 1),
  ('nbformat', 1),
  ('nbsphinx', 1),
  ('path-py', 1),
  ('six', 1),
  ('sphinx', 1),
  ('sphinx-hoverxref', 1),
  ('sphinx-rtd-theme', 1)]]

## Build LDA model

In [5]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
pprint(lda_model.print_topics())

[(0,
  '0.013*"flask-sqlalchemy" + 0.011*"flask-login" + '
  '0.008*"com.beust:jcommander" + 0.006*"cors" + 0.006*"flask-script" + '
  '0.006*"flask-migrate" + 0.006*"wtforms" + 0.006*"@types/jest" + '
  '0.005*"flask-wtf" + 0.005*"blinker"'),
 (1,
  '0.026*"beautifulsoup4" + 0.026*"dask" + 0.023*"plotly" + 0.012*"sass" + '
  '0.010*"pyasn1" + 0.010*"coffee-script" + 0.010*"docopt" + '
  '0.008*"addressable" + 0.008*"json5" + 0.008*"ffi"'),
 (2,
  '0.044*"kiwisolver" + 0.024*"com.google.code.gson:gson" + 0.023*"selenium" + '
  '0.019*"pycodestyle" + 0.018*"babel" + 0.018*"alabaster" + 0.016*"q" + '
  '0.016*"snowballstemmer" + 0.014*"imagesize" + 0.011*"pyflakes"'),
 (3,
  '0.105*"numpy" + 0.071*"scipy" + 0.064*"matplotlib" + 0.060*"pandas" + '
  '0.040*"pytest" + 0.023*"sphinx" + 0.021*"scikit-learn" + 0.016*"seaborn" + '
  '0.016*"pytest-cov" + 0.015*"setuptools"'),
 (4,
  '0.011*"semver" + 0.010*"js-yaml" + 0.009*"tox" + 0.008*"inherits" + '
  '0.008*"graphviz" + 0.007*"inflight" + 

In [6]:
# Compute Perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=list(data.values()), dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -8.880674283106055
Coherence Score:  0.6355667452627967


In [7]:
# Show the top 5 words of each topic
for topic in lda_model.print_topics(num_words=5):
    print(topic)

(0, '0.013*"flask-sqlalchemy" + 0.011*"flask-login" + 0.008*"com.beust:jcommander" + 0.006*"cors" + 0.006*"flask-script"')
(1, '0.026*"beautifulsoup4" + 0.026*"dask" + 0.023*"plotly" + 0.012*"sass" + 0.010*"pyasn1"')
(2, '0.044*"kiwisolver" + 0.024*"com.google.code.gson:gson" + 0.023*"selenium" + 0.019*"pycodestyle" + 0.018*"babel"')
(3, '0.105*"numpy" + 0.071*"scipy" + 0.064*"matplotlib" + 0.060*"pandas" + 0.040*"pytest"')
(4, '0.011*"semver" + 0.010*"js-yaml" + 0.009*"tox" + 0.008*"inherits" + 0.008*"graphviz"')
(5, '0.054*"requests" + 0.020*"pyyaml" + 0.019*"certifi" + 0.017*"six" + 0.017*"jinja2"')
(6, '0.022*"org.apache.maven.plugins:maven-compiler-plugin" + 0.020*"junit:junit" + 0.014*"org.apache.maven.plugins:maven-jar-plugin" + 0.013*"org.apache.maven.plugins:maven-surefire-plugin" + 0.012*"org.apache.maven.plugins:maven-javadoc-plugin"')
(7, '0.020*"python-dateutil" + 0.015*"markupsafe" + 0.015*"pyparsing" + 0.015*"decorator" + 0.014*"cycler"')
(8, '0.023*"cartopy" + 0.019*"es

In [8]:
# get the possible of each topic
lda_model.inference(corpus)[0].shape,lda_model.inference(corpus)[0]

((3012, 10),
 array([[0.01560236, 0.07957503, 0.06580058, ..., 0.20069756, 0.0221843 ,
         0.08951548],
        [0.01560236, 0.07957509, 0.06580059, ..., 0.19480723, 0.0221843 ,
         0.08951551],
        [0.01560236, 4.079008  , 0.06580058, ..., 2.8628008 , 0.0221843 ,
         0.08951548],
        ...,
        [0.01560236, 0.07957502, 0.06580058, ..., 0.19474855, 0.0221843 ,
         0.08951548],
        [0.01560236, 0.07957502, 0.06580058, ..., 0.19474855, 0.0221843 ,
         0.08951548],
        [0.01560236, 0.07957502, 0.06580058, ..., 0.19474855, 0.0221843 ,
         0.08951548]], dtype=float32))

In [9]:
# make inference (cluster) for each repo
print('inference ：')
topic_dict = {}
for e, values in enumerate(lda_model.inference(corpus)[0]):
    topic_val = 0
    topic_id = 0
    for tid, val in enumerate(values):
        if val > topic_val:
            topic_val = val
            topic_id = tid
            
    topic_dict[topic_id] = topic_dict.get(topic_id,[])
    topic_dict[topic_id].append(rep_list[e])
    print(topic_id, '--->', rep_list[e])

inference ：
3 ---> https://github.com/AgriculturalModelExchangeInitiative/Crop2ML
5 ---> https://github.com/natcap/natgeo-dams
3 ---> https://github.com/hlgirard/CrystalML
3 ---> https://github.com/houghb/ligpy
3 ---> https://github.com/oschwengers/referenceseeker
3 ---> https://github.com/LPDI-EPFL/trivalent_cocktail
3 ---> https://github.com/CIRA-Pulsars-and-Transients-Group/vcstools
3 ---> https://github.com/usc-isi-i2/kgtk
3 ---> https://github.com/garciagenrique/template_project_escape
2 ---> https://github.com/gwu-libraries/sfm-docker
6 ---> https://github.com/javaparser/javaparser
3 ---> https://github.com/williamjameshandley/anesthetic
3 ---> https://github.com/yardencsGitHub/tweetynet
3 ---> https://github.com/jagalindo/A-Python-QX-implementation
3 ---> https://github.com/jbkinney/mavenn
3 ---> https://github.com/GeoCode-polymtl/Seis_float16
3 ---> https://github.com/sebp/scikit-survival
5 ---> https://github.com/similitude/sumo-simmer
6 ---> https://github.com/GlobalNamesArch

4 ---> https://github.com/machawk1/warcreate
3 ---> https://github.com/materialsvirtuallab/monty
3 ---> https://github.com/kgullikson88/Telluric-Fitter
3 ---> https://github.com/Urban-Meteorology-Reading/SUEWS
3 ---> https://github.com/serazing/xscale
3 ---> https://github.com/pauleve/mpbn
3 ---> https://github.com/ericmjl/flu-gibson
9 ---> https://github.com/MoritzStefaner/ach-ingen-zell
5 ---> https://github.com/MAGIC-nexus/nis-backend
3 ---> https://github.com/pycalphad/scheil
5 ---> https://github.com/vanheeringen-lab/genomepy
3 ---> https://github.com/deparkes/OOMMFTools
3 ---> https://github.com/sarisabban/RamaNet
3 ---> https://github.com/nano-sippe/dispersion
3 ---> https://github.com/hallamlab/pathway2vec
3 ---> https://github.com/SCM-NV/nano-qmflows
3 ---> https://github.com/bburan/NeuroBehavior
1 ---> https://github.com/PhenixCollaboration/web
3 ---> https://github.com/Bubblbu/crawling-framework
5 ---> https://github.com/smarr/ReBench
3 ---> https://github.com/Alerovere/Pale

3 ---> https://github.com/matt-long/xpersist
3 ---> https://github.com/bast/smeshing
5 ---> https://github.com/Scifabric/pybossa
3 ---> https://github.com/ds-wizard/docs
3 ---> https://github.com/abelcarreras/phonolammps
3 ---> https://github.com/jason-zl190/sisr_medical
3 ---> https://github.com/mdshw5/pyfaidx
3 ---> https://github.com/wiebket/delprocess
3 ---> https://github.com/holoviz/datashader
3 ---> https://github.com/ISA-tools/mzml2isa
5 ---> https://github.com/ajefweiss/HelioSat
3 ---> https://github.com/houghb/savvy
7 ---> https://github.com/fnl/gnamed
3 ---> https://github.com/jjgomera/iapws
6 ---> https://github.com/dicom/rtp-connect
5 ---> https://github.com/moonso/genmod
3 ---> https://github.com/zafarali/emdp
3 ---> https://github.com/phydev/trajpy
3 ---> https://github.com/IMMM-SFA/im3py
5 ---> https://github.com/yadage/yadage-schemas
5 ---> https://github.com/karenadam/Mixed-Bandlimited-Time-Encoding
3 ---> https://github.com/spacetelescope/gwcs
3 ---> https://github.c

3 ---> https://github.com/scikit-hep/boost-histogram
7 ---> https://github.com/vsoch/askci
7 ---> https://github.com/boutiques/boutiques
3 ---> https://github.com/nansencenter/nansat
3 ---> https://github.com/cdanielmachado/reframed
3 ---> https://github.com/LightForm-group/matflow
5 ---> https://github.com/Capitains/Hook
3 ---> https://github.com/arokem/ISBI2015
3 ---> https://github.com/a-slide/NanoSnake
7 ---> https://github.com/laplizard/infoplot
3 ---> https://github.com/hugadams/PAME
3 ---> https://github.com/torressa/cspy
3 ---> https://github.com/ConservationInternational/trends.earth
3 ---> https://github.com/OpenChemistry/avogadrolibs
3 ---> https://github.com/ondrolexa/pywerami
9 ---> https://github.com/Sulstice/datacity
5 ---> https://github.com/bootphon/phonemizer
3 ---> https://github.com/chrisgorgo/alleninf
6 ---> https://github.com/opentox/lazar-rest
6 ---> https://github.com/SeqWare/seqware
3 ---> https://github.com/msmbuilder/osprey
5 ---> https://github.com/urschrei/

7 ---> https://github.com/AndrewIOM/global-pollen-project
3 ---> https://github.com/PySCeS/pysces
3 ---> https://github.com/DamCB/tyssue
5 ---> https://github.com/lsmo-epfl/discover-curated-cofs
7 ---> https://github.com/caltechlibrary/eprints2bags
5 ---> https://github.com/proycon/foliapy
5 ---> https://github.com/bird-house/twitcher
5 ---> https://github.com/ganga-devs/ganga
6 ---> https://github.com/kotik-coder/PULsE
5 ---> https://github.com/mozillazg/pinyin-data
3 ---> https://github.com/MicroPasts/EgyptExplorationSocBuhenPottery
3 ---> https://github.com/luispedro/imread
2 ---> https://github.com/PCMSolver/pcmsolver
6 ---> https://github.com/klout/brickhouse
9 ---> https://github.com/Dash-Industry-Forum/dash.js
3 ---> https://github.com/ProjectDrawdown/spatial-aez
9 ---> https://github.com/cytoscape/cytoscape.js-popper
5 ---> https://github.com/ecohealthalliance/pubcrawler
3 ---> https://github.com/underworldcode/stripy
3 ---> https://github.com/pycroscopy/pyUSID
3 ---> https://g

6 ---> https://github.com/SP7-Ritmare/EDI-NG_server
3 ---> https://github.com/abhi1693/yii2-enum
6 ---> https://github.com/rinde/pdptw-dataset-generator
7 ---> https://github.com/CambridgeSemiticsLab/BH_time_collocations
9 ---> https://github.com/erwinkendo/polaruob
6 ---> https://github.com/geneontology/obographs
8 ---> https://github.com/speckleworks/SpeckleCore
3 ---> https://github.com/rjw57/videosequence
3 ---> https://github.com/tylerjereddy/diffusion_analysis_MD_simulations
3 ---> https://github.com/luphysics/PyMODA
3 ---> https://github.com/NatLibFi/Annif
5 ---> https://github.com/adbar/trafilatura
3 ---> https://github.com/ForeverZyh/DEBAR
3 ---> https://github.com/danchubb/CanVar
3 ---> https://github.com/JonathonMSmith/growin
3 ---> https://github.com/SoftwareDevEngResearch/flexWecDesignOpt
3 ---> https://github.com/RubenImhoff/Large_Sample_Nowcasting_Evaluation
6 ---> https://github.com/TreeCmp/TreeCmpWEB
3 ---> https://github.com/TaufiqHassan/acccmip6
3 ---> https://github

3 ---> https://github.com/waqasbhatti/astrobase
3 ---> https://github.com/TECH-UB-24-Programming-DS-Spring-2020/notebooks
3 ---> https://github.com/hidrokit/hidrokit
3 ---> https://github.com/biosustain/gnomic
3 ---> https://github.com/digicademy/AskMoreXtension
3 ---> https://github.com/jnicoleoliveira/SPECData
3 ---> https://github.com/glotzerlab/signac-flow
5 ---> https://github.com/erdc/spt_compute
3 ---> https://github.com/maayane/PhotoManip
9 ---> https://github.com/ufbmi/mdc_tree
3 ---> https://github.com/VlachosGroup/vunits
3 ---> https://github.com/rhshah/iCallSV
3 ---> https://github.com/widdowquinn/SI_Holmes_etal_2020
3 ---> https://github.com/vemomoto/vemomoto
5 ---> https://github.com/nschloe/tikzplotlib
7 ---> https://github.com/msmbuilder/msmbuilder
3 ---> https://github.com/dmolina/shadeils
5 ---> https://github.com/spacetelescope/mirage
5 ---> https://github.com/dpla/ingestion
9 ---> https://github.com/bigbadcrad/PUFFIN
3 ---> https://github.com/Ouranosinc/xclim
5 --->

In [10]:
topic_dict

{3: ['https://github.com/AgriculturalModelExchangeInitiative/Crop2ML',
  'https://github.com/hlgirard/CrystalML',
  'https://github.com/houghb/ligpy',
  'https://github.com/oschwengers/referenceseeker',
  'https://github.com/LPDI-EPFL/trivalent_cocktail',
  'https://github.com/CIRA-Pulsars-and-Transients-Group/vcstools',
  'https://github.com/usc-isi-i2/kgtk',
  'https://github.com/garciagenrique/template_project_escape',
  'https://github.com/williamjameshandley/anesthetic',
  'https://github.com/yardencsGitHub/tweetynet',
  'https://github.com/jagalindo/A-Python-QX-implementation',
  'https://github.com/jbkinney/mavenn',
  'https://github.com/GeoCode-polymtl/Seis_float16',
  'https://github.com/sebp/scikit-survival',
  'https://github.com/Biophotonics-COMI/flimview',
  'https://github.com/joshspeagle/brutus',
  'https://github.com/TDAmeritrade/stumpy',
  'https://github.com/habi/acinar-analysis',
  'https://github.com/mbernste/hypothesis-driven-SRA-queries',
  'https://github.com/max

In [11]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'lda.html')