In [148]:
import markdown
import urllib2
import re
import pandas as pd
from markdown.treeprocessors import Treeprocessor
from markdown.extensions import Extension
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
URLS = ['https://raw.githubusercontent.com/sindresorhus/awesome/master/readme.md', 
        'https://raw.githubusercontent.com/bayandin/awesome-awesomeness/master/README.md']





In [131]:
BANS = ["Awesome-Awesomeness.ZEEF.com", "awesome-awesomeness"]
BAN_LINKS = [
    "https://github.com/bayandin/awesome-awesomeness",
    "https://github.com/sindresorhus/awesome"
]

class CodeBlockTweaker(Treeprocessor):
    def __init__(self, md):
        self.cur_level = []
        self.last_a = False
        self.output = []
        super(CodeBlockTweaker, self).__init__(md)
    def run(self, root):
        return self.setClass(root)
    def setClass(self, element):
        for child in element:
            if child.tag in ["a", "p", "li"]  and child.text:
                self.last_a = child.text
            if child.tag == "a":
                #print(self.cur_level, child.text, child.get("href"))
                #Out put only github repos
                
                href = child.get("href")                    
                is_repo = re.match('^https://github.com/\w+\/\w+', href)
                if is_repo is not None and href not in BAN_LINKS:                    
                    self.output.append((self.cur_level + [child.text], href))
            elif child.tag in ["h2", "h3"]:
                if  self.cur_level:
                    self.cur_level.pop()
                self.cur_level.append(child.text)
                self.last_a = False
            elif child.tag == "ul" and self.last_a and self.last_a not in BANS:
                self.cur_level.append(self.last_a)
            # run recursively on children    
            child = self.setClass(child)
            
            if child.tag == "ul" and self.last_a and self.cur_level:                
                self.cur_level.pop()
        return element



class CodeBlockExtension(Extension):
    def extendMarkdown(self, md, md_globals):
        self.tweaker = CodeBlockTweaker(md)
        md.treeprocessors.add('codeblocktweaker', self.tweaker, '_end')

ext = CodeBlockExtension()
        

    

In [79]:
def parse_md(URL, ext, debug = False):
    # Assign the open file to a variable
    webFile = urllib2.urlopen(URL)
    # Read the file contents to a variable
    file_contents = webFile.read()
    
    html = markdown.markdown(text=unicode(file_contents, 'utf-8'), extensions=[ext])
    if debug:
        display(HTML(html))
    return ext.tweaker.output
    




## Parse awesome-awesome repos

In [85]:
res = []
for URL in URLS:
    out = pd.DataFrame(parse_md(URL, ext), columns=["categories", "repo_url"])
    out["source"] = URL
    res.append(out)

all_awe = pd.concat(res, ignore_index=True)
orig_len = all_awe.shape[0]
all_awe.drop_duplicates(subset=["repo_url"], inplace=True)
if all_awe.shape[0] < orig_len:
    print("Drop %s items" % (orig_len - all_awe.shape[0]))
all_awe["repo"] = all_awe["repo_url"].str.replace("https://github.com/", "")    

Drop 91 items


In [128]:
def get_readme(repo):
    START = 'https://raw.githubusercontent.com/'
    for filename in ["/master/README.md", "/master/readme.md"]:
        try:
            webFile = urllib2.urlopen(START + repo + filename)
            file_contents = webFile.read()
            return file_contents
        except urllib2.HTTPError, e:
            #print(e)
            if e.code == 404:
                continue
            else:
                print 'We failed with error code - %s.' % e.code
    return None


In [138]:
%%time

res2 = []
for repo in all_awe["repo"].values:
    file_contents = get_readme(repo)
    print("Got content for %s" % repo)
    if file_contents:
        html = markdown.markdown(text=unicode(file_contents, 'utf-8'), extensions=[ext])
        #display(HTML(html))
        out = pd.DataFrame(ext.tweaker.output, columns=["categories", "repo_url"])
        out["source_repo"] = repo
        res2.append(out)
        
repos = pd.concat(res2, ignore_index=True)
repos_len = repos.shape[0]
repos.drop_duplicates(subset=["repo_url"], inplace=True)
if repos.shape[0] < repos_len:
    print("Drop %s items" % (repos_len - repos.shape[0]))
repos["repo"] = repos["repo_url"].str.replace("https://github.com/", "")

Got content for sindresorhus/awesome-nodejs
Got content for dypsilon/frontend-dev-bookmarks
Got content for vsouza/awesome-ios
Got content for JStumpp/awesome-android
Got content for weblancaster/awesome-IoT-hybrid
Got content for sindresorhus/awesome-electron
Got content for busterc/awesome-cordova
Got content for jondot/awesome-react-native
Got content for benoitjadinon/awesome-xamarin
Got content for aleksandar-todorovic/awesome-linux
Got content for Friz-zy/awesome-linux-containers
Got content for iCHAIT/awesome-macOS
Got content for herrbischoff/awesome-osx-command-line
Got content for aharris88/awesome-osx-screensavers
Got content for yenchenlin/awesome-watchos
Got content for deephacks/awesome-jvm
Got content for mailtoharshit/awesome-salesforce
Got content for donnemartin/awesome-aws
Got content for Awesome-Windows/Awesome
Got content for ipfs/awesome-ipfs
Got content for vinkla/awesome-fuse
Got content for ianstormtaylor/awesome-heroku
Got content for sorrycc/awesome-javascrip

In [139]:
repos.drop(["repo_url"], axis=1).to_csv("all_repos.csv", encoding="utf-8")
repos.drop(["repo_url"], axis=1).to_csv("all_repos.csv", encoding="utf-8")

In [142]:
repos

Unnamed: 0,categories,repo_url,source_repo,repo
0,[awesome-npm],https://github.com/sindresorhus/awesome-npm,sindresorhus/awesome-nodejs,sindresorhus/awesome-npm
1,"[Mad science, webtorrent]",https://github.com/feross/webtorrent,sindresorhus/awesome-nodejs,feross/webtorrent
2,"[Mad science, GitTorrent]",https://github.com/cjb/GitTorrent,sindresorhus/awesome-nodejs,cjb/GitTorrent
3,"[Mad science, peerflix]",https://github.com/mafintosh/peerflix,sindresorhus/awesome-nodejs,mafintosh/peerflix
4,"[Mad science, ipfs]",https://github.com/ipfs/js-ipfs,sindresorhus/awesome-nodejs,ipfs/js-ipfs
5,"[Mad science, peerwiki]",https://github.com/mafintosh/peerwiki,sindresorhus/awesome-nodejs,mafintosh/peerwiki
6,"[Mad science, peercast]",https://github.com/mafintosh/peercast,sindresorhus/awesome-nodejs,mafintosh/peercast
7,"[Mad science, turf]",https://github.com/Turfjs/turf,sindresorhus/awesome-nodejs,Turfjs/turf
8,"[Mad science, webcat]",https://github.com/mafintosh/webcat,sindresorhus/awesome-nodejs,mafintosh/webcat
9,"[Mad science, js-git]",https://github.com/creationix/js-git,sindresorhus/awesome-nodejs,creationix/js-git


## Get readme url for each repo

In [98]:
from github import Github
from getpass import getpass

# XXX: Specify your own access token here

ACCESS_TOKEN = getpass(prompt="Enter github token:")
client = Github(ACCESS_TOKEN, per_page=100)

Enter github token:········


In [None]:
import sys

#loop over repos
#for each repo get contributors
#for each contributor get followers

repo_info.get_contributors()

for i, sg in enumerate(stargazers):
    
    # Add "follows" edges between stargazers in the graph if any relationships exist
    try:
        for follower in sg.get_followers():
            if follower.login + '(user)' in g:
                g.add_edge(follower.login + '(user)', sg.login + '(user)', 
                           type='follows')
    except Exception, e: #ssl.SSLError
        print >> sys.stderr, "Encountered an error fetching followers for", \
                             sg.login, "Skipping."
        print >> sys.stderr, e

    print "Processed", i+1, " stargazers. Num nodes/edges in graph", \
          g.number_of_nodes(), "/", g.number_of_edges()
    print "Rate limit remaining", client.rate_limiting

In [188]:
repo_info = client.get_repo("feross/webtorrent")
repo_info

<github.Repository.Repository at 0x7fe12ebcc850>

In [189]:
content = repo_info.get_readme()

In [203]:
repo_info.description

u':zap: Streaming torrent client for the web'

In [193]:
pl = repo_info.get_contributors()

In [197]:
help(pl)

Help on instance of PaginatedList in module github.PaginatedList:

class PaginatedList(PaginatedListBase)
 |  This class abstracts the `pagination of the API <http://developer.github.com/v3/#pagination>`_.
 |  
 |  You can simply enumerate through instances of this class::
 |  
 |      for repo in user.get_repos():
 |          print repo.name
 |  
 |  You can also index them or take slices::
 |  
 |      second_repo = user.get_repos()[1]
 |      first_repos = user.get_repos()[:10]
 |  
 |  If you want to iterate in reversed order, just do::
 |  
 |      for repo in user.get_repos().reversed:
 |          print repo.name
 |  
 |  And if you really need it, you can explicitely access a specific page::
 |  
 |      some_repos = user.get_repos().get_page(0)
 |      some_other_repos = user.get_repos().get_page(3)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, contentClass, requester, firstUrl, firstParams, headers=None)
 |  
 |  get_page(self, page)
 |  
 |  -------------------------

In [196]:
for p in pl:
    print(p.raw_data)

{u'public_repos': 264, u'site_admin': False, u'subscriptions_url': u'https://api.github.com/users/feross/subscriptions', u'gravatar_id': u'', u'hireable': None, u'id': 121766, u'followers_url': u'https://api.github.com/users/feross/followers', u'following_url': u'https://api.github.com/users/feross/following{/other_user}', u'blog': u'http://feross.org', u'followers': 4665, u'location': u'Mountain View, CA', u'type': u'User', u'email': u'feross@feross.org', u'bio': u'\u2b50\u2b50\u2b50\u2b50\u2b50 Mad Scientist', u'gists_url': u'https://api.github.com/users/feross/gists{/gist_id}', u'company': u"The man can't keep me down!", u'events_url': u'https://api.github.com/users/feross/events{/privacy}', u'html_url': u'https://github.com/feross', u'updated_at': u'2016-08-25T10:59:57Z', u'received_events_url': u'https://api.github.com/users/feross/received_events', u'starred_url': u'https://api.github.com/users/feross/starred{/owner}{/repo}', u'public_gists': 15, u'name': u'Feross Aboukhadijeh', 

In [None]:
ext2 = CodeBlockExtension()
res = []
for URL in URLS:
    out = pd.DataFrame(parse_md(URL, ext), columns=["categories", "repo_url"])
    out["source"] = URL
    res.append(out)

all_awe = pd.concat(res, ignore_index=True)
orig_len = all_awe.shape[0]
all_awe.drop_duplicates(subset=["repo_url"], inplace=True)
if all_awe.shape[0] < orig_len:
    print("Drop %s items" % (orig_len - all_awe.shape[0]))

https://github.com/sindresorhus/awesome-nodejs    
https://github.com/sindresorhus/awesome-nodejs/blob/master/readme.md    

In [94]:
all_awe.shape

(377, 3)

In [172]:
count_repos = repos.groupby(["source_repo"], as_index=False).agg("count").sort_values(["repo"], ascending=False)
repos2 = pd.merge(repos, count_repos, on=["source_repo"])

In [173]:
repos2.columns

Index([u'categories_x', u'repo_url_x', u'source_repo', u'repo_x',
       u'categories_y', u'repo_url_y', u'repo_y'],
      dtype='object')

In [180]:
#Filter out repos with less then 100 items
repos3 = repos2[repos2["categories_y"] > 100]


In [202]:
repos3.rename(columns={
        "categories_x": "categories",
        "repo_url_x": "repo_url",
        "repo_x": "repo",
        "categories_y": "items_count"
    }).drop(["repo_url_y", "repo_y"], axis=1).to_csv("awe_repos_100_in_cats.csv", encoding="utf-8", index=False)