In [148]:
import markdown
import urllib2
import re
import pandas as pd
from markdown.treeprocessors import Treeprocessor
from markdown.extensions import Extension
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
URLS = ['https://raw.githubusercontent.com/sindresorhus/awesome/master/readme.md', 
        'https://raw.githubusercontent.com/bayandin/awesome-awesomeness/master/README.md']





In [131]:
BANS = ["Awesome-Awesomeness.ZEEF.com", "awesome-awesomeness"]
BAN_LINKS = [
    "https://github.com/bayandin/awesome-awesomeness",
    "https://github.com/sindresorhus/awesome"
]

class CodeBlockTweaker(Treeprocessor):
    def __init__(self, md):
        self.cur_level = []
        self.last_a = False
        self.output = []
        super(CodeBlockTweaker, self).__init__(md)
    def run(self, root):
        return self.setClass(root)
    def setClass(self, element):
        for child in element:
            if child.tag in ["a", "p", "li"]  and child.text:
                self.last_a = child.text
            if child.tag == "a":
                #print(self.cur_level, child.text, child.get("href"))
                #Out put only github repos
                
                href = child.get("href")                    
                is_repo = re.match('^https://github.com/\w+\/\w+', href)
                if is_repo is not None and href not in BAN_LINKS:                    
                    self.output.append((self.cur_level + [child.text], href))
            elif child.tag in ["h2", "h3"]:
                if  self.cur_level:
                    self.cur_level.pop()
                self.cur_level.append(child.text)
                self.last_a = False
            elif child.tag == "ul" and self.last_a and self.last_a not in BANS:
                self.cur_level.append(self.last_a)
            # run recursively on children    
            child = self.setClass(child)
            
            if child.tag == "ul" and self.last_a and self.cur_level:                
                self.cur_level.pop()
        return element



class CodeBlockExtension(Extension):
    def extendMarkdown(self, md, md_globals):
        self.tweaker = CodeBlockTweaker(md)
        md.treeprocessors.add('codeblocktweaker', self.tweaker, '_end')

ext = CodeBlockExtension()
        

    

In [79]:
def parse_md(URL, ext, debug = False):
    # Assign the open file to a variable
    webFile = urllib2.urlopen(URL)
    # Read the file contents to a variable
    file_contents = webFile.read()
    
    html = markdown.markdown(text=unicode(file_contents, 'utf-8'), extensions=[ext])
    if debug:
        display(HTML(html))
    return ext.tweaker.output
    




## Parse awesome-awesome repos

In [85]:
res = []
for URL in URLS:
    out = pd.DataFrame(parse_md(URL, ext), columns=["categories", "repo_url"])
    out["source"] = URL
    res.append(out)

all_awe = pd.concat(res, ignore_index=True)
orig_len = all_awe.shape[0]
all_awe.drop_duplicates(subset=["repo_url"], inplace=True)
if all_awe.shape[0] < orig_len:
    print("Drop %s items" % (orig_len - all_awe.shape[0]))
all_awe["repo"] = all_awe["repo_url"].str.replace("https://github.com/", "")    

Drop 91 items


In [128]:
def get_readme(repo):
    START = 'https://raw.githubusercontent.com/'
    for filename in ["/master/README.md", "/master/readme.md"]:
        try:
            webFile = urllib2.urlopen(START + repo + filename)
            file_contents = webFile.read()
            return file_contents
        except urllib2.HTTPError, e:
            #print(e)
            if e.code == 404:
                continue
            else:
                print 'We failed with error code - %s.' % e.code
    return None


In [138]:
%%time

res2 = []
for repo in all_awe["repo"].values:
    file_contents = get_readme(repo)
    print("Got content for %s" % repo)
    if file_contents:
        html = markdown.markdown(text=unicode(file_contents, 'utf-8'), extensions=[ext])
        #display(HTML(html))
        out = pd.DataFrame(ext.tweaker.output, columns=["categories", "repo_url"])
        out["source_repo"] = repo
        res2.append(out)
        
repos = pd.concat(res2, ignore_index=True)
repos_len = repos.shape[0]
repos.drop_duplicates(subset=["repo_url"], inplace=True)
if repos.shape[0] < repos_len:
    print("Drop %s items" % (repos_len - repos.shape[0]))
repos["repo"] = repos["repo_url"].str.replace("https://github.com/", "")

Got content for sindresorhus/awesome-nodejs
Got content for dypsilon/frontend-dev-bookmarks
Got content for vsouza/awesome-ios
Got content for JStumpp/awesome-android
Got content for weblancaster/awesome-IoT-hybrid
Got content for sindresorhus/awesome-electron
Got content for busterc/awesome-cordova
Got content for jondot/awesome-react-native
Got content for benoitjadinon/awesome-xamarin
Got content for aleksandar-todorovic/awesome-linux
Got content for Friz-zy/awesome-linux-containers
Got content for iCHAIT/awesome-macOS
Got content for herrbischoff/awesome-osx-command-line
Got content for aharris88/awesome-osx-screensavers
Got content for yenchenlin/awesome-watchos
Got content for deephacks/awesome-jvm
Got content for mailtoharshit/awesome-salesforce
Got content for donnemartin/awesome-aws
Got content for Awesome-Windows/Awesome
Got content for ipfs/awesome-ipfs
Got content for vinkla/awesome-fuse
Got content for ianstormtaylor/awesome-heroku
Got content for sorrycc/awesome-javascrip

In [139]:
repos.drop(["repo_url"], axis=1).to_csv("all_repos.csv", encoding="utf-8")
repos.drop(["repo_url"], axis=1).to_csv("all_repos.csv", encoding="utf-8")

In [142]:
repos

Unnamed: 0,categories,repo_url,source_repo,repo
0,[awesome-npm],https://github.com/sindresorhus/awesome-npm,sindresorhus/awesome-nodejs,sindresorhus/awesome-npm
1,"[Mad science, webtorrent]",https://github.com/feross/webtorrent,sindresorhus/awesome-nodejs,feross/webtorrent
2,"[Mad science, GitTorrent]",https://github.com/cjb/GitTorrent,sindresorhus/awesome-nodejs,cjb/GitTorrent
3,"[Mad science, peerflix]",https://github.com/mafintosh/peerflix,sindresorhus/awesome-nodejs,mafintosh/peerflix
4,"[Mad science, ipfs]",https://github.com/ipfs/js-ipfs,sindresorhus/awesome-nodejs,ipfs/js-ipfs
5,"[Mad science, peerwiki]",https://github.com/mafintosh/peerwiki,sindresorhus/awesome-nodejs,mafintosh/peerwiki
6,"[Mad science, peercast]",https://github.com/mafintosh/peercast,sindresorhus/awesome-nodejs,mafintosh/peercast
7,"[Mad science, turf]",https://github.com/Turfjs/turf,sindresorhus/awesome-nodejs,Turfjs/turf
8,"[Mad science, webcat]",https://github.com/mafintosh/webcat,sindresorhus/awesome-nodejs,mafintosh/webcat
9,"[Mad science, js-git]",https://github.com/creationix/js-git,sindresorhus/awesome-nodejs,creationix/js-git


## Get readme url for each repo

In [98]:
from github import Github
from getpass import getpass

# XXX: Specify your own access token here

ACCESS_TOKEN = getpass(prompt="Enter github token:")
client = Github(ACCESS_TOKEN, per_page=100)

Enter github token:········


In [None]:
import sys

for i, sg in enumerate(stargazers):
    
    # Add "follows" edges between stargazers in the graph if any relationships exist
    try:
        for follower in sg.get_followers():
            if follower.login + '(user)' in g:
                g.add_edge(follower.login + '(user)', sg.login + '(user)', 
                           type='follows')
    except Exception, e: #ssl.SSLError
        print >> sys.stderr, "Encountered an error fetching followers for", \
                             sg.login, "Skipping."
        print >> sys.stderr, e

    print "Processed", i+1, " stargazers. Num nodes/edges in graph", \
          g.number_of_nodes(), "/", g.number_of_edges()
    print "Rate limit remaining", client.rate_limiting

In [101]:
repo_info = client.get_repo("bayandin/awesome-awesomeness")
repo_info

<github.Repository.Repository at 0x7fe147ef6710>

In [104]:
content = repo_info.get_readme()

In [114]:
repo_info.raw_data

{u'archive_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/{archive_format}{/ref}',
 u'assignees_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/assignees{/user}',
 u'blobs_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/git/blobs{/sha}',
 u'branches_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/branches{/branch}',
 u'clone_url': u'https://github.com/bayandin/awesome-awesomeness.git',
 u'collaborators_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/collaborators{/collaborator}',
 u'comments_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/comments{/number}',
 u'commits_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/commits{/sha}',
 u'compare_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/compare/{base}...{head}',
 u'contents_url': u'https://api.github.com/repos/bayandin/awesome-awesomeness/contents/{+path}',
 u'contributors_url': u'https://a

In [110]:
st = repo_info.get_stats_contributors()
st

In [113]:
help(repo_info.get_stats_contributors)

Help on method get_stats_contributors in module github.Repository:

get_stats_contributors(self) method of github.Repository.Repository instance
    :calls: `GET /repos/:owner/:repo/stats/contributors <http://developer.github.com/v3/repos/statistics/#get-contributors-list-with-additions-deletions-and-commit-counts>`_
    :rtype: None or list of :class:`github.StatsContributor.StatsContributor`



In [None]:
ext2 = CodeBlockExtension()
res = []
for URL in URLS:
    out = pd.DataFrame(parse_md(URL, ext), columns=["categories", "repo_url"])
    out["source"] = URL
    res.append(out)

all_awe = pd.concat(res, ignore_index=True)
orig_len = all_awe.shape[0]
all_awe.drop_duplicates(subset=["repo_url"], inplace=True)
if all_awe.shape[0] < orig_len:
    print("Drop %s items" % (orig_len - all_awe.shape[0]))

https://github.com/sindresorhus/awesome-nodejs    
https://github.com/sindresorhus/awesome-nodejs/blob/master/readme.md    

In [94]:
all_awe.shape

(377, 3)

In [152]:
repos.groupby(["source_repo"]).size()

source_repo
0xAX/erlang-bookmarks                                  1
AllThingsSmitty/css-protips                            8
AllThingsSmitty/jquery-tips-everyone-should-know       6
AllThingsSmitty/must-watch-css                         2
AngularClass/awesome-angular2                         19
Awesome-Windows/Awesome                               12
Calinou/awesome-godot                                 56
ChristosChristofidis/awesome-deep-learning            27
ChromeDevTools/awesome-chrome-devtools                19
Codepoints/awesome-codepoints                          1
CodyReichert/awesome-cl                              103
Fr0sT-Brutal/awesome-delphi                           85
Friz-zy/awesome-linux-containers                      18
Granze/awesome-polymer                                20
J2TeaM/awesome-AutoIt                                  7
JStumpp/awesome-android                              130
JustServerless/awesome-serverless                     93
Kickball/awesome-se