# GitHub挖掘案例

## 1. 通过API访问GitHub

In [3]:
import requests
from getpass import getpass
import json

# The most straightforward approach: ask GitHub's official API for a token.
username = 'Hanxiaoyang' # Your GitHub username
# Prompt for the password at run time so it is never stored in the notebook
# source (the original assigned an empty string and left getpass unused).
password = getpass('GitHub password for %s: ' % username)

# Note that credentials will be transmitted over a secure SSL connection.
# NOTE(review): GitHub has since retired password-based access to the
# /authorizations endpoint -- confirm it still answers; otherwise create a
# personal access token in the GitHub settings page and skip this cell.
url = 'https://api.github.com/authorizations'
note = 'Julyedu Data Mining Class'
post_data = {'scopes':['repo'],'note': note }

response = requests.post(
    url,
    auth = (username, password),
    data = json.dumps(post_data),
    )

# The raw response contains the token: clear this cell's output before
# sharing or committing the notebook.
print("API response: %s" % response.text)
print("")

# A failed request (bad credentials, duplicate note, ...) returns a JSON body
# without a "token" key, so look it up defensively instead of indexing.
token = response.json().get('token')
if token:
    print("Your OAuth token is %s" % token)
else:
    print("No OAuth token returned (HTTP %s)" % response.status_code)

# Go to https://github.com/settings/applications to revoke this token

 API response: {"id":41820802,"url":"https://api.github.com/authorizations/41820802","app":{"name":"Julyedu Data Mining Class","url":"https://developer.github.com/v3/oauth_authorizations/","client_id":"00000000000000000000"},"token":"5ce8736ef7fb912ea8c2dd86ca9f62a44f7a28a2","hashed_token":"dab9b8442aef7f64336d2633fe96d1b0054bd8d8a69cdd3813fcd1eb9e0f0c50","token_last_eight":"4f7a28a2","note":"Julyedu Data Mining Class","note_url":null,"created_at":"2016-07-23T02:09:15Z","updated_at":"2016-07-23T02:09:15Z","scopes":["repo"],"fingerprint":null}

Your OAuth token is 5ce8736ef7fb912ea8c2dd86ca9f62a44f7a28a2


## 2. 直接HTTP请求获取GitHub加星信息

In [4]:
import json
import requests

url = "https://api.github.com/repos/HanXiaoyang/Kaggle_Titanic/stargazers"
response = requests.get(url)

# Pretty-print one stargazer record: the first element of the returned JSON list.
first_stargazer = response.json()[0]
print(json.dumps(first_stargazer, indent=1))
print("")

# Display every HTTP response header as "name => value".
for header_name, header_value in response.headers.items():
    print("%s => %s" % (header_name, header_value))

{
 "following_url": "https://api.github.com/users/Jerusalemsbell/following{/other_user}", 
 "events_url": "https://api.github.com/users/Jerusalemsbell/events{/privacy}", 
 "organizations_url": "https://api.github.com/users/Jerusalemsbell/orgs", 
 "url": "https://api.github.com/users/Jerusalemsbell", 
 "gists_url": "https://api.github.com/users/Jerusalemsbell/gists{/gist_id}", 
 "html_url": "https://github.com/Jerusalemsbell", 
 "subscriptions_url": "https://api.github.com/users/Jerusalemsbell/subscriptions", 
 "avatar_url": "https://avatars.githubusercontent.com/u/11695808?v=3", 
 "repos_url": "https://api.github.com/users/Jerusalemsbell/repos", 
 "received_events_url": "https://api.github.com/users/Jerusalemsbell/received_events", 
 "gravatar_id": "", 
 "starred_url": "https://api.github.com/users/Jerusalemsbell/starred{/owner}{/repo}", 
 "site_admin": false, 
 "login": "Jerusalemsbell", 
 "type": "User", 
 "id": 11695808, 
 "followers_url": "https://api.github.com/users/Jerusalemsbel

## 3. 看看networkx这个库怎么用

In [5]:
import networkx as nx

# Start from an empty directed graph.
g = nx.DiGraph()

# Add a single edge from X to Y.
g.add_edge('X', 'Y')

# Summary statistics for the graph.
print(nx.info(g))
print("")

# List the nodes and the edges.
print("Nodes: %s" % (g.nodes(),))
print("Edges: %s" % (g.edges(),))
print("")

# Attribute dictionaries of the two nodes.
print("X props: %s" % (g.node['X'],))
print("Y props: %s" % (g.node['Y'],))

# Attribute dictionary of the edge X -> Y.
print("X=>Y props: %s" % (g['X']['Y'],))
print("")

# Set a node attribute by hand.
g.node['X']['prop1'] = 'value1'
print("X props: %s" % (g.node['X'],))
print("")

# Set an edge attribute by hand.
g['X']['Y']['label'] = 'label1'
print("X=>Y props: %s" % (g['X']['Y'],))

Name: 
Type: DiGraph
Number of nodes: 2
Number of edges: 1
Average in degree:   0.5000
Average out degree:   0.5000

Nodes: ['Y', 'X']
Edges: [('X', 'Y')]

X props: {}
Y props: {}
X=>Y props: {}

X props: {'prop1': 'value1'}

X=>Y props: {'label': 'label1'}


## 4.对特定的repository进行访问

In [6]:
import os
from getpass import getpass

from github import Github

# OAuth token for the GitHub API. It is read from the GITHUB_TOKEN environment
# variable, or prompted for, so that no credential is stored in the notebook.
# (A token that was ever committed in source should be revoked.)
ACCESS_TOKEN = os.environ.get('GITHUB_TOKEN') or getpass('GitHub OAuth token: ')

# The user and the project to inspect.
USER = 'hanxiaoyang'
REPO = 'Kaggle_Titanic'

client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Materialise every user who starred the repository.
stargazers = list(repo.get_stargazers())
print("总共加星人数 %d" % len(stargazers))

总共加星人数 12


## 5. 构建上述repository的加星信息图

In [7]:
# Add the nodes and edges to the network one at a time.
import networkx as nx

g = nx.DiGraph()

# The repository itself is a node, tagged with its language and owner.
repo_node = repo.name + '(repo)'
g.add_node(repo_node, type='repo', lang=repo.language, owner=user.login)

# Each stargazer becomes a user node with a "gazes" edge pointing at the repo.
for sg in stargazers:
    user_node = sg.login + '(user)'
    g.add_node(user_node, type='user')
    g.add_edge(user_node, repo_node, type='gazes')

## 6. 一些简单的图操作

In [9]:
# A tour of a few simple operations NetworkX offers.

repo_key = 'Kaggle_Titanic(repo)'
user_key = 'Jerusalemsbell(user)'

print(nx.info(g))
print("")

# Node attributes.
print(g.node[repo_key])
print(g.node[user_key])
print("")

# Adjacency (successors with edge attributes) of each node.
print(g[user_key])
print(g[repo_key])
print("")

# Incoming and outgoing edges of the user node.
print(g.in_edges([user_key]))
print(g.out_edges([user_key]))
print("")

# Incoming and outgoing edges of the repository node.
print("入链/边 %s" % (g.in_edges([repo_key]),))
print("出链/边 %s" % (g.out_edges([repo_key]),))

Name: 
Type: DiGraph
Number of nodes: 13
Number of edges: 12
Average in degree:   0.9231
Average out degree:   0.9231

{'lang': u'Jupyter Notebook', 'owner': u'HanXiaoyang', 'type': 'repo'}
{'type': 'user'}

{u'Kaggle_Titanic(repo)': {'type': 'gazes'}}
{}

[]
[('Jerusalemsbell(user)', u'Kaggle_Titanic(repo)')]

入链/边 [(u'whitepaper(user)', 'Kaggle_Titanic(repo)'), (u'Kang-An(user)', 'Kaggle_Titanic(repo)'), (u'wellbeing18(user)', 'Kaggle_Titanic(repo)'), (u'flywilson(user)', 'Kaggle_Titanic(repo)'), (u'TAO-Liang(user)', 'Kaggle_Titanic(repo)'), (u'huyanping(user)', 'Kaggle_Titanic(repo)'), (u'kwafok(user)', 'Kaggle_Titanic(repo)'), (u'Jerusalemsbell(user)', 'Kaggle_Titanic(repo)'), (u'goodluckMrlee(user)', 'Kaggle_Titanic(repo)'), (u'ChengduoZhao(user)', 'Kaggle_Titanic(repo)'), (u'happyjohann(user)', 'Kaggle_Titanic(repo)'), (u'fuhuamosi(user)', 'Kaggle_Titanic(repo)')]
出链/边 []


## 7. 扩展网络，添加这些加星user的follower

In [11]:
# Next, add the followers of the users who starred the repository.

import sys

for i, sg in enumerate(stargazers):

    # Walk this stargazer's followers; a "follows" edge is only recorded when
    # the follower is already a node in the graph.
    try:
        for follower in sg.get_followers():
            follower_node = follower.login + '(user)'
            if follower_node not in g:
                continue
            g.add_edge(follower_node, sg.login + '(user)', type='follows')
    except Exception as e: #ssl.SSLError
        # Best effort: report the failure on stderr and move on to the next user.
        sys.stderr.write(
            "Encountered an error fetching followers for %s Skipping.\n" % sg.login)
        sys.stderr.write("%s\n" % e)

    print("Processed %s  stargazers. Num nodes/edges in graph %s / %s"
          % (i+1, g.number_of_nodes(), g.number_of_edges()))
    print("Rate limit remaining %s" % (client.rate_limiting,))

Processed 1  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4939, 5000)
Processed 2  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4938, 5000)
Processed 3  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4937, 5000)
Processed 4  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4936, 5000)
Processed 5  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4935, 5000)
Processed 6  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4934, 5000)
Processed 7  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4933, 5000)
Processed 8  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4932, 5000)
Processed 9  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4931, 5000)
Processed 10  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining (4930, 5000)
Processed 11  stargazers. Num nodes/edges in graph 13 / 12
Rate limit remaining

## 8. 分析图上新加的这些边

In [12]:
from operator import itemgetter
from collections import Counter


def edge_counts(node):
    """Return (number of out-edges, number of in-edges) of `node` in graph g.

    Returns (0, 0) when `node` is None or not in the graph, so a missing node
    is handled explicitly rather than by the graph library.
    """
    if node is None or node not in g:
        return (0, 0)
    return (len(g.out_edges(node)), len(g.in_edges(node)))


def top_key(counter):
    """Return the most common key of `counter`, or None when it is empty."""
    ranked = counter.most_common(1)
    return ranked[0][0] if ranked else None


# See how many edges were added.
print(nx.info(g))
print("")

# The number of "follows" edges is the difference
follows_edges = [e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows']
print(len(follows_edges))
print("")

# The repository owner is possibly one of the more popular users in this graph.
# The owner is read from the repo node's attributes; the original compared
# against a hard-coded user name that does not occur in this graph.
owner_node = g.node['Kaggle_Titanic(repo)']['owner'] + '(user)'
print(len([e for e in follows_edges if e[1] == owner_node]))
print("")

# Let's examine the number of adjacent edges to each node
print(sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10])
print("")

# Consider the ratio of incoming and outgoing edges for a couple of users with
# high node degrees...

# c: followed user -> number of "follows" edges pointing at that user.
c = Counter([e[1] for e in follows_edges])
# following: user -> number of "follows" edges leaving that user.
following = Counter([e[0] for e in follows_edges])

# A user who follows many but is not followed back by many:
# the user with the most outgoing "follows" edges (0/0 if there are none).
out_count, in_count = edge_counts(top_key(following))
print(out_count)
print(in_count)
print("")

# A user who is followed by many but does not follow back:
# the user with the most incoming "follows" edges (0/0 if there are none).
out_count, in_count = edge_counts(top_key(c))
print(out_count)
print(in_count)
print("")

popular_users = [ (u, f) for (u, f) in c.most_common() if f > 1 ]
print("Number of popular users %d" % len(popular_users))
print("Top 10 popular users: %s" % (popular_users[:10],))

Name: 
Type: DiGraph
Number of nodes: 13
Number of edges: 12
Average in degree:   0.9231
Average out degree:   0.9231

0

0

[(u'Kaggle_Titanic(repo)', 12), (u'whitepaper(user)', 1), (u'Kang-An(user)', 1), (u'flywilson(user)', 1), (u'TAO-Liang(user)', 1), (u'huyanping(user)', 1), (u'kwafok(user)', 1), (u'Jerusalemsbell(user)', 1), (u'goodluckMrlee(user)', 1), (u'ChengduoZhao(user)', 1)]

0
0

0
0

Number of popular users 0
Top 10 popular users: []


## 9. 存储当前分析的结果

In [13]:
# Persist the graph built so far, so the analysis can resume without
# re-querying the GitHub API.
nx.write_gpickle(g, "github.gpickle.1")

# To restore it later, use the same path that was written above
# (gpickle files are pickles: only load files you trust):
# import networkx as nx
# g = nx.read_gpickle("github.gpickle.1")

## 10. 添加加星的代码项目作为图的边

In [14]:
# For every stargazer, add the repositories they have starred as repo nodes,
# each linked from the user by a "gazes" edge.

MAX_REPOS = 500

for i, sg in enumerate(stargazers):
    print(sg.login)
    try:
        for starred in sg.get_starred()[:MAX_REPOS]: # Slice to avoid supernodes
            # NOTE(review): nodes are keyed by the bare repository name, so
            # same-named repositories of different owners share one node.
            starred_node = starred.name + '(repo)'
            g.add_node(starred_node, type='repo', lang=starred.language,
                       owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred_node, type='gazes')
    except Exception as e: #ssl.SSLError:
        # Best effort: skip this user, but report what went wrong instead of
        # silently discarding the exception.
        print("Encountered an error fetching starred repos for %s Skipping." % sg.login)
        print(e)

    print("Processed %s stargazers' starred repos" % (i+1))
    print("Num nodes/edges in graph %s / %s" % (g.number_of_nodes(), g.number_of_edges()))
    print("Rate limit %s" % (client.rate_limiting,))

Jerusalemsbell
Processed 1 stargazers' starred repos
Num nodes/edges in graph 25 / 24
Rate limit (4927, 5000)
kwafok
Processed 2 stargazers' starred repos
Num nodes/edges in graph 57 / 57
Rate limit (4926, 5000)
whitepaper
Processed 3 stargazers' starred repos
Num nodes/edges in graph 443 / 450
Rate limit (4922, 5000)
happyjohann
Processed 4 stargazers' starred repos
Num nodes/edges in graph 519 / 529
Rate limit (4921, 5000)
flywilson
Processed 5 stargazers' starred repos
Num nodes/edges in graph 522 / 533
Rate limit (4920, 5000)
goodluckMrlee
Processed 6 stargazers' starred repos
Num nodes/edges in graph 531 / 547
Rate limit (4919, 5000)
huyanping
Processed 7 stargazers' starred repos
Num nodes/edges in graph 985 / 1033
Rate limit (4914, 5000)
Kang-An
Processed 8 stargazers' starred repos
Num nodes/edges in graph 991 / 1040
Rate limit (4913, 5000)
TAO-Liang
Processed 9 stargazers' starred repos
Num nodes/edges in graph 1068 / 1135
Rate limit (4912, 5000)
ChengduoZhao
Processed 10 star

In [15]:
# Save the graph again now that the starred repositories have been added.
nx.write_gpickle(g, "github.gpickle.2")

# To restore it later, use the same path that was written above
# (gpickle files are pickles: only load files you trust):
#import networkx as nx
#g = nx.read_gpickle("github.gpickle.2")

## 11. 查询扩展后的图：热门项目、用户收藏的项目与感兴趣的编程语言

In [24]:
from operator import itemgetter

print(nx.info(g))
print("")

# Collect every repository node from the graph that was just extended.
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Most popular repositories: repo nodes ranked by in-degree.
print("热门的代码项目/repositories")
repo_in_degrees = [(n, d)
                   for (n, d) in g.in_degree_iter()
                       if g.node[n]['type'] == 'repo']
print(sorted(repo_in_degrees, key=itemgetter(1), reverse=True)[:10])
print("")

# Repositories one particular user points at through "gazes" edges.
gazed_by_user = [n
                 for n in g['huyanping(user)']
                     if g['huyanping(user)'][n]['type'] == 'gazes']

# Projects gazed at by a user
print("huyanping收藏的代码项目/Respositories")
print([(n, g.node[n]['lang']) for n in gazed_by_user])
print("")

# Programming languages for each user: distinct languages of those projects.
print("huyanping感兴趣的编程语言")
print(list(set([g.node[n]['lang'] for n in gazed_by_user])))
print("")

# Very active users: user nodes with more than 300 outgoing edges.
print("超级节点，出链非常多")
user_out_counts = [(n, len(g.out_edges(n)))
                   for n in g.nodes_iter()
                       if g.node[n]['type'] == 'user']
print(sorted([pair for pair in user_out_counts if pair[1] > 300],
             key=itemgetter(1), reverse=True))

Name: 
Type: DiGraph
Number of nodes: 1437
Number of edges: 1548
Average in degree:   1.0772
Average out degree:   1.0772

热门的代码项目/repositories
[(u'Kaggle_Titanic(repo)', 12), (u'mxnet(repo)', 5), (u'tensorflow(repo)', 5), (u'leetcode(repo)', 4), (u'awesome-machine-learning(repo)', 4), (u'scikit-learn(repo)', 4), (u'Qix(repo)', 4), (u'The-Art-Of-Programming-By-July(repo)', 3), (u'keras(repo)', 3), (u'data-science-ipython-notebooks(repo)', 3)]

huyanping收藏的代码项目/Respositories
[(u'cronkeep(repo)', u'PHP'), (u'php7cc(repo)', u'PHP'), (u'delete-set(repo)', u'Java'), (u'psr7(repo)', u'PHP'), (u'jgraphx(repo)', u'Java'), (u'JG_Cache(repo)', u'PHP'), (u'the-way-to-go_ZH_CN(repo)', u'Go'), (u'php-jsond(repo)', u'C'), (u'presto(repo)', u'Java'), (u'spring-framework-4-reference(repo)', None), (u'php-rdkafka(repo)', u'C'), (u'node-zk-browser(repo)', u'JavaScript'), (u'angular-websocket(repo)', u'JavaScript'), (u'queues.io(repo)', u'HTML'), (u'hbasedoc_cn(repo)', u'CSS'), (u'weiszfeld-median(repo)'

## 12.丰富一下图，把编程语言加到原有图上

In [37]:
# Iterate over all of the repos, and add edges for programming languages
# for each person in the graph. We'll also add edges back to repos so that
# we have a good point to "pivot" upon.

repos = [n
         for n in g.nodes_iter()
             if g.node[n]['type'] == 'repo']

# The loop variables deliberately avoid the names `repo`, `stargazers` and
# `sg`: those hold the PyGithub objects from the earlier cells, and
# overwriting them here would break those cells when they are re-run.
for repo_node in repos:
    # Repositories without a detected language all share the node "(lang)".
    lang = (g.node[repo_node]['lang'] or "") + "(lang)"

    # Users pointing at this repository through a "gazes" edge.
    repo_gazers = [u
                   for (u, r, d) in g.in_edges_iter(repo_node, data=True)
                      if d['type'] == 'gazes'
                  ]

    for gazer in repo_gazers:
        g.add_node(lang, type='lang')
        g.add_edge(gazer, lang, type='programs')
        g.add_edge(lang, repo_node, type='implements')

## 13.再来一些分析和查询

In [38]:
# Some stats

def users_pointing_at(lang_node):
    """Return the user nodes that have an edge into `lang_node` in graph g."""
    return [u
            for (u, l) in g.in_edges_iter(lang_node)
                if g.node[u]['type'] == 'user']


print(nx.info(g))
print("")

# Which programming languages are present in the graph?
lang_nodes = [n for n in g.nodes_iter() if g.node[n]['type'] == 'lang']
print(lang_nodes)
print("")

# Which languages is this user linked to through "programs" edges?
user_adjacency = g['huyanping(user)']
print([n for n in user_adjacency if user_adjacency[n]['type'] == 'programs'])

# Most popular languages: language nodes ranked by in-degree.
print("最流行的编程语言")
lang_in_degrees = [(n, g.in_degree(n)) for n in lang_nodes]
print(sorted(lang_in_degrees, key=itemgetter(1), reverse=True)[:10])
print("")

python_programmers = users_pointing_at('Python(lang)')
print("用Python的同学数量: %d" % len(python_programmers))
print("")

javascript_programmers = users_pointing_at('JavaScript(lang)')
print("用JavaScript的同学数量: %d" % len(javascript_programmers))
print("")

# Users linked to both JavaScript and Python.
print("用JavaScript和Python的同学数量")
print(len(set(python_programmers) & set(javascript_programmers)))

# Users linked to JavaScript but not to Python.
print("用JavaScript不用Python的同学数量")
print(len(set(javascript_programmers) - set(python_programmers)))

Name: 
Type: DiGraph
Number of nodes: 1481
Number of edges: 3132
Average in degree:   2.1148
Average out degree:   2.1148

[u'Cuda(lang)', u'CoffeeScript(lang)', u'Processing(lang)', u'C(lang)', u'PowerShell(lang)', u'Swift(lang)', u'Go(lang)', u'PHP(lang)', u'OCaml(lang)', u'Tcl(lang)', u'Ruby(lang)', u'C#(lang)', u'Lua(lang)', u'Clojure(lang)', u'VimL(lang)', u'Jupyter Notebook(lang)', u'TypeScript(lang)', u'Perl(lang)', u'Java(lang)', u'HTML(lang)', '(lang)', u'JavaScript(lang)', u'OpenEdge ABL(lang)', u'TeX(lang)', u'ApacheConf(lang)', u'C++(lang)', u'Haskell(lang)', u'Batchfile(lang)', u'Makefile(lang)', u'Matlab(lang)', u'R(lang)', u'GCC Machine Description(lang)', u'Python(lang)', u'Emacs Lisp(lang)', u'CSS(lang)', u'Scheme(lang)', u'FORTRAN(lang)', u'Scala(lang)', u'Objective-C(lang)', u'Shell(lang)', u'Lex(lang)', u'XSLT(lang)', u'Hack(lang)', u'Erlang(lang)']

[u'VimL(lang)', u'TypeScript(lang)', u'Scala(lang)', u'Perl(lang)', u'C(lang)', u'Java(lang)', u'HTML(lang)', u'Lua(l

In [29]:
# Save the graph once more, now including the language nodes and edges.
nx.write_gpickle(g, "github.gpickle.3")

# To restore it later (gpickle files are pickles: only load files you trust):
#import networkx as nx
#g = nx.read_gpickle("github.gpickle.3")