In [1]:
# This script requires github3.py version 0.9.6
# pip install github3.py

import os
import pandas as pd
from github3 import login

GITHUB_API_TOKEN = os.environ.get('GITHUB_API_TOKEN')
gh = login(token=GITHUB_API_TOKEN)
LOCATION = 'Recife'

def queries_list(location=LOCATION):
    return (
        'location:{} followers:>1'.format(location),
        'location:{} followers:1'.format(location),
        'location:{} followers:0'.format(location),
    )

def search_users(query):
    return [u.user for u in gh.search_users(query, sort='followers')]


def fetch_gh_users():
    return [search_users(q) for q in queries_list()]


def create_users_df(batches):
    return pd.concat(
        [pd.DataFrame({'User': u}) for u in batches],
        ignore_index=True)


def followers_iter(gh_user):
    return ','.join([str(u) for u in gh_user.iter_followers()])


def map_f_batch(map_fn, users_iterator):
    return pd.DataFrame({'Followers': [map_fn(u) for u in users_iterator]})


def create_followers_df(batches):
    return pd.concat(
        [map_f_batch(followers_iter, u) for u in batches],
        ignore_index=True)


In [86]:
def repositories_iter(gh_user, repo_type):
    return ','.join([str(u) for u in gh.iter_user_repos(gh_user, type=repo_type)])

def map_r_batch(map_fn, users_iterator):
    return pd.DataFrame({'Owner':[map_fn(u, 'owner') for u in users_iterator],
                         'Member': [map_fn(u, 'member') for u in users_iterator]})

def create_repositories_df(batches):
    return pd.concat(
        [map_r_batch(repositories_iter, u) for u in batches], ignore_index=True)


In [89]:
ghusers_batches = fetch_gh_users()
#
gh_users = create_users_df(ghusers_batches)
# gh_users.to_csv('gh_users.csv', index=False)

gh_followers = create_followers_df(ghusers_batches)
gh_followers.columns = ['Followers']
# gh_followers.to_csv('gh_followers.csv', index=False)


In [91]:
gh_repositories = create_repositories_df(ghusers_batches)
gh_repositories.columns = ['Owner', 'Member']
gh_repositories.to_csv('gh_repositories.csv', index=False)

In [4]:
gh_users.describe()
#

Unnamed: 0,User
count,1950
unique,1936
top,gilmarvitor
freq,2


In [5]:
gh_followers.describe()

Unnamed: 0,Followers
count,1950.0
unique,1036.0
top,
freq,872.0


In [92]:
gh_repositories.describe()

Unnamed: 0,Owner,Member
count,1959.0,1959.0
unique,728.0,1577.0
top,,
freq,1210.0,362.0


In [6]:
gh_users.head()

Unnamed: 0,User
0,tarruda
1,mairatma
2,joselitojunior1
3,marcelcaraciolo
4,luanfonceca


In [93]:
gh_followers.head()

Unnamed: 0,Followers
0,"Sannis,danielmahon,csjaba,FergusRedican,Victor..."
1,"brunocoelho,henvic,eduardolundgren,aperrelli,a..."
2,"renatooliveira,jeffesonmaia,jotaefe,duartefq,J..."
3,"thiagoarrais,brunojm,henriquebastos,macndesign..."
4,"brunohenrique,luizvarela,gladson,lucasbibiano,..."


In [95]:
#gh_repositories.head()

In [7]:
gh_users_followers = gh_users.join(gh_followers)

In [8]:
gh_users_followers.head()

Unnamed: 0,User,Followers
0,tarruda,"Sannis,danielmahon,csjaba,FergusRedican,Victor..."
1,mairatma,"brunocoelho,henvic,eduardolundgren,aperrelli,a..."
2,joselitojunior1,"renatooliveira,jeffesonmaia,jotaefe,duartefq,J..."
3,marcelcaraciolo,"thiagoarrais,brunojm,henriquebastos,macndesign..."
4,luanfonceca,"brunohenrique,luizvarela,gladson,lucasbibiano,..."


In [9]:
gh_users_followers['Followers'].apply(len)

0       5435
1       3816
2       4158
3       3594
4       3403
5       2811
6       2736
7       2761
8       1878
9       1853
10      1808
11      1237
12      1438
13      1354
14      1446
15      1207
16      1261
17      1093
18      1060
19       933
20       991
21       832
22       944
23       810
24       867
25       797
26       740
27       673
28       746
29       642
        ... 
1920       0
1921       0
1922       0
1923       0
1924       0
1925       0
1926       0
1927       0
1928       0
1929       0
1930       0
1931       0
1932       0
1933       0
1934       0
1935       0
1936       0
1937       0
1938       0
1939       0
1940       0
1941       0
1942       0
1943       0
1944       0
1945       0
1946       0
1947       0
1948       0
1949       0
Name: Followers, Length: 1950, dtype: int64

In [10]:
gh_users_followers['nFollowers'] = gh_users_followers['Followers'].apply(lambda x: (len(x) and len(x.split(','))) or len(x))

gh_users_followers

Unnamed: 0,User,Followers,nFollowers
0,tarruda,"Sannis,danielmahon,csjaba,FergusRedican,Victor...",562
1,mairatma,"brunocoelho,henvic,eduardolundgren,aperrelli,a...",351
2,joselitojunior1,"renatooliveira,jeffesonmaia,jotaefe,duartefq,J...",349
3,marcelcaraciolo,"thiagoarrais,brunojm,henriquebastos,macndesign...",332
4,luanfonceca,"brunohenrique,luizvarela,gladson,lucasbibiano,...",303
5,deividazevedo2,"JoaquimCMH,Widancassio,LuizAntonioPS,yhikishim...",268
6,gileno,"brunojm,rafaelcaricio,frenetic,adonescunha,mar...",243
7,simoneas02,"ElsonBarcelos,IsabelaDePaula,sergiockd,douglas...",238
8,filipeximenes,"luisgabriel,renatooliveira,marciobarbosa,anton...",168
9,renatooliveira,"adrianomelo,x8lucas8x,luisgabriel,joselitojuni...",164


In [11]:
gh_users_followers.to_csv('gh_users_followers.csv', index=False, columns=['User', 'nFollowers', 'Followers'])

In [None]:
gh_users_orig = pd.read_csv("gh_users_followers.csv")