## GitHub User Data Scraping

In [1]:
import os
import time
import requests
import requests_cache

import pandas as pd

from dotenv import load_dotenv
load_dotenv()

True

### Testing the GitHub Search API

In [2]:
# Smoke test: fetch a single page from the REST user-search endpoint.
url = "https://api.github.com/search/users"

params = {
    # GitHub search-qualifier syntax (REST search API, not GraphQL).
    'q': 'followers:>100 location:Toronto',
    'per_page': 100,  # Largest page size the API accepts.
    'page': 1
}

# Authenticated core-API limit is 5,000 requests per hour; the search
# endpoints have their own, lower per-minute limit.
# Only attach the header when a token is configured: otherwise the literal
# string "token None" would be sent and rejected as bad credentials.
access_token = os.getenv('GITHUB_TOKEN')
headers = {'Authorization': f'token {access_token}'} if access_token else {}
if not access_token:
    print('GITHUB_TOKEN is not set; sending an unauthenticated request.')

# Without a timeout, requests can block forever on a stalled connection.
response = requests.get(url, params=params, headers=headers, timeout=30)
print('URL:', response.url)

if response.status_code == 200:
    data = response.json()
    print('# users:', len(data['items']))
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

URL: https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=1
# users: 100


In [3]:
# `links` is the parsed form of the HTTP Link header; here it carries the
# 'next' and 'last' page URLs used for pagination.
response.links  # feature of requests library

{'next': {'url': 'https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=2',
  'rel': 'next'},
 'last': {'url': 'https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=7',
  'rel': 'last'}}

In [4]:
# Top-level keys of the search response payload.
data.keys()

dict_keys(['total_count', 'incomplete_results', 'items'])

In [5]:
# Inspect one search hit to see which fields are available per user.
data['items'][0]

{'login': 'aneagoie',
 'id': 10776230,
 'node_id': 'MDQ6VXNlcjEwNzc2MjMw',
 'avatar_url': 'https://avatars.githubusercontent.com/u/10776230?v=4',
 'gravatar_id': '',
 'url': 'https://api.github.com/users/aneagoie',
 'html_url': 'https://github.com/aneagoie',
 'followers_url': 'https://api.github.com/users/aneagoie/followers',
 'following_url': 'https://api.github.com/users/aneagoie/following{/other_user}',
 'gists_url': 'https://api.github.com/users/aneagoie/gists{/gist_id}',
 'starred_url': 'https://api.github.com/users/aneagoie/starred{/owner}{/repo}',
 'subscriptions_url': 'https://api.github.com/users/aneagoie/subscriptions',
 'organizations_url': 'https://api.github.com/users/aneagoie/orgs',
 'repos_url': 'https://api.github.com/users/aneagoie/repos',
 'events_url': 'https://api.github.com/users/aneagoie/events{/privacy}',
 'received_events_url': 'https://api.github.com/users/aneagoie/received_events',
 'type': 'User',
 'user_view_type': 'public',
 'site_admin': False,
 'score': 1

### Search for Users

* Toronto users with more than 100 followers

In [6]:
def dynamic_delay(response):
    """Throttle API calls, waiting out the rate-limit window when it is exhausted.

    Args:
        response: Response object whose ``headers`` mapping may contain
            ``X-RateLimit-Remaining`` and ``X-RateLimit-Reset`` (the reset
            value is read as an integer epoch timestamp in seconds).

    If the remaining quota is ``'0'``, sleeps until the reset time plus a
    5 second buffer. Always sleeps 1 second before returning.
    """
    if response.headers.get('X-RateLimit-Remaining') == '0':
        try:
            reset_time = int(response.headers.get('X-RateLimit-Reset'))
        except (TypeError, ValueError):
            # Header missing or malformed: there is no reset time to wait for.
            reset_time = None

        if reset_time is not None:
            # Clamp at zero: once the window has already reset (or the local
            # clock runs ahead) the difference is negative, and time.sleep()
            # raises ValueError for negative durations.
            sleep_time = max(0, reset_time - int(time.time()) + 5)  # Add a buffer of 5s.

            if sleep_time > 0:
                print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds!")
                time.sleep(sleep_time)

    time.sleep(1)  # Sleep for 1s regardless.

In [7]:
printed_message_from_cached = False
# Cache search responses on disk (no expiry configured) so re-running the
# notebook replays them instead of calling the API again.
requests_cache.install_cache('cache/search_users', expire_after=None)

users = []  # All users in Toronto with more than 100 followers.

url = "https://api.github.com/search/users"
params = {
    # GitHub search-qualifier syntax (REST search API, not GraphQL).
    # NOTE: the search API exposes at most 1,000 results per query, so a
    # broader query would be truncated.
    'q': 'followers:>100 location:Toronto',
    'per_page': 100,
    'page': 1
}

# Only attach the header when a token is configured: otherwise the literal
# string "token None" would be sent and rejected as bad credentials.
access_token = os.getenv('GITHUB_TOKEN')
headers = {'Authorization': f'token {access_token}'} if access_token else {}
if not access_token:
    print('GITHUB_TOKEN is not set; requests will be unauthenticated.')

while True:
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, params=params, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch users. Status code: {response.status_code}")
        break

    data = response.json()

    if data.get('incomplete_results'):
        # GitHub sets this flag when the search did not finish server-side.
        print('Warning: GitHub reported incomplete results for', response.url)

    # Keep only the fields the later cells need.
    for user in data.get('items', []):
        users.append({
            'login': user['login'],
            'id': user['id'],
            'url': user['url'],
            'repos_url': user['repos_url']
        })

    if response.from_cache:
        # Report cache use once rather than once per page.
        if not printed_message_from_cached:
            print('Fetched from cache.')
            printed_message_from_cached = True
    else:
        print("Fetched from API:", response.url)
        dynamic_delay(response)  # Throttle only real network calls.

    # Check if there are more pages.
    if 'next' not in response.links:
        break

    params['page'] += 1

print('# of users:', len(users))

Fetched from cache.
# of users: 680


In [8]:
# Headers of the last search response the loop received (the final page).
dict(response.headers)

{'Access-Control-Allow-Origin': '*',
 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset',
 'Cache-Control': 'no-cache',
 'Content-Encoding': 'gzip',
 'Content-Security-Policy': "default-src 'none'",
 'Content-Type': 'application/json; charset=utf-8',
 'Date': 'Mon, 21 Oct 2024 16:43:35 GMT',
 'Link': '<https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=6>; rel="prev", <https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=1>; rel="first"',
 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin',
 'Server': 'github.com',
 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload',
 'Transfer-Encoding':

In [9]:
# Spot-check the first collected user record.
users[0]

{'login': 'aneagoie',
 'id': 10776230,
 'url': 'https://api.github.com/users/aneagoie',
 'repos_url': 'https://api.github.com/users/aneagoie/repos'}

### Fetch User Data

In [10]:
def clean_company_name(company):
    """Normalise a company string for grouping.

    Surrounding whitespace is trimmed, a single leading ``@`` is removed,
    and the result is upper-cased. Falsy input (``None`` or ``''``) yields
    ``None``.
    """
    if not company:
        return None

    trimmed = company.strip()
    # Drop only the first '@'; anything after it is kept verbatim.
    without_at = trimmed[1:] if trimmed.startswith('@') else trimmed
    return without_at.upper()

In [11]:
printed_message_from_cached = False
# Switch to a separate on-disk cache for the per-user profile responses.
requests_cache.install_cache('cache/users', expire_after=None)

users_data = []

for user in users:
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(user['url'], headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch {user['login']}'s data. Status code: {response.status_code}")
        break

    data = response.json()

    # Keep only the profile fields used in the analysis.
    users_data.append({
        'login': data['login'],
        'name': data['name'],
        'company': clean_company_name(data['company']),
        'location': data['location'],
        'email': data['email'],
        'hireable': data['hireable'],
        'bio': data['bio'],
        'public_repos': data['public_repos'],
        'followers': data['followers'],
        'following': data['following'],
        'created_at': data['created_at'],
    })

    if response.from_cache:
        # Report cache use once rather than once per user.
        if not printed_message_from_cached:
            print('Fetched from cache.')
            printed_message_from_cached = True
    else:
        print('Fetched from API:', response.url)
        dynamic_delay(response)  # Throttle only real network calls.

# The loop stops at the first failed request; make a truncated result visible
# instead of letting it flow silently into the saved CSV.
if len(users_data) != len(users):
    print(f"Warning: fetched {len(users_data)} of {len(users)} user profiles.")

users_data[0]

Fetched from cache.


{'login': 'aneagoie',
 'name': 'Andrei Neagoie',
 'company': None,
 'location': 'Toronto, Canada',
 'email': None,
 'hireable': True,
 'bio': 'Senior Software Dev turned Instructor. Founder of zerotomastery.io',
 'public_repos': 145,
 'followers': 10268,
 'following': 1,
 'created_at': '2015-01-30T17:05:43Z'}

In [12]:
# Number of user profiles collected.
len(users_data)

680

In [13]:
# Headers of the most recent user-profile response.
dict(response.headers)

{'Access-Control-Allow-Origin': '*',
 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset',
 'Cache-Control': 'private, max-age=60, s-maxage=60',
 'Content-Encoding': 'gzip',
 'Content-Security-Policy': "default-src 'none'",
 'Content-Type': 'application/json; charset=utf-8',
 'Date': 'Mon, 21 Oct 2024 17:01:38 GMT',
 'ETag': 'W/"dbbd1c8be0f168814915dde0d8fbc5b716d3913a61c224d395043082b7f75e8b"',
 'Last-Modified': 'Fri, 18 Oct 2024 22:22:40 GMT',
 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin',
 'Server': 'github.com',
 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload',
 'Transfer-Encoding': 'chunked',
 'Vary': 'Accept, Authorization, Cookie, X-GitHub-OTP,Accept-Encod

#### Save user data to .csv

In [14]:
# One row per user profile; preview the first rows.
# NOTE(review): the preview shows names and public e-mail addresses -
# consider clearing this output before sharing the notebook.
users_df = pd.DataFrame(users_data)
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30T17:05:43Z
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5804,11,2015-01-18T00:01:02Z
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4917,68,2016-11-28T04:22:39Z
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,89,4533,301,2010-04-05T14:40:12Z
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4020,11,2016-07-13T21:01:21Z


In [15]:
# Each user counts for at most 500 repositories (the repo scrape reads at most
# 5 pages of 100 per user), so cap the per-user counts before summing.
capped_repo_counts = users_df['public_repos'].clip(upper=500)
total_number_of_repos_expected = capped_repo_counts.sum()
print(f"Total Expected Repos: {total_number_of_repos_expected}")

Total Expected Repos: 54875


In [23]:
# Sanity check: count the distinct logins in the table.
n_unique_users = users_df['login'].nunique(dropna=False)
print(f"# of unique users: {n_unique_users}")

# of unique users: 680


In [17]:
# Create the output directory first: to_csv fails if it does not exist.
os.makedirs('data', exist_ok=True)
users_df.to_csv('data/users.csv', index=False)

### Fetch Users' Repo Data

In [18]:
printed_message_from_cached = False
# Switch to a separate on-disk cache for the repository listings.
requests_cache.install_cache('cache/repos', expire_after=None)

# Page cap per user: 5 pages x 100 per page keeps at most the 500 most
# recently pushed repositories for each user.
MAX_PAGES_PER_USER = 5

repos_data = []

for user in users:
    repos = []
    params = {
        'sort': 'pushed',
        'direction': 'desc',
        'per_page': 100,
        'page': 1,
    }
    while True:
        # Without a timeout, requests can block forever on a stalled connection.
        response = requests.get(user['repos_url'], params=params, headers=headers, timeout=30)

        if response.status_code != 200:
            print(f"Failed to fetch { user['login'] }'s repo data. Status code: {response.status_code}")
            break

        data = response.json()
        for repo in data:
            repos.append({
                'login': user['login'],
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                # 'license' is empty for repos without a detected license.
                'license_name': repo['license']['key'] if repo['license'] else None,
            })

        if response.from_cache:
            # Report cache use once rather than once per page.
            if not printed_message_from_cached:
                print('Fetched from cache.')
                printed_message_from_cached = True
        else:
            print('Fetched from API:', response.url)
            dynamic_delay(response)  # Throttle only real network calls.

        # Stop at the last page or once the per-user page cap is reached.
        if ('next' not in response.links) or (params['page'] >= MAX_PAGES_PER_USER):
            break

        params['page'] += 1

    # Pages fetched before a failure are still kept for this user.
    repos_data.extend(repos)

print('# of Repos:', len(repos_data))

Fetched from cache.
# of Repos: 54875


In [19]:
# Compare the number of collected repos with the expected total computed
# from the users table.
print('Pass repos sanity check:', total_number_of_repos_expected == len(repos_data))

Pass repos sanity check: True


In [20]:
# Spot-check the first collected repository record.
repos_data[0]

{'login': 'aneagoie',
 'full_name': 'aneagoie/ztm-extension',
 'created_at': '2023-12-15T13:32:57Z',
 'stargazers_count': 3,
 'watchers_count': 3,
 'language': 'JavaScript',
 'has_projects': True,
 'has_wiki': False,
 'license_name': 'mit'}

#### Save repo data to .csv

In [21]:
# One row per repository; preview the first rows.
repos_df = pd.DataFrame(repos_data)
repos_df.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,aneagoie,aneagoie/ztm-extension,2023-12-15T13:32:57Z,3,3,JavaScript,True,False,mit
1,aneagoie,aneagoie/ML-Notes,2019-09-28T02:31:10Z,60,60,Jupyter Notebook,True,True,
2,aneagoie,aneagoie/smart-brain-boost-api-dockerized,2018-04-12T18:44:49Z,55,55,JavaScript,True,True,
3,aneagoie,aneagoie/smart-brain-boost-api,2018-04-09T18:46:06Z,14,14,JavaScript,True,True,
4,aneagoie,aneagoie/smart-brain-boost-lambda,2018-04-15T17:02:27Z,20,20,JavaScript,True,True,


In [22]:
# Create the output directory first: to_csv fails if it does not exist.
os.makedirs('data', exist_ok=True)
# Write without the positional index, matching how users.csv is saved;
# otherwise pandas emits an extra unnamed leading column.
repos_df.to_csv('data/repositories.csv', index=False)

## Questions