In [None]:
import os
import time
import requests
import requests_cache

import numpy as np
import pandas as pd
from scipy.stats import linregress

from dotenv import load_dotenv
load_dotenv()

## GitHub User Data Scraping

### Testing GH Search API

In [None]:
# Smoke-test a single page of the REST Search API before running the full crawl.
url = "https://api.github.com/search/users"

params = {
    'q': 'followers:>100 location:Toronto',  # REST search qualifiers (not GraphQL).
    'per_page': 100,
    'page': 1
}
# The token is read from the environment (.env) so it never appears in the notebook.
# Note: the Search API has its own rate limit (30 requests/minute when authenticated),
# separate from the 5,000 requests/hour core limit.
access_token = os.getenv('GITHUB_TOKEN')
headers = {'Authorization': f'token {access_token}'}

# A timeout keeps the cell from hanging indefinitely if the connection stalls.
response = requests.get(url, params=params, headers=headers, timeout=30)
print('URL:', response.url)

if response.status_code == 200:
    data = response.json()
    print('# users:', len(data['items']))
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

In [None]:
response.links  # feature of requests library

In [None]:
data.keys()

In [None]:
data['items'][0]

### Search for Users

* Toronto users with more than 100 followers

In [None]:
def dynamic_delay(response):
    """Throttle GitHub API calls, waiting out the rate-limit window when exhausted.

    Parameters
    ----------
    response : object with a ``headers`` mapping
        The response of the request that was just made. Its
        ``X-RateLimit-Remaining`` and ``X-RateLimit-Reset`` headers are read.

    Behaviour
    ---------
    * If ``X-RateLimit-Remaining`` is ``'0'``, sleeps until the reset time
      (a Unix timestamp) plus a 5 second buffer. The wait is clamped to zero
      because ``time.sleep`` raises ``ValueError`` for negative durations,
      which happens when the reset time has already passed.
    * If the reset header is absent, falls back to a 60 second wait instead of
      crashing on ``int(None)``.
    * Always sleeps one extra second afterwards to space requests out.
    """
    rate_headers = response.headers

    if rate_headers.get('X-RateLimit-Remaining') == '0':
        reset_header = rate_headers.get('X-RateLimit-Reset')
        if reset_header is None:
            sleep_time = 60  # No reset time advertised: wait a conservative minute.
        else:
            # Add a buffer of 5s; never go below zero.
            sleep_time = max(0, int(reset_header) - int(time.time()) + 5)

        print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds!")
        time.sleep(sleep_time)

    time.sleep(1)  # Sleep for 1s regardless.

In [None]:
# Page through the REST Search API and collect the minimal fields needed to
# fetch each user's profile and repositories later on.
printed_message_from_cached = False
# Responses are cached on disk with no expiry, so re-running the cell does not hit the API again.
requests_cache.install_cache('cache/search_users', expire_after=None)

users = []  # All users in Toronto with more than 100 followers.
expected_total = None  # 'total_count' reported by the API, used to detect truncated results.

url = "https://api.github.com/search/users"
params = {
    'q': 'followers:>100 location:Toronto',  # REST search qualifiers (not GraphQL).
    'per_page': 100,
    'page': 1
}

access_token = os.getenv('GITHUB_TOKEN')
headers = {'Authorization': f'token {access_token}'}

while True:
    # A timeout keeps the loop from hanging indefinitely if the connection stalls.
    response = requests.get(url, params=params, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch users. Status code: {response.status_code}")
        break

    data = response.json()
    expected_total = data.get('total_count', expected_total)

    for user in data.get('items', []):
        users.append({
            'login': user['login'],
            'id': user['id'],
            'url': user['url'],
            'repos_url': user['repos_url']
        })

    if response.from_cache:
        # Announce cache usage once instead of once per page.
        if not printed_message_from_cached:
            print('Fetched from cache.')
            printed_message_from_cached = True
    else:
        print("Fetched from API:", response.url)
        dynamic_delay(response)  # Only throttle when the API was actually hit.

    # Check if there are more pages.
    if 'next' not in response.links:
        break

    params['page'] += 1

print('# of users:', len(users))

# Surface silent truncation (e.g. the loop stopped early or the API capped the results).
if expected_total is not None and len(users) < expected_total:
    print(f"Warning: collected {len(users)} of {expected_total} matching users; results are incomplete.")

In [None]:
dict(response.headers)

In [None]:
users[0]

### Fetch User Data

In [None]:
def clean_company_name(company):
    """Normalise a GitHub profile's company field.

    Surrounding whitespace is trimmed, a single leading ``@`` is removed
    (only the first one), and the result is upper-cased.

    Parameters
    ----------
    company : str or None
        Raw company value from the user profile.

    Returns
    -------
    str or None
        The cleaned name, or ``None`` when the input is missing or nothing is
        left after cleaning (e.g. ``'   '`` or ``'@'``). Returning ``None``
        rather than ``''`` keeps blank entries out of ``value_counts()``.
    """
    if not company:
        return None

    company_name = company.strip()
    if company_name.startswith('@'):
        # Drop only the first '@', then trim again so '@ name' has no leading space.
        company_name = company_name[1:].strip()

    return company_name.upper() or None

In [None]:
# Fetch the full profile of every user found by the search step.
printed_message_from_cached = False
# Responses are cached on disk with no expiry, so re-running the cell does not hit the API again.
requests_cache.install_cache('cache/users', expire_after=None)

users_data = []
failed_logins = []  # Accounts skipped because their profile no longer exists.

for user in users:
    # A timeout keeps the loop from hanging indefinitely if the connection stalls.
    response = requests.get(user['url'], headers=headers, timeout=30)

    if response.status_code == 404:
        # A single vanished account should not abort the whole crawl: skip it and move on.
        print(f"Failed to fetch {user['login']}'s data. Status code: {response.status_code}")
        failed_logins.append(user['login'])
        continue

    if response.status_code != 200:
        # Any other failure (auth, rate limit, server error) would likely repeat, so stop here.
        print(f"Failed to fetch {user['login']}'s data. Status code: {response.status_code}")
        break

    data = response.json()

    users_data.append({
        'login': data['login'],
        'name': data['name'],
        'company': clean_company_name(data['company']),
        'location': data['location'],
        'email': data['email'],
        'hireable': data['hireable'],
        'bio': data['bio'],
        'public_repos': data['public_repos'],
        'followers': data['followers'],
        'following': data['following'],
        'created_at': data['created_at'],
    })

    if response.from_cache:
        # Announce cache usage once instead of once per user.
        if not printed_message_from_cached:
            print('Fetched from cache.')
            printed_message_from_cached = True
    else:
        print('Fetched from API:', response.url)
        dynamic_delay(response)  # Only throttle when the API was actually hit.

# Preview the first record; guard against an empty result so the cell never raises IndexError.
users_data[0] if users_data else None

In [None]:
len(users_data)

In [None]:
dict(response.headers)

#### Save user data to .csv

In [None]:
users_df = pd.DataFrame(users_data)
users_df.head()

In [None]:
# Each user contributes at most 500 repositories to the expected total,
# so cap every user's public repo count at 500 before summing.
capped_repo_counts = users_df['public_repos'].clip(upper=500)
total_number_of_repos_expected = capped_repo_counts.sum()
print(f"Total Expected Repos: {total_number_of_repos_expected}")

In [None]:
# Sanity check: count the distinct logins in the frame (missing values count as one value).
n_unique_users = users_df['login'].nunique(dropna=False)
print(f"# of unique users: {n_unique_users}")

In [None]:
users_df.to_csv('users.csv', index=False)

### Fetch Users' Repo Data

In [None]:
# Fetch each user's public repositories, most recently pushed first.
printed_message_from_cached = False
# Responses are cached on disk with no expiry, so re-running the cell does not hit the API again.
requests_cache.install_cache('cache/repos', expire_after=None)

# 5 pages x 100 repos per page = at most the 500 most recently pushed repos per user.
MAX_PAGES_PER_USER = 5

repos_data = []

for user in users:
    repos = []
    params = {
        'sort': 'pushed',
        'direction': 'desc',
        'per_page': 100,
        'page': 1,
    }
    while True:
        # A timeout keeps the loop from hanging indefinitely if the connection stalls.
        response = requests.get(user['repos_url'], params=params, headers=headers, timeout=30)

        if response.status_code != 200:
            # Give up on this user only; pages already collected for them are kept.
            print(f"Failed to fetch { user['login'] }'s repo data. Status code: {response.status_code}")
            break

        data = response.json()
        for repo in data:
            repos.append({
                'login': user['login'],
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                # 'license' is null for unlicensed repos.
                'license_name': repo['license']['key'] if repo['license'] else None,
            })

        if response.from_cache:
            # Announce cache usage once instead of once per page.
            if not printed_message_from_cached:
                print('Fetched from cache.')
                printed_message_from_cached = True
        else:
            print('Fetched from API:', response.url)
            dynamic_delay(response)  # Only throttle when the API was actually hit.

        # Stop when there are no more pages or the per-user cap is reached.
        if ('next' not in response.links) or (params['page'] >= MAX_PAGES_PER_USER):
            break

        params['page'] += 1

    repos_data.extend(repos)

print('# of Repos:', len(repos_data))

In [None]:
print('Pass repos sanity check:', total_number_of_repos_expected == len(repos_data))

In [None]:
repos_data[0]

#### Save repo data to .csv

In [None]:
repos_df = pd.DataFrame(repos_data)
repos_df.head()

In [None]:
# Write without the positional index, matching how users.csv is saved,
# so the file does not gain a stray unnamed first column.
repos_df.to_csv('repositories.csv', index=False)

## Questions

Q1. Who are the top 5 users in Toronto with the highest number of followers? List their login in order, comma-separated.

In [None]:
users_df.head()

In [None]:
# Five rows with the largest follower counts, highest first.
top_followed = users_df.nlargest(5, 'followers')
top_followed_logins = top_followed['login'].to_list()
','.join(top_followed_logins)

Q2. Who are the 5 earliest registered GitHub users in Toronto? List their login in ascending order of created_at, comma-separated.

In [None]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
users_df.head()

In [None]:
# Sort by registration timestamp and keep that order in the answer: the question
# asks for logins in ascending order of created_at, not in alphabetical order.
oldest_users = users_df.sort_values('created_at', ascending=True).head(5)['login'].to_list()
','.join(oldest_users)

Q3. What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [None]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df.head()

In [None]:
# value_counts() orders licenses by frequency and drops missing values by default.
license_counts = repos_df['license_name'].value_counts()
top_licenses = license_counts.head(3).index.to_list()
','.join(top_licenses)

In [None]:
repos_df['license_name'].isna().sum()

Q4. Which company do the majority of these developers work at?

In [None]:
users_df['company'].value_counts().index[0]

Q5. Which programming language is most popular among these users?

In [None]:
repos_df['language'].value_counts().index[0]

Q6. Which programming language is the second most popular among users who joined after 2020?

In [None]:
# Q6 needs each repository joined to its owner's profile so repos can be
# filtered by the owner's registration date (see `DataFrame.merge` in the pandas docs).

In [None]:
merged = repos_df.merge(users_df, how='inner', on='login', suffixes=('_repo', '_user'))
merged.head()

In [None]:
users_df[users_df['created_at'].dt.year > 2020]['public_repos'].sum()

In [None]:
merged[merged['created_at_user'].dt.year > 2020]['language'].value_counts().index[1]

Q7. Which language has the highest average number of stars per repository?

In [None]:
repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False).index[0]

Q8. Let's define `leader_strength` as `followers / (1 + following)`. Who are the top 5 in terms of `leader_strength`? List their login in order, comma-separated.

In [None]:
strong_leaders_idx = (users_df['followers'] / (1 + users_df['following'])).sort_values(ascending=False).index[:5]

','.join(users_df.loc[strong_leaders_idx, 'login'].to_list())

Q9. What is the correlation between the number of followers and the number of public repositories among users in Toronto?

In [None]:
users_df['followers'].corr(users_df['public_repos']).round(3)

Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [None]:
users_df[['public_repos', 'followers']].isna().sum()

In [None]:
linregress(users_df['public_repos'], users_df['followers']).slope.round(3)

Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [None]:
repos_df[['has_projects', 'has_wiki']].corr().iloc[0, 1].round(3)

Q12. Do hireable users follow more people than those who are not hireable?

In [None]:
users_df['hireable'].value_counts(dropna=False)

In [None]:
users_df[users_df['hireable'] == True]['following'].mean()

In [None]:
users_df[users_df['hireable'] != True]['following'].mean()

In [None]:
# Mean `following` of hireable users minus the mean for everyone else
# (users whose `hireable` is missing fall into the second group).
is_hireable = users_df['hireable'] == True
following = users_df['following']
hireable_mean = following[is_hireable].mean()
other_mean = following[~is_hireable].mean()
(hireable_mean - other_mean).round(3)

Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [None]:
# Keep only users who have a bio, then record its length in characters.
users_with_bio = (
    users_df
    .dropna(subset=['bio'])
    .assign(bio_length=lambda frame: frame['bio'].str.len())
)

In [None]:
# Q13 asks for the correlation between bio length and followers, so report the
# Pearson correlation coefficient (as in Q9) rather than the regression slope.
users_with_bio['bio_length'].corr(users_with_bio['followers']).round(3)