In [1]:
import os
import time
import requests
import requests_cache

import numpy as np
import pandas as pd
from scipy.stats import linregress

from dotenv import load_dotenv
load_dotenv()

True

## GitHub User Data Scraping

### Testing GH Search API

In [2]:
url = "https://api.github.com/search/users"

params = {
    'q': 'followers:>100 location:Toronto',  # Uses a query string (GraphQL).
    'per_page': 100,
    'page': 1
}
# Authenticated Users: 5,000 requests per hour.
access_token = os.getenv('GITHUB_TOKEN')
headers = {'Authorization': f'token {access_token}'}

response = requests.get(url, params=params, headers=headers)
print('URL:', response.url)

if response.status_code == 200:
    data = response.json()
    print('# users:', len(data['items']))
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

URL: https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=1
# users: 100


In [3]:
response.links  # feature of requests library

{'next': {'url': 'https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=2',
  'rel': 'next'},
 'last': {'url': 'https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=7',
  'rel': 'last'}}

In [4]:
data.keys()

dict_keys(['total_count', 'incomplete_results', 'items'])

In [5]:
data['items'][0]

{'login': 'aneagoie',
 'id': 10776230,
 'node_id': 'MDQ6VXNlcjEwNzc2MjMw',
 'avatar_url': 'https://avatars.githubusercontent.com/u/10776230?v=4',
 'gravatar_id': '',
 'url': 'https://api.github.com/users/aneagoie',
 'html_url': 'https://github.com/aneagoie',
 'followers_url': 'https://api.github.com/users/aneagoie/followers',
 'following_url': 'https://api.github.com/users/aneagoie/following{/other_user}',
 'gists_url': 'https://api.github.com/users/aneagoie/gists{/gist_id}',
 'starred_url': 'https://api.github.com/users/aneagoie/starred{/owner}{/repo}',
 'subscriptions_url': 'https://api.github.com/users/aneagoie/subscriptions',
 'organizations_url': 'https://api.github.com/users/aneagoie/orgs',
 'repos_url': 'https://api.github.com/users/aneagoie/repos',
 'events_url': 'https://api.github.com/users/aneagoie/events{/privacy}',
 'received_events_url': 'https://api.github.com/users/aneagoie/received_events',
 'type': 'User',
 'user_view_type': 'public',
 'site_admin': False,
 'score': 1

### Search for Users

* Toronto users with more than 100 followers

In [6]:
def dynamic_delay(response):
    if response.headers.get('X-RateLimit-Remaining') == '0':
        reset_time = int(response.headers.get('X-RateLimit-Reset'))
        sleep_time = reset_time - int(time.time()) + 5  # Add a buffer of 5s.

        print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds!")
        time.sleep(sleep_time)

    time.sleep(1)  # Sleep for 1s regardless.

In [7]:
printed_message_from_cached = False
requests_cache.install_cache('cache/search_users', expire_after=None)

users = []  # All users in Toronto with more than 100 followers.

url = "https://api.github.com/search/users"
params = {
    'q': 'followers:>100 location:Toronto',  # Uses a query string (GraphQL).
    'per_page': 100,
    'page': 1
}

access_token = os.getenv('GITHUB_TOKEN')
headers = {'Authorization': f'token {access_token}'}

while True:
    response = requests.get(url, params=params, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch users. Status code: {response.status_code}")
        break

    data = response.json()

    for user in data.get('items', []):
        users.append({
            'login': user['login'],
            'id': user['id'],
            'url': user['url'],
            'repos_url': user['repos_url']
        })
    
    if response.from_cache:
        if not printed_message_from_cached:
            print('Fetched from cache.')
            printed_message_from_cached = True
    else:
        print("Fetched from API:", response.url)
        dynamic_delay(response)

    # Check if there are more pages.
    if 'next' not in response.links:
        break

    params['page'] += 1

print('# of users:', len(users))

Fetched from cache.
# of users: 680


In [8]:
dict(response.headers)

{'Access-Control-Allow-Origin': '*',
 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset',
 'Cache-Control': 'no-cache',
 'Content-Encoding': 'gzip',
 'Content-Security-Policy': "default-src 'none'",
 'Content-Type': 'application/json; charset=utf-8',
 'Date': 'Mon, 21 Oct 2024 16:43:35 GMT',
 'Link': '<https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=6>; rel="prev", <https://api.github.com/search/users?q=followers%3A%3E100+location%3AToronto&per_page=100&page=1>; rel="first"',
 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin',
 'Server': 'github.com',
 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload',
 'Transfer-Encoding':

In [9]:
users[0]

{'login': 'aneagoie',
 'id': 10776230,
 'url': 'https://api.github.com/users/aneagoie',
 'repos_url': 'https://api.github.com/users/aneagoie/repos'}

### Fetch User Data

In [10]:
def clean_company_name(company):
    company_name = None
    if company:
        company_name = company.strip()
        if company_name.startswith('@'):
            company_name = company_name[1:]
        company_name = company_name.upper()
    
    return company_name

In [11]:
printed_message_from_cached = False
requests_cache.install_cache('cache/users', expire_after=None)

users_data = []

for user in users:
    response = requests.get(user['url'], headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch {user['login']}'s data. Status code: {response.status_code}")
        break
    
    data = response.json()
    
    users_data.append({
        'login': data['login'],
        'name': data['name'],
        'company': clean_company_name(data['company']),
        'location': data['location'],
        'email': data['email'],
        'hireable': data['hireable'],
        'bio': data['bio'],
        'public_repos': data['public_repos'],
        'followers': data['followers'],
        'following': data['following'],
        'created_at': data['created_at'],
    })

    if response.from_cache:
        if not printed_message_from_cached:
            print('Fetched from cache.')
            printed_message_from_cached = True
    else:
        print('Fetched from API:', response.url)
        dynamic_delay(response)

users_data[0]

Fetched from cache.


{'login': 'aneagoie',
 'name': 'Andrei Neagoie',
 'company': None,
 'location': 'Toronto, Canada',
 'email': None,
 'hireable': True,
 'bio': 'Senior Software Dev turned Instructor. Founder of zerotomastery.io',
 'public_repos': 145,
 'followers': 10268,
 'following': 1,
 'created_at': '2015-01-30T17:05:43Z'}

In [12]:
len(users_data)

680

In [13]:
dict(response.headers)

{'Access-Control-Allow-Origin': '*',
 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset',
 'Cache-Control': 'private, max-age=60, s-maxage=60',
 'Content-Encoding': 'gzip',
 'Content-Security-Policy': "default-src 'none'",
 'Content-Type': 'application/json; charset=utf-8',
 'Date': 'Mon, 21 Oct 2024 17:01:38 GMT',
 'ETag': 'W/"dbbd1c8be0f168814915dde0d8fbc5b716d3913a61c224d395043082b7f75e8b"',
 'Last-Modified': 'Fri, 18 Oct 2024 22:22:40 GMT',
 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin',
 'Server': 'github.com',
 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload',
 'Transfer-Encoding': 'chunked',
 'Vary': 'Accept, Authorization, Cookie, X-GitHub-OTP,Accept-Encod

#### Save user data to .csv

In [14]:
users_df = pd.DataFrame(users_data)
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30T17:05:43Z
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5804,11,2015-01-18T00:01:02Z
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4917,68,2016-11-28T04:22:39Z
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,89,4533,301,2010-04-05T14:40:12Z
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4020,11,2016-07-13T21:01:21Z


In [15]:
total_number_of_repos_expected = users_df[users_df['public_repos'] <= 500]['public_repos'].sum() + (500 * len(users_df[users_df['public_repos'] > 500]))
print(f"Total Expected Repos: {total_number_of_repos_expected}")

Total Expected Repos: 54875


In [16]:
n_unique_users = len(users_df['login'].unique())  # Sanity check.
print(f"# of unique users: {n_unique_users}")

# of unique users: 680


In [17]:
users_df.to_csv('users.csv', index=False)

### Fetch Users' Repo Data

In [18]:
printed_message_from_cached = False
requests_cache.install_cache('cache/repos', expire_after=None)

repos_data = []

for user in users:
    repos = []
    params = {
        'sort': 'pushed',
        'direction': 'desc',
        'per_page': 100,
        'page': 1,
    }
    while True:
        response = requests.get(user['repos_url'], params=params, headers=headers)

        if response.status_code != 200:
            print(f"Failed to fetch { user['login'] }'s repo data. Status code: {response.status_code}")
            break
        
        data = response.json()
        for repo in data:
            repos.append({
                'login': user['login'],
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else None,
            })
        
        if response.from_cache:
            if not printed_message_from_cached:
                print('Fetched from cache.')
                printed_message_from_cached = True
        else:
            print('Fetched from API:', response.url)
            dynamic_delay(response)

        if ('next' not in response.links) or (params['page'] == 5):
            break

        params['page'] += 1
    
    repos_data.extend(repos)

print('# of Repos:', len(repos_data))

Fetched from cache.
# of Repos: 54875


In [19]:
print('Pass repos sanity check:', total_number_of_repos_expected == len(repos_data))

Pass repos sanity check: True


In [20]:
repos_data[0]

{'login': 'aneagoie',
 'full_name': 'aneagoie/ztm-extension',
 'created_at': '2023-12-15T13:32:57Z',
 'stargazers_count': 3,
 'watchers_count': 3,
 'language': 'JavaScript',
 'has_projects': True,
 'has_wiki': False,
 'license_name': 'mit'}

#### Save repo data to .csv

In [21]:
repos_df = pd.DataFrame(repos_data)
repos_df.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,aneagoie,aneagoie/ztm-extension,2023-12-15T13:32:57Z,3,3,JavaScript,True,False,mit
1,aneagoie,aneagoie/ML-Notes,2019-09-28T02:31:10Z,60,60,Jupyter Notebook,True,True,
2,aneagoie,aneagoie/smart-brain-boost-api-dockerized,2018-04-12T18:44:49Z,55,55,JavaScript,True,True,
3,aneagoie,aneagoie/smart-brain-boost-api,2018-04-09T18:46:06Z,14,14,JavaScript,True,True,
4,aneagoie,aneagoie/smart-brain-boost-lambda,2018-04-15T17:02:27Z,20,20,JavaScript,True,True,


In [65]:
repos_df.to_csv('repositories.csv', index=False)

## Questions

Q1. Who are the top 5 users in Toronto with the highest number of followers? List their login in order, comma-separated.

In [23]:
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30T17:05:43Z
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5804,11,2015-01-18T00:01:02Z
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4917,68,2016-11-28T04:22:39Z
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,89,4533,301,2010-04-05T14:40:12Z
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4020,11,2016-07-13T21:01:21Z


In [24]:
users_df.nlargest(n=5, columns='followers')[['login', 'followers']]

Unnamed: 0,login,followers
0,aneagoie,10268
1,ZhangMYihua,5804
2,susanli2016,4917
3,thedaviddias,4533
4,ange-yaghi,4020


In [25]:
','.join(users_df.nlargest(n=5, columns='followers')['login'].to_list())

'aneagoie,ZhangMYihua,susanli2016,thedaviddias,ange-yaghi'

Q2. Who are the 5 earliest registered GitHub users in Toronto? List their login in ascending order of created_at, comma-separated.

In [26]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5804,11,2015-01-18 00:01:02+00:00
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4917,68,2016-11-28 04:22:39+00:00
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,89,4533,301,2010-04-05 14:40:12+00:00
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4020,11,2016-07-13 21:01:21+00:00


In [27]:
users_df.iloc[users_df['created_at'].sort_values(ascending=True).index[:5].to_list()][['login', 'created_at']]

Unnamed: 0,login,created_at
428,jamesmacaulay,2008-02-17 20:11:15+00:00
21,michaelklishin,2008-02-27 16:49:40+00:00
522,myles,2008-02-27 22:49:45+00:00
575,nwjsmith,2008-02-28 00:38:55+00:00
78,vito,2008-03-01 18:39:55+00:00


In [28]:
oldest_users = users_df.iloc[users_df['created_at'].sort_values(ascending=True).index[:5].to_list()]['login'].to_list()
','.join(sorted(oldest_users))

'jamesmacaulay,michaelklishin,myles,nwjsmith,vito'

Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [29]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,aneagoie,aneagoie/ztm-extension,2023-12-15 13:32:57+00:00,3,3,JavaScript,True,False,mit
1,aneagoie,aneagoie/ML-Notes,2019-09-28 02:31:10+00:00,60,60,Jupyter Notebook,True,True,
2,aneagoie,aneagoie/smart-brain-boost-api-dockerized,2018-04-12 18:44:49+00:00,55,55,JavaScript,True,True,
3,aneagoie,aneagoie/smart-brain-boost-api,2018-04-09 18:46:06+00:00,14,14,JavaScript,True,True,
4,aneagoie,aneagoie/smart-brain-boost-lambda,2018-04-15 17:02:27+00:00,20,20,JavaScript,True,True,


In [30]:
repos_df['license_name'].value_counts(dropna=True)[:3]

license_name
mit           13343
other          4221
apache-2.0     4061
Name: count, dtype: int64

In [31]:
','.join(repos_df['license_name'].value_counts(dropna=True).index[:3].to_list())

'mit,other,apache-2.0'

In [32]:
repos_df['license_name'].isna().sum()

np.int64(27297)

Q4. Which company do the majority of these developers work at?

In [33]:
users_df['company'].value_counts()[:5]

company
UNIVERSITY OF TORONTO    21
SHOPIFY                  16
NVIDIA                    7
GOOGLE                    6
YORK UNIVERSITY           5
Name: count, dtype: int64

In [34]:
users_df['company'].value_counts().index[0]

'UNIVERSITY OF TORONTO'

Q5. Which programming language is most popular among these users?

In [35]:
repos_df['language'].value_counts()[:5]

language
JavaScript    10654
Python         5029
TypeScript     2228
Java           2104
Ruby           2101
Name: count, dtype: int64

In [36]:
repos_df['language'].value_counts().index[0]

'JavaScript'

Q6. Which programming language is the second most popular among users who joined after 2020?

In [37]:
pd.merge?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mmerge[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mleft[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhow[0m[0;34m:[0m [0;34m'MergeHow'[0m [0;34m=[0m [0;34m'inner'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon[0m[0;34m:[0m [0;34m'IndexLabel | AnyArrayLike | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_on[0m[0;34m:[0m [0;34m'IndexLabel | AnyArrayLike | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_on[0m[0;34m:[0m [0;34m'IndexLabel | AnyArrayLike | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0

In [38]:
merged = repos_df.merge(users_df, how='inner', on='login', suffixes=('_repo', '_user'))
merged.head()

Unnamed: 0,login,full_name,created_at_repo,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name,name,company,location,email,hireable,bio,public_repos,followers,following,created_at_user
0,aneagoie,aneagoie/ztm-extension,2023-12-15 13:32:57+00:00,3,3,JavaScript,True,False,mit,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00
1,aneagoie,aneagoie/ML-Notes,2019-09-28 02:31:10+00:00,60,60,Jupyter Notebook,True,True,,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00
2,aneagoie,aneagoie/smart-brain-boost-api-dockerized,2018-04-12 18:44:49+00:00,55,55,JavaScript,True,True,,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00
3,aneagoie,aneagoie/smart-brain-boost-api,2018-04-09 18:46:06+00:00,14,14,JavaScript,True,True,,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00
4,aneagoie,aneagoie/smart-brain-boost-lambda,2018-04-15 17:02:27+00:00,20,20,JavaScript,True,True,,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00


In [39]:
users_df[users_df['created_at'].dt.year > 2020]['public_repos'].sum()

np.int64(640)

In [40]:
merged[merged['created_at_user'].dt.year > 2020]['language'].value_counts()[:5]

language
JavaScript    151
TypeScript     48
HTML           42
CSS            31
Python         30
Name: count, dtype: int64

In [41]:
merged[merged['created_at_user'].dt.year > 2020]['language'].value_counts().index[1]

'TypeScript'

Q7. Which language has the highest average number of stars per repository?

In [42]:
repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)[:5]

language
Cython            1778.0
Forth             1183.0
ASP.NET            414.0
BrighterScript     311.0
SAS                172.0
Name: stargazers_count, dtype: float64

In [43]:
repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False).index[0]

'Cython'

Q8. Let's define `leader_strength` as `followers / (1 + following)`. Who are the top 5 in terms of `leader_strength`? List their login in order, comma-separated.

In [44]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10268,1,2015-01-30 17:05:43+00:00,5134.0
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5804,11,2015-01-18 00:01:02+00:00,483.666667
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4917,68,2016-11-28 04:22:39+00:00,71.26087
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,89,4533,301,2010-04-05 14:40:12+00:00,15.009934
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4020,11,2016-07-13 21:01:21+00:00,335.0


In [45]:
users_df.nlargest(5, 'leader_strength')[['login', 'leader_strength']]

Unnamed: 0,login,leader_strength
0,aneagoie,5134.0
6,nayuki,3541.0
7,GrapheneOS,3494.0
11,hlissner,2418.0
15,rspivak,2178.0


In [46]:
','.join(users_df.nlargest(5, 'leader_strength')['login'].to_list())

'aneagoie,nayuki,GrapheneOS,hlissner,rspivak'

Q9. What is the correlation between the number of followers and the number of public repositories among users in Toronto?

In [47]:
users_df['followers'].corr(users_df['public_repos']).round(3)

np.float64(0.055)

Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [48]:
users_df[['public_repos', 'followers']].isna().sum()

public_repos    0
followers       0
dtype: int64

In [49]:
linregress(users_df['public_repos'], users_df['followers']).slope.round(3)

np.float64(0.25)

Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [50]:
repos_df[['has_projects', 'has_wiki']].corr().round(3)

Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.372
has_wiki,0.372,1.0


Q12. Do hireable users follow more people than those who are not hireable?

In [51]:
users_df['hireable'].value_counts(dropna=False)

hireable
None    456
True    224
Name: count, dtype: int64

In [52]:
users_df[users_df['hireable'].notnull()]['following'].mean()

np.float64(112.78125)

In [53]:
users_df[users_df['hireable'].isnull()]['following'].mean()

np.float64(125.64254385964912)

In [54]:
(users_df[users_df['hireable'].notnull()]['following'].mean() - users_df[users_df['hireable'].isnull()]['following'].mean()).round(3)

np.float64(-12.861)

Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [55]:
users_with_bio = users_df.loc[users_df['bio'].notnull(), :].copy()
users_with_bio['bio_length'] = users_with_bio['bio'].str.len()

In [56]:
linregress(users_with_bio['bio_length'], users_with_bio['followers']).slope.round(3)

np.float64(1.416)

Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [57]:
repos_df['created_at'].dt.weekday.value_counts().sort_index()

created_at
0    8046
1    8730
2    8839
3    8460
4    8142
5    6326
6    6332
Name: count, dtype: int64

In [58]:
repos_df[repos_df['created_at'].dt.weekday >= 5].groupby('login').size().sort_values(ascending=False)[:5]

login
andyw8             180
QuinntyneBrown     130
invokethreatguy    128
rgrinberg          122
Devang-25          121
dtype: int64

In [59]:
','.join(repos_df[repos_df['created_at'].dt.weekday >= 5].groupby('login').size().sort_values(ascending=False)[:5].index.to_list())

'andyw8,QuinntyneBrown,invokethreatguy,rgrinberg,Devang-25'

Q15. Do people who are hireable share their email addresses more often?

[fraction of users with email when hireable=true] minus [fraction of users with email for the rest]

In [60]:
users_df[(users_df['hireable'].notnull()) & (users_df['email'].notnull())].shape

(139, 12)

In [61]:
users_df[(users_df['email'].notnull())].shape

(363, 12)

In [62]:
round((users_df[(users_df['hireable'].notnull()) & (users_df['email'].notnull())].shape[0] / users_df.shape[0]) - (users_df[(users_df['email'].notnull())].shape[0] / users_df.shape[0]), 3)

-0.329

Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [63]:
users_df['name'].str.strip().str.split().str[-1].value_counts()[:5]

name
Ahmed    4
Smith    3
Brown    3
Chen     3
Kumar    3
Name: count, dtype: int64

In [64]:
users_df['name'].str.strip().str.split().str[-1].value_counts().index[0]

'Ahmed'