In [115]:
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression

# GitHub API authentication.
# The personal access token is read from the GITHUB_TOKEN environment variable so
# the secret never has to be typed into (and saved with) the notebook. The original
# placeholder is kept as the fallback value; export the variable before scraping.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "fill your token here")
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

In [64]:
def get_berlin_users():
    """Collect GitHub users located in Berlin with more than 200 followers.

    Pages through the user-search endpoint until an empty page is returned.

    Returns:
        list: the raw ``items`` dicts of every search page, in the order returned.

    Raises:
        requests.HTTPError: if a request fails (bad token, rate limit, ...).
            Previously such a failure ended the loop silently and returned a
            truncated list.
    """
    users = []
    page = 1
    per_page = 100  # maximum page size accepted by the GitHub API

    while True:
        # Search users in Berlin with more than 200 followers
        response = requests.get(
            "https://api.github.com/search/users",
            headers=headers,
            params={
                "q": "location:Berlin followers:>200",
                "per_page": per_page,
                "page": page,
            },
            timeout=30,
        )

        # The search API serves only the first 1000 matches and rejects pages
        # beyond that window with 422; past the first page this just means
        # "no more results". A 422 on page 1 is a genuine query error.
        if response.status_code == 422 and page > 1:
            break
        response.raise_for_status()

        items = response.json().get('items', [])
        # Exit if no more users are found
        if not items:
            break

        users.extend(items)
        page += 1
        # Pause to avoid rate-limiting
        time.sleep(1)

    return users

In [65]:
def get_user_details(user_login):
    """Fetch the public profile of a single GitHub user.

    Args:
        user_login: the account's login name, interpolated into the URL path.

    Returns:
        dict: the decoded JSON body of ``GET /users/{user_login}``.

    Raises:
        requests.HTTPError: if GitHub answers with an error status. Without this
            check the error payload would be returned as if it were a profile and
            every field read from it downstream would be ``None``.
    """
    user_url = f"https://api.github.com/users/{user_login}"
    user_response = requests.get(user_url, headers=headers, timeout=30)
    user_response.raise_for_status()
    return user_response.json()

In [66]:
def get_user_repositories_details(user_login, max_repos=500):
    """Fetch a user's public repositories, most recently pushed first.

    GitHub documents a maximum of 100 results per page, so a single request
    with ``per_page=500`` can never deliver more than one page. This version
    walks the pages until ``max_repos`` repositories are collected or the
    listing is exhausted.

    Args:
        user_login: the account's login name.
        max_repos: upper bound on the number of repositories returned
            (defaults to 500, the amount the original request asked for).

    Returns:
        list: repository dicts as returned by the API, at most ``max_repos`` long.

    Raises:
        requests.HTTPError: if GitHub answers with an error status; an error
            payload is a dict, which callers iterating over the result would
            otherwise misread as a list of repositories.
    """
    repos_url = f"https://api.github.com/users/{user_login}/repos"
    per_page = 100  # maximum page size accepted by the GitHub API
    repos_data = []
    page = 1

    while len(repos_data) < max_repos:
        repos_response = requests.get(
            repos_url,
            headers=headers,
            params={"sort": "pushed", "per_page": per_page, "page": page},
            timeout=30,
        )
        repos_response.raise_for_status()
        batch = repos_response.json()
        repos_data.extend(batch)

        # A short page means the listing is exhausted.
        if len(batch) < per_page:
            break
        page += 1

    return repos_data[:max_repos]

In [67]:
def clean_company_name(company):
    """Normalise a company string for counting.

    Surrounding whitespace is trimmed, every leading '@' is removed, the result
    is trimmed again and upper-cased. Falsy input (``None`` or an empty string)
    is handed back unchanged.
    """
    if not company:
        return company
    return company.strip().lstrip('@').strip().upper()

In [71]:
def scrape_berlin_users_to_csv():
    """Write one profile row per entry of the module-level ``users`` list to users.csv.

    Each login is looked up with ``get_user_details``; the company is passed
    through ``clean_company_name`` and ``hireable`` is stored as the text
    'true' / 'false'. Sleeps one second after every lookup.
    """
    # Column order of the CSV; each name is also the key read from the API payload.
    columns = (
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at',
    )

    rows = []
    for entry in users:
        details = get_user_details(entry['login'])

        row = {column: details.get(column) for column in columns}
        # Overwriting existing keys keeps their position, so column order is unchanged.
        row['company'] = clean_company_name(row['company'])
        if row['hireable']:
            row['hireable'] = 'true'
        else:
            row['hireable'] = 'false'
        rows.append(row)

        time.sleep(1)

    # Build the frame in one go and persist it without the index column.
    pd.DataFrame(rows).to_csv('users.csv', index=False)
    print("Data saved to users.csv")

In [75]:
def scrape_berlin_users_repositories_to_csv():
    """Write one row per repository of every entry in ``users`` to repositories.csv.

    Repositories come from ``get_user_repositories_details``. Boolean flags are
    stored as the text 'true' / 'false'; ``license_name`` holds the licence's
    ``key`` field, or ``None`` when the repository has no licence.
    """
    rows = []
    for entry in users:
        owner = entry['login']

        for repo in get_user_repositories_details(owner):
            license_info = repo.get('license')
            rows.append({
                'login': owner,
                'full_name': repo.get('full_name'),
                'created_at': repo.get('created_at'),
                'stargazers_count': repo.get('stargazers_count'),
                'watchers_count': repo.get('watchers_count'),
                'language': repo.get('language'),
                'has_projects': 'true' if repo.get('has_projects') else 'false',
                'has_wiki': 'true' if repo.get('has_wiki') else 'false',
                'license_name': license_info.get('key') if license_info else None,
            })

    # Build the frame in one go and persist it without the index column.
    pd.DataFrame(rows).to_csv('repositories.csv', index=False)
    print("Data saved to repositories.csv")

In [68]:
# Run the search once; both scrape_* functions iterate over this module-level list.
users = get_berlin_users()

In [None]:
# Fetch every profile in `users` and write users.csv (one API call plus a 1 s pause per user).
scrape_berlin_users_to_csv()

In [None]:
# Fetch the repositories of every user in `users` and write repositories.csv.
scrape_berlin_users_repositories_to_csv()

### Run the next cell if the users and repositories data were already scraped and saved to Google Drive

In [77]:
# Colab-only cell: mount Google Drive, then load the previously scraped CSVs from it.
from google.colab import drive
drive.mount('/content/drive')
# Both files are read from the top level of "My Drive".
df_users = pd.read_csv('/content/drive/My Drive/users.csv')
df_repositories = pd.read_csv('/content/drive/My Drive/repositories.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Run this cell instead if the users and repositories data were scraped in this session

In [None]:
# Load the CSVs that the scrape_* functions wrote to the current working directory.
df_users = pd.read_csv('users.csv')
df_repositories = pd.read_csv('repositories.csv')

### Question 1

In [22]:
# Logins of the five accounts with the most followers, highest first.
top5followers = (
    df_users
    .sort_values(by=['followers'], ascending=False)['login']
    .iloc[:5]
)
",".join(top5followers)

'tiangolo,schacon,rwieruch,shuding,android10'

### Question 2

In [25]:
# Logins of the five earliest-created accounts; the timestamps are compared
# exactly as they are stored in the `created_at` column.
early5creators = (
    df_users
    .sort_values(by=['created_at'], ascending=True)['login']
    .iloc[:5]
)
",".join(early5creators)

'schacon,adamwiggins,myobie,lstoll,znarf'

### Question 3

In [83]:
# Three most frequent licence keys; value_counts() ignores repositories without a licence.
','.join(
    df_repositories['license_name']
    .value_counts()
    .index[:3]
)

'mit,apache-2.0,other'

### Question 4

In [88]:
# Most frequent value of the `company` column.
','.join(
    df_users['company']
    .value_counts()
    .index[:1]
)

'MICROSOFT'

### Question 5

In [89]:
# Most frequent repository language (value_counts() sorts by descending count).
df_repositories['language'].value_counts().index[0]

'JavaScript'

### Question 6

In [90]:
# Second most frequent language among repositories owned by users whose
# account was created after 2020.
(
    df_repositories.loc[
        df_repositories['login'].isin(
            df_users.loc[
                pd.to_datetime(df_users['created_at']).dt.year > 2020,
                'login',
            ]
        ),
        'language',
    ]
    .value_counts()
    .index[:2][-1]
)

'JavaScript'

### Question 7

In [91]:
# Mean star count per language, then the language with the highest mean.
pivot_table = df_repositories.pivot_table(
    index='language',
    values='stargazers_count',
    aggfunc='mean',
    sort=True,
)
sorted_pivot = pivot_table.sort_values(by='stargazers_count', ascending=False)
sorted_pivot.index[0]

'Fluent'

### Question 8

In [92]:
# leader_strength = followers / (1 + following); the +1 keeps the divisor non-zero
# for accounts that follow nobody. The column is added to df_users.
df_users['leader_strength'] = df_users['followers'].div(df_users['following'] + 1)
",".join(
    df_users
    .sort_values(by='leader_strength', ascending=False)['login']
    .iloc[:5]
)

'tiangolo,marijnh,vakila,alexeygrigorev,lewagon'

### Question 9

In [94]:
# Correlation between follower count and public-repository count (Series.corr
# with its default method), rounded to 3 decimals.
np.round(df_users['followers'].corr(df_users['public_repos']),3)

0.016

### Question 10

In [96]:
# Linear-regression estimator with default settings; it is fitted in the next cell.
model = LinearRegression()

In [98]:
# Regress followers on public_repos; the slope is the estimated number of
# additional followers per additional public repository.
X = df_users[['public_repos']]  # 2-D feature matrix (single column)
y = df_users['followers']
model.fit(X, y)
np.round(model.coef_[0], 3)

0.277

In [99]:
# Cross-check of the slope with scipy's linregress on the same two columns.
slope, intercept, r_value, p_value, std_err = linregress(
    x=df_users['public_repos'],
    y=df_users['followers'],
)
np.round(slope, 3)

0.277

### Question 11

In [100]:
# Correlation between the has_projects and has_wiki flags of the repositories,
# rounded to 3 decimals.
np.round(df_repositories['has_projects'].corr(df_repositories['has_wiki']),3)

0.408

### Question 12

In [107]:
# Treat a missing hireable flag as False (this overwrites the column in df_users),
# then compare the mean `following` of hireable vs. non-hireable users.
df_users['hireable'] = df_users['hireable'].fillna(False)
np.round(
    df_users.loc[df_users['hireable'], 'following'].mean()
    - df_users.loc[~df_users['hireable'], 'following'].mean(),
    3,
)

47.392

### Question 14

In [110]:
# Parse the creation timestamps and flag repositories created on a weekend
# (dayofweek: Monday=0 ... Saturday=5, Sunday=6). Both columns are written to df_repositories.
df_repositories['created_at'] = pd.to_datetime(df_repositories['created_at'])
df_repositories['is_weekend'] = df_repositories['created_at'].dt.dayofweek >= 5
# Five logins with the most weekend-created repositories.
",".join(
    df_repositories[df_repositories['is_weekend']]
    .pivot_table(values='created_at', index='login', aggfunc='count')
    .rename(columns={'created_at': 'weekend_repo_count'})
    .sort_values(by='weekend_repo_count', ascending=False)
    .index[:5]
)

'janpio,denisdefreyne,MohamedMesto,PurpleBooth,generall'

### Question 15

In [113]:
# Share of users with a public email: hireable users vs. everyone else.
hireable_true = df_users.loc[df_users['hireable'].eq(True)]
hireable_other = df_users.loc[df_users['hireable'].ne(True)]

# notna().mean() is the fraction of rows in which an email is present.
fraction_hireable_true = hireable_true['email'].notna().mean()
fraction_hireable_other = hireable_other['email'].notna().mean()

print(f"{(fraction_hireable_true - fraction_hireable_other):.3f}")

-0.010


### Question 16

In [114]:
# Most common surname, taken as the last whitespace-separated word of `name`;
# value_counts() leaves out users without a name.
(
    df_users['name']
    .str.strip()
    .str.split()
    .str.get(-1)
    .value_counts()
    .index[0]
)

'Schneider'