In [None]:
!pip install requests pandas



In [109]:
import requests
import pandas as pd
import time

# GitHub token for authentication (replace 'your_token_here' with a valid GitHub token)
GITHUB_TOKEN = "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"
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}

# URLs for the GitHub API
BASE_URL = "https://api.github.com"
USER_SEARCH_URL = f"{BASE_URL}/search/users"
REPOS_URL_TEMPLATE = f"{BASE_URL}/users/{{login}}/repos"

# Data containers
user_data = []
repo_data = []

# Step 1: Fetch Users in Seattle with >200 Followers
def fetch_users():
    page = 1
    while True:
        response = requests.get(
            USER_SEARCH_URL,
            headers=HEADERS,
            params={
                'q': 'location:seattle followers:>200',
                'per_page': 100,
                'page': page
            }
        )
        data = response.json()

        # Handle rate limits or errors
        if response.status_code == 403:
            print("Rate limit reached. Waiting for 60 seconds.")
            time.sleep(60)  # Wait 60 seconds before retrying
            continue
        elif 'items' not in data:
            print("Error:", data)
            break

        # Append users to user_data
        users = data['items']
        if not users:
            break

        for user in users:
            # Fetching detailed user data
            user_detail = requests.get(user['url'], headers=HEADERS).json()
            company = user_detail.get('company', '')
            if company:
                company = company.lstrip('@').strip().upper()
            user_data.append({
                "login": user_detail.get("login", ""),
                "name": user_detail.get("name", ""),
                "company": company,
                "location": user_detail.get("location", ""),
                "email": user_detail.get("email", ""),
                "hireable": user_detail.get("hireable", ""),
                "bio": user_detail.get("bio", ""),
                "public_repos": user_detail.get("public_repos", ""),
                "followers": user_detail.get("followers", ""),
                "following": user_detail.get("following", ""),
                "created_at": user_detail.get("created_at", "")
            })

        # Go to the next page
        page += 1
        if len(users) < 100:
            break

# Step 2: Fetch Repositories for Each User
def fetch_repositories():
    for user in user_data:
        login = user['login']
        page = 1
        while True:
            response = requests.get(
                REPOS_URL_TEMPLATE.format(login=login),
                headers=HEADERS,
                params={'per_page': 100, 'page': page}
            )
            repos = response.json()

            # Handle rate limits or errors
            if response.status_code == 403:
                print("Rate limit reached. Waiting for 60 seconds.")
                time.sleep(60)  # Wait 60 seconds before retrying
                continue
            elif isinstance(repos, dict) and 'message' in repos:
                print(f"Error fetching repos for user {login}: {repos['message']}")
                break

            # Append repositories to repo_data
            if not repos:
                break

            for repo in repos:
                repo_data.append({
                    "login": login,
                    "full_name": repo.get("full_name", ""),
                    "created_at": repo.get("created_at", ""),
                    "stargazers_count": repo.get("stargazers_count", 0),
                    "watchers_count": repo.get("watchers_count", 0),
                    "language": repo.get("language", ""),
                    "has_projects": repo.get("has_projects", False),
                    "has_wiki": repo.get("has_wiki", False),
                    "license_name": repo.get("license", {}).get("key", "") if repo.get("license") else ""
                })

            # Go to the next page
            page += 1
            if len(repos) < 100:
                break

# Step 3: Save Data to CSV
def save_data():
    # Save users data to users.csv
    users_df = pd.DataFrame(user_data)
    users_df.to_csv("users.csv", index=False)

    # Save repositories data to repositories.csv
    repos_df = pd.DataFrame(repo_data)
    repos_df.to_csv("repositories.csv", index=False)

# Main execution
def main():
    print("Fetching users...")
    fetch_users()
    print(f"Fetched {len(user_data)} users.")

    print("Fetching repositories for each user...")
    fetch_repositories()
    print(f"Fetched {len(repo_data)} repositories.")

    print("Saving data to CSV files...")
    save_data()
    print("Data saved successfully.")

if __name__ == "__main__":
    main()


Fetching users...
Fetched 518 users.
Fetching repositories for each user...
Fetched 52488 repositories.
Saving data to CSV files...
Data saved successfully.


In [110]:
users_df=pd.read_csv('/content/users.csv')
repos_df=pd.read_csv('/content/repositories.csv')

In [111]:
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,vczh,,,"Seattle, WA, USA",,,Main contributor of @vczh-libraries .\r\n\r\n...,12,17512,11,2011-05-07T08:30:48Z
1,bradfitz,Brad Fitzpatrick,TAILSCALE,Seattle,brad@danga.com,,"LiveJournal, memcached, OpenID, @golang team (...",141,12107,41,2008-03-09T05:08:14Z
2,munificent,Bob Nystrom,"GOOGLE, ON @DART-LANG","Seattle, WA",,,"Programming language developer, ex-game develo...",49,9929,144,2009-01-13T15:37:46Z
3,tenderlove,Aaron Patterson,SHOPIFY,Seattle,,,üíòüíôüíúüíóüíö‚ù§üíìüíõüíöüíó,357,9447,27,2008-03-14T20:04:17Z
4,ahmetb,Ahmet Alp Balkan,LINKEDIN,"Seattle, WA",github@ahmet.im,True,Working on compute orchestration with Kubernet...,221,8213,34,2009-11-28T14:59:59Z


In [112]:
top_5_users = users_df.sort_values(by=['followers'], ascending=False).head(5)['login'].tolist()
print(','.join(top_5_users))

vczh,bradfitz,munificent,tenderlove,ahmetb


In [113]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
earliest_5_users = users_df.sort_values(by=['created_at']).head(5)['login'].tolist()
print(','.join(earliest_5_users))

topfunky,nex3,beccasaurus,eric,grantr


In [114]:
filtered_repos_df = repos_df.dropna(subset=['license_name'])
popular_licenses = filtered_repos_df.groupby('license_name')['license_name'].count().sort_values(ascending=False).head(3).index.tolist()
print(','.join(popular_licenses))

mit,apache-2.0,other


In [115]:
majority_company = users_df['company'].mode()[0]
print(majority_company)


MICROSOFT


In [116]:
popular_language = repos_df['language'].value_counts().idxmax()
print(popular_language)

JavaScript


In [117]:
'''
recent_users = users[users['created_at'] > '2020-01-01']
recent_repos = repos[repos['login'].isin(recent_users['login'])]
second_popular_language = recent_repos['language'].value_counts().index[1]
print(second_popular_language)
'''
recent_users = users_df[users_df['created_at'] > '2020-01-01']
recent_repos = repos_df[repos_df['login'].isin(recent_users['login'])]

# Check if there are at least two languages
language_counts = recent_repos['language'].value_counts()
if len(language_counts) > 1:
    second_popular_language = language_counts.index[1]
    print(second_popular_language)

Python


In [118]:
avg_stars = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print(avg_stars)


Haml


In [119]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)
print(','.join(top_leaders['login']))

awslabs,mission-peace,karan,cmuratori,nex3


In [120]:
correlation = users_df['followers'].corr(users_df['public_repos'])
print(f'{correlation:.3f}')


0.203


In [121]:
from scipy.stats import linregress

slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])
print(f'{slope:.3f}')

2.499


In [128]:
#projects_wiki_corr = repos_df['has_projects'].corr(repos_df['has_wiki'])
# Convert boolean columns to integers for correlation calculation
repos_df['has_projects'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)

# Calculate correlation
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to three decimal places
print(f"Correlation between projects enabled and wiki enabled: {correlation:.3f}")
print(f'{projects_wiki_corr:.3f}')

Correlation between projects enabled and wiki enabled: 0.310
0.310


In [132]:
# Replace NaN values with False before calculating the mean
average_hireable_following = users_df[users_df['hireable'].fillna(False)]['following'].mean()
average_non_hireable_following = users_df[~users_df['hireable'].fillna(False)]['following'].mean()

# Calculate the difference
difference = round(average_hireable_following - average_non_hireable_following, 3)

# Print the result
print(f"Difference in following between hireable and non-hireable users: {difference:.3f}")

Difference in following between hireable and non-hireable users: 12.963


  average_hireable_following = users_df[users_df['hireable'].fillna(False)]['following'].mean()
  average_non_hireable_following = users_df[~users_df['hireable'].fillna(False)]['following'].mean()


In [123]:
hireable_following = users_df[users_df['hireable'] == True]['following'].mean()
non_hireable_following = users_df[users_df['hireable'] == False]['following'].mean()
diff_following = hireable_following - non_hireable_following
print(f'{diff_following:.3f}')

nan


In [124]:
users_df['bio_length'] = users_df['bio'].str.len()
bio_followers_slope, _, _, _, _ = linregress(users_df.dropna(subset=['bio_length'])['bio_length'], users_df.dropna(subset=['bio_length'])['followers'])
print(f'{bio_followers_slope:.3f}')

1.011


In [125]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['weekday'] = repos_df['created_at'].dt.weekday
weekend_repos = repos_df[repos_df['weekday'] >= 5]
top_weekend_creators = weekend_repos['login'].value_counts().head(5)
print(','.join(top_weekend_creators.index))

schollz,ingydotnet,nolanlawson,homebysix,arokem


In [130]:
fraction_hireable_with_email = users_df[(users_df['hireable'] == True) & (users_df['email'].notnull())].shape[0] / users_df[users_df['hireable'] == True].shape[0] if users_df[users_df['hireable'] == True].shape[0] > 0 else 0
fraction_non_hireable_with_email = users_df[(users_df['hireable'] == False) & (users_df['email'].notnull())].shape[0] / users_df[users_df['hireable'] == False].shape[0] if users_df[users_df['hireable'] == False].shape[0] > 0 else 0

# Calculate the difference
difference = round(fraction_hireable_with_email - fraction_non_hireable_with_email, 3)

# Print the result
print(f"Difference in email sharing between hireable and non-hireable users: {difference:.3f}")

Difference in email sharing between hireable and non-hireable users: 0.598


In [127]:
users_df['surname'] = users_df['name'].str.split().str[-1]
common_surname = users_df['surname'].mode()[0]
print(common_surname)

Wang
