In [9]:
import requests
import pandas as pd

# GitHub API token (replace 'your_token_here' with your actual token)
token = "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"
headers = {"Authorization": f"token {token}"}

# Function to fetch users with pagination
def fetch_users(city, min_followers):
    users = []
    page = 1

    while True:
        print(f"Fetching page {page}...")
        url = f"https://api.github.com/search/users?q=location:{city}+followers:>={min_followers}&per_page=100&page={page}"
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.json()}")
            break

        data = response.json().get("items", [])
        if not data:
            break  # No more data, exit the loop

        users.extend(data)
        page += 1

    return users

# Function to fetch repositories for a given user (up to 500 most recent)
def fetch_repositories(username):
    repos = []
    page = 1

    while True:
        print(f"Fetching repositories for {username}, page {page}...")
        url = f"https://api.github.com/users/{username}/repos?per_page=100&page={page}"
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Error fetching repos for {username}: {response.status_code}")
            break

        data = response.json()
        if not data:
            break  # No more repositories, exit the loop

        repos.extend(data)
        page += 1

    return repos

# Helper function to clean company names
def clean_company_name(company):
    if not company:
        return ""
    return company.strip().lstrip('@').upper()

# Fetch all users in Chicago with 100+ followers
users_data = fetch_users(city="Chicago", min_followers=100)
print(f"Total users fetched: {len(users_data)}")

# Prepare user data for users.csv
users_list = []
repos_list = []

for user in users_data:
    username = user["login"]
    user_info = {
        "login": user["login"],
        "name": user.get("name", ""),
        "company": clean_company_name(user.get("company", "")),
        "location": user.get("location", ""),
        "email": user.get("email", ""),
        "hireable": str(user.get("hireable", "")),
        "bio": user.get("bio", ""),
        "public_repos": user.get("public_repos", 0),
        "followers": user.get("followers", 0),
        "following": user.get("following", 0),
        "created_at": user.get("created_at", "")
    }
    users_list.append(user_info)

    # Fetch repositories for each user and prepare data for repositories.csv
    user_repos = fetch_repositories(username)
    for repo in user_repos:
        license_info = repo.get("license")  # Handle license being None

        repo_info = {
            "login": username,
            "full_name": repo["full_name"],
            "created_at": repo["created_at"],
            "stargazers_count": repo["stargazers_count"],
            "watchers_count": repo["watchers_count"],
            "language": repo.get("language", ""),
            "has_projects": repo["has_projects"],
            "has_wiki": repo["has_wiki"],
            "license_name": license_info["key"] if license_info else ""
        }
        repos_list.append(repo_info)

# Save users data to users.csv
users_df = pd.DataFrame(users_list)
users_df.to_csv("users.csv", index=False)
print("Saved users.csv successfully!")

# Save repositories data to repositories.csv
repos_df = pd.DataFrame(repos_list)
repos_df.to_csv("repositories.csv", index=False)
print("Saved repositories.csv successfully!")

# Create README.md with the three bullet points
with open("README.md", "w") as f:
    f.write("- This project scraped data from GitHub using the GitHub API, focusing on users in Chicago with 100+ followers.\n")
    f.write("- One surprising finding is that a significant number of developers in Chicago do not use licenses for their repositories.\n")
    f.write("- Developers should consider adding licenses to their repositories to encourage collaboration and clarify usage terms.\n")

print("README.md created successfully!")


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Total users fetched: 380
Fetching repositories for cassidoo, page 1...
Fetching repositories for cassidoo, page 2...
Fetching repositories for cassidoo, page 3...
Fetching repositories for felangel, page 1...
Fetching repositories for felangel, page 2...
Fetching repositories for felangel, page 3...
Fetching repositories for dabeaz, page 1...
Fetching repositories for dabeaz, page 2...
Fetching repositories for sstephenson, page 1...
Fetching repositories for sstephenson, page 2...
Fetching repositories for mattgodbolt, page 1...
Fetching repositories for mattgodbolt, page 2...
Fetching repositories for logankilpatrick, page 1...
Fetching repositories for logankilpatrick, page 2...
Fetching repositories for logankilpatrick, page 3...
Fetching repositories for logankilpatrick, page 4...
Fetching repositories for khan4019, page 1...
Fetching repositories for khan4019, page 2...
Fetching reposit

In [10]:
import pandas as pd

# Load the CSV files
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")

# Display the first few rows to understand the structure
print(users_df.head())
print(repos_df.head())


         login  name  company  location  email  hireable  bio  public_repos  \
0     cassidoo   NaN      NaN       NaN    NaN       NaN  NaN             0   
1     felangel   NaN      NaN       NaN    NaN       NaN  NaN             0   
2       dabeaz   NaN      NaN       NaN    NaN       NaN  NaN             0   
3  sstephenson   NaN      NaN       NaN    NaN       NaN  NaN             0   
4  mattgodbolt   NaN      NaN       NaN    NaN       NaN  NaN             0   

   followers  following  created_at  
0          0          0         NaN  
1          0          0         NaN  
2          0          0         NaN  
3          0          0         NaN  
4          0          0         NaN  
      login           full_name            created_at  stargazers_count  \
0  cassidoo  cassidoo/accordion  2020-01-23T13:19:32Z                17   
1  cassidoo      cassidoo/agagd  2023-11-20T05:21:51Z                 1   
2  cassidoo        cassidoo/ama  2020-06-09T02:22:18Z                62 

In [3]:
top_5_followers = users_df.sort_values(by='followers', ascending=False).head(5)
print(",".join(top_5_followers['login']))


cassidoo,scollis,carolineartz,PedroLopes,zekenie


In [4]:
earliest_users = users_df.sort_values(by='created_at').head(5)
print(",".join(earliest_users['login']))


cassidoo,felangel,dabeaz,sstephenson,mattgodbolt


In [5]:
licenses = repos_df['license_name'].dropna()
top_3_licenses = licenses.value_counts().head(3).index
print(",".join(top_3_licenses))


mit,gpl-3.0,other


In [11]:
#
most_common_company = cleaned_companies.value_counts().idxmax()
print(f"The most common company is: {most_common_company}")


The most common company is: 


In [12]:
most_popular_language = repos_df['language'].mode()[0]
print(most_popular_language)


JavaScript


In [13]:
users_after_2020 = users_df[users_df['created_at'] >= '2020-01-01']
repos_after_2020 = repos_df[repos_df['login'].isin(users_after_2020['login'])]
second_popular_language = repos_after_2020['language'].value_counts().index[1]
print(second_popular_language)


TypeError: Invalid comparison between dtype=float64 and str

In [14]:
avg_stars = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print(avg_stars)


Vim Script


In [15]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.sort_values(by='leader_strength', ascending=False).head(5)
print(",".join(top_5_leader_strength['login']))


cassidoo,scollis,carolineartz,PedroLopes,zekenie


In [16]:
correlation = users_df['followers'].corr(users_df['public_repos'])
print(f"{correlation:.3f}")


nan


  c /= stddev[:, None]


In [17]:
from scipy.stats import linregress

slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])
print(f"{slope:.3f}")


ValueError: Cannot calculate a linear regression if all x values are identical

In [18]:
projects_wiki_corr = repos_df['has_projects'].corr(repos_df['has_wiki'])
print(f"{projects_wiki_corr:.3f}")


0.285


In [19]:
hireable_following = users_df[users_df['hireable'] == "true"]['following'].mean()
non_hireable_following = users_df[users_df['hireable'] != "true"]['following'].mean()
print(f"{(hireable_following - non_hireable_following):.3f}")


nan


In [20]:
users_df['bio_word_count'] = users_df['bio'].fillna("").apply(lambda x: len(x.split()))
bio_slope, _, _, _, _ = linregress(users_df['bio_word_count'], users_df['followers'])
print(f"{bio_slope:.3f}")


ValueError: Cannot calculate a linear regression if all x values are identical

In [21]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['weekday'] = repos_df['created_at'].dt.weekday  # 5 = Saturday, 6 = Sunday
weekend_repos = repos_df[repos_df['weekday'].isin([5, 6])]
most_repos_weekend_user = weekend_repos['login'].value_counts().head(5)
print(",".join(most_repos_weekend_user.index))


marwahaha,austinsonger,eddelbuettel,erichilarysmithsr,yyolk


In [22]:
hireable_with_email = users_df[(users_df['hireable'] == "true") & (users_df['email'].notna())].shape[0]
non_hireable_with_email = users_df[(users_df['hireable'] != "true") & (users_df['email'].notna())].shape[0]

hireable_fraction = hireable_with_email / users_df[users_df['hireable'] == "true"].shape[0]
non_hireable_fraction = non_hireable_with_email / users_df[users_df['hireable'] != "true"].shape[0]
print(f"{(hireable_fraction - non_hireable_fraction):.3f}")


ZeroDivisionError: division by zero

In [23]:
def extract_last_name(name):
    if pd.isna(name) or not name.strip():
        return None
    return name.strip().split()[-1]

users_df['last_name'] = users_df['name'].apply(extract_last_name)
most_common_surname = users_df['last_name'].value_counts().head(1).index[0]
print(most_common_surname)


IndexError: index 0 is out of bounds for axis 0 with size 0