In [None]:
import os
import time

import pandas as pd
import requests
import statsmodels.api as sm

# Take the GitHub token from the GITHUB_TOKEN environment variable so the real
# secret does not have to be written into the notebook. The masked literal is
# kept as the fallback so the names below are always defined, as before.
token = os.environ.get("GITHUB_TOKEN") or "*****"
# Authorization header passed to every GitHub API request made in this notebook.
headers = {'Authorization': f'token {token}'}

In [None]:
def clean_company_name(company):
    """Normalise a company string for grouping.

    Surrounding whitespace is trimmed, a single leading "@" is dropped, and
    the remainder is upper-cased. Any falsy input (None, empty string) yields
    an empty string.
    """
    if not company:
        return ""
    trimmed = company.strip()
    # Only the first "@" is removed; any further "@" characters are kept.
    without_at = trimmed[1:] if trimmed.startswith("@") else trimmed
    return without_at.upper()

def fetch_seattle_users(min_followers=200, max_users=518):
    """Collect profiles of GitHub users located in Seattle.

    Pages through the GitHub user-search endpoint (100 hits per page), fetches
    the full profile of each hit, and keeps the fields used by the rest of the
    notebook.

    Args:
        min_followers: Only users whose profile reports strictly more
            followers than this are kept.
        max_users: Upper bound on the number of users returned. Defaults to
            518, the value that used to be hard-coded.

    Returns:
        A list of dicts with the keys ``login``, ``name``, ``company``
        (normalised by ``clean_company_name``), ``location``, ``email``,
        ``hireable``, ``bio``, ``public_repos``, ``followers``, ``following``
        and ``created_at``.
    """
    users = []
    url = "https://api.github.com/search/users"
    page = 1

    while len(users) < max_users:
        params = {
            "q": f"location:Seattle followers:>{min_followers}",
            "per_page": 100,
            "page": page
        }
        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error: {response.json()}")
            break

        # A missing or empty 'items' list means there is nothing more to read.
        items = response.json().get('items', [])
        if not items:
            break

        for user in items:
            # Stop issuing profile requests as soon as the cap is reached.
            if len(users) >= max_users:
                break

            detail_response = requests.get(user['url'], headers=headers)
            if detail_response.status_code != 200:
                # Report the failed lookup instead of silently treating the
                # error payload as a user with 0 followers.
                print(f"Error fetching {user['url']}: {detail_response.json()}")
                continue

            user_detail = detail_response.json()
            if user_detail.get("followers", 0) > min_followers:
                users.append({
                    "login": user_detail.get("login", ""),
                    "name": user_detail.get("name", ""),
                    "company": clean_company_name(user_detail.get("company", "")),
                    "location": user_detail.get("location", ""),
                    "email": user_detail.get("email", ""),
                    "hireable": user_detail.get("hireable", ""),
                    "bio": user_detail.get("bio", ""),
                    "public_repos": user_detail.get("public_repos", 0),
                    "followers": user_detail.get("followers", 0),
                    "following": user_detail.get("following", 0),
                    "created_at": user_detail.get("created_at", "")
                })

        page += 1
        time.sleep(1)  # pause between search pages

    return users[:max_users]

# Run the user search, tabulate the records, and save them for the later cells.
users_data = fetch_seattle_users()
users_df = pd.DataFrame(data=users_data)
users_df.to_csv(path_or_buf="users.csv", index=False)


In [None]:
def fetch_user_repositories(login, max_repos=500):
    """Fetch a user's public repositories, requested in ``sort=pushed`` order.

    Args:
        login: GitHub login whose repositories are listed.
        max_repos: Upper bound on the number of repositories returned.
            Defaults to 500, the value that used to be hard-coded.

    Returns:
        A list of dicts with the keys ``login``, ``full_name``,
        ``created_at``, ``stargazers_count``, ``watchers_count``,
        ``language``, ``has_projects``, ``has_wiki`` and ``license_name``
        (the license ``key``, or "" when the repository has no license).
    """
    repos = []
    url = f"https://api.github.com/users/{login}/repos"
    per_page = 100
    page = 1

    while len(repos) < max_repos:
        params = {
            "per_page": per_page,
            "page": page,
            "sort": "pushed"
        }
        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error fetching repos for {login}: {response.json()}")
            break

        data = response.json()
        if len(data) == 0:
            break

        for repo in data:
            # `license` may be absent or null; treat both as "no license".
            license_info = repo.get("license") or {}
            repos.append({
                "login": login,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": license_info.get("key", "")
            })

        # A page shorter than `per_page` is the last one, so skip the extra
        # request (and the sleep) that would only return an empty list.
        if len(data) < per_page:
            break

        page += 1
        time.sleep(1)  # pause between pages

    return repos[:max_repos]

# Reload the saved users and gather every user's repositories into one flat list.
users_df = pd.read_csv("users.csv")
all_repositories = []

for login in users_df['login']:
    user_repos = fetch_user_repositories(login)
    all_repositories += user_repos
    print(f"Fetched {len(user_repos)} repositories for user {login}")

# Persist the combined repository records for the analysis cell.
repositories_df = pd.DataFrame(data=all_repositories)
repositories_df.to_csv(path_or_buf="repositories.csv", index=False)

In [None]:
# Load the two datasets written by the collection cells.
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")

# Question 1: logins of the five users with the most followers.
top_followers = users_df.sort_values("followers", ascending=False).iloc[:5]
top_followers_logins = ", ".join(top_followers["login"])
print("Top 5 users by followers:", top_followers_logins)

# Question 2: logins of the five users with the smallest created_at value.
earliest_users = users_df.sort_values("created_at").iloc[:5]
earliest_users_logins = ", ".join(earliest_users["login"])
print("Earliest registered users:", earliest_users_logins)

# Question 3: the three most frequent license values across repositories.
top_licenses = repos_df["license_name"].value_counts().index[:3].tolist()
print("Top 3 licenses:", ", ".join(top_licenses))

# Question 4: most frequent company value among users.
top_company = users_df["company"].mode().loc[0]
print("Most common company:", top_company)

# Question 5: most frequent repository language.
top_language = repos_df["language"].mode().loc[0]
print("Most popular language:", top_language)

# Question 6: second most frequent language among repositories owned by users
# whose created_at string compares greater than "2020-01-01".
recent_users = users_df.loc[users_df["created_at"] > "2020-01-01", "login"]
recent_repos = repos_df.loc[repos_df["login"].isin(recent_users)]
second_language = recent_repos["language"].value_counts().index[1]
print("Second most popular language (after 2020):", second_language)

# Question 7: language whose repositories have the highest mean star count.
language_stars = repos_df.groupby("language")["stargazers_count"].mean()
top_star_language = language_stars.idxmax()
print("Language with highest average stars:", top_star_language)

# Question 8: leader strength = followers / (1 + following); report the top five.
users_df["leader_strength"] = users_df["followers"].div(users_df["following"].add(1))
top_leader_strength = users_df.sort_values("leader_strength", ascending=False)["login"].iloc[:5]
print("Top 5 by leader strength:", ", ".join(top_leader_strength))

# Question 9: correlation between follower count and public repository count.
correlation_followers_repos = users_df["followers"].corr(users_df["public_repos"])
print("Correlation between followers and public repos:", round(correlation_followers_repos, 3))

# Question 10: slope of a linear regression of followers on public_repos.
from scipy.stats import linregress
slope, _, _, _, _ = linregress(users_df["public_repos"], users_df["followers"])
print("Followers per additional repo (slope):", round(slope, 3))

# Question 11: correlation between the has_projects and has_wiki flags.
correlation_projects_wiki = repos_df["has_projects"].corr(repos_df["has_wiki"])
print("Correlation between projects and wiki enabled:", round(correlation_projects_wiki, 3))

# Question 12: mean `following` of hireable users minus that of everyone else.
# A missing `hireable` counts as not hireable, consistent with Question 15.
# Comparing with `== False` left rows whose value is missing out of both groups,
# so the non-hireable mean was computed on too few rows (or was NaN).
# `notna()` is needed because casting NaN to bool yields True.
hireable_mask = users_df["hireable"].notna() & users_df["hireable"].astype(bool)
avg_following_hireable = users_df.loc[hireable_mask, "following"].mean()
avg_following_nonhireable = users_df.loc[~hireable_mask, "following"].mean()
following_difference = avg_following_hireable - avg_following_nonhireable
print("Difference in average following (hireable vs non-hireable):", round(following_difference, 3))

# Question 13: OLS slope of followers against bio length in words.
# Missing bios become empty strings (0 words) and are then filtered out.
users_df['bio'] = users_df['bio'].fillna('')
users_df['followers'] = users_df['followers'].fillna(0)
users_df['bio_word_count'] = users_df['bio'].str.split().str.len()
filtered_df = users_df.loc[users_df['bio_word_count'].gt(0)]
y = filtered_df['followers']
# add_constant supplies the intercept column for the regression.
X = sm.add_constant(filtered_df['bio_word_count'])
model = sm.OLS(y, X).fit()
slope = model.params['bio_word_count']
print("Regression slope of followers on bio word count:", round(slope, 3))


# Question 14: the five logins with the most repositories created on a
# Saturday or Sunday (dayofweek 5 or 6).
repos_df["created_at"] = pd.to_datetime(repos_df["created_at"])
repos_df["is_weekend"] = repos_df["created_at"].dt.dayofweek.isin([5, 6])
weekend_repo_counts = repos_df.loc[repos_df["is_weekend"]].groupby("login").size()
weekend_creators = weekend_repo_counts.nlargest(5).index.tolist()
print("Top 5 weekend creators:", ", ".join(weekend_creators))

# Question 15 -- note: this one was originally solved in Microsoft Excel using simple formulae
def analyze_email_and_hireable(df):
    """Compare how often hireable and non-hireable users share an email.

    A missing ``email`` counts as "no email" and a missing ``hireable`` counts
    as "not hireable". The input frame is not modified.

    Args:
        df: DataFrame with an ``email`` column and a ``hireable`` column.

    Returns:
        Tuple ``(fraction_hireable_with_email,
        fraction_non_hireable_with_email)``; a fraction is NaN when its group
        is empty.
    """
    # Build the masks on derived Series instead of `df[col].fillna(...,
    # inplace=True)`: that chained in-place call is deprecated and does not
    # update `df` under pandas copy-on-write, which would leave NaN values to
    # be cast to True by `astype(bool)`.
    has_email = df['email'].notna() & df['email'].ne('')
    is_hireable = df['hireable'].notna() & df['hireable'].astype(bool)

    fraction_hireable_with_email = has_email[is_hireable].mean()
    fraction_non_hireable_with_email = has_email[~is_hireable].mean()
    return fraction_hireable_with_email, fraction_non_hireable_with_email
# Question 15: run the email/hireable comparison on a fresh copy of users.csv.
df = pd.read_csv('users.csv')
fraction_hireable, fraction_non_hireable = analyze_email_and_hireable(df)
print("Fraction of hireable users with email:", fraction_hireable)
print("Fraction of non-hireable users with email:", fraction_non_hireable)
print("Difference in email sharing (hireable vs non-hireable):", fraction_hireable - fraction_non_hireable)

# Question 16: most common surname, taken as the last whitespace-separated
# word of `name`; ties are all reported, sorted alphabetically.
# A name made only of whitespace has no words, so it maps to None (ignored by
# value_counts) instead of raising IndexError on `[-1]`.
users_df['surname'] = users_df['name'].dropna().apply(
    lambda full_name: full_name.split()[-1] if full_name.split() else None
)
surname_counts = users_df['surname'].value_counts()
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()
print("The most common surname is:", ','.join(sorted(most_common_surnames)))