In [None]:
import getpass
import logging
import time
from typing import Any, Dict, List, Optional

import pandas as pd
import requests

class GitHubScraper:
    """Fetch GitHub user profiles and repository metadata via the REST API.

    Every request carries the personal access token given to the
    constructor. Requests rejected because of rate limiting are retried
    after waiting; any other failure is raised to the caller.
    """

    # Page size requested from every paginated endpoint.
    PER_PAGE = 100
    # The search API only serves the first 1000 matches of a query; asking
    # for a page beyond that is answered with an error instead of results.
    SEARCH_RESULT_CAP = 1000
    # Seconds after which a single HTTP call is abandoned instead of hanging.
    REQUEST_TIMEOUT = 30
    # Wait applied when a Retry-After header cannot be parsed as seconds.
    FALLBACK_WAIT = 60

    def __init__(self, token: str):
        """
        Initialize the GitHub scraper with your API token.

        Args:
            token (str): GitHub Personal Access Token
        """
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

        # Setup logging (basicConfig does nothing if the root logger already
        # has handlers, so creating several scrapers is harmless).
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _rate_limit_wait(status_code: int, headers, now: float) -> Optional[float]:
        """
        Decide whether a failed response is a rate limit and how long to wait.

        Args:
            status_code: HTTP status of the response.
            headers: Mapping of response headers (anything with ``.get``).
            now: Current Unix time in seconds.

        Returns:
            Seconds to sleep before retrying, or None when the response is
            not a rate-limit rejection and must be treated as an error.
        """
        if status_code not in (403, 429):
            return None

        # Secondary rate limits announce the wait explicitly.
        retry_after = headers.get('Retry-After')
        if retry_after is not None:
            try:
                return max(float(retry_after), 0) + 1
            except ValueError:
                return GitHubScraper.FALLBACK_WAIT

        # Primary rate limit: quota exhausted until the advertised reset time.
        if headers.get('X-RateLimit-Remaining') == '0':
            reset_time = int(headers.get('X-RateLimit-Reset', 0))
            return max(reset_time - now, 0) + 1

        # A 403/429 without rate-limit markers (bad token, blocked resource)
        # will never succeed on retry.
        return None

    def _make_request(self, url: str, params: dict = None) -> Dict:
        """
        Make a request to the GitHub API with rate limit handling.

        Args:
            url: Absolute URL of the endpoint.
            params: Optional query-string parameters.

        Returns:
            The decoded JSON body of a 200 response.

        Raises:
            requests.HTTPError: For any non-200 response that is not a
                rate-limit rejection.
            requests.RequestException: For transport failures, including a
                request exceeding REQUEST_TIMEOUT.
        """
        while True:
            response = requests.get(
                url,
                headers=self.headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code == 200:
                return response.json()

            sleep_time = self._rate_limit_wait(
                response.status_code, response.headers, time.time()
            )
            if sleep_time is not None:
                self.logger.warning(f"Rate limit hit. Sleeping for {sleep_time} seconds")
                time.sleep(sleep_time)
                continue

            self.logger.error(f"Error {response.status_code}: {response.text}")
            response.raise_for_status()
            # raise_for_status() is silent for non-error codes such as 204 or
            # 3xx; without this the loop would repeat the request forever.
            raise requests.HTTPError(
                f"Unexpected status {response.status_code} for {url}",
                response=response,
            )

    def clean_company_name(self, company: str) -> str:
        """
        Clean up company names according to specifications.

        Surrounding whitespace and leading ``@`` characters are removed and
        the result is upper-cased. A missing or empty company yields "".
        """
        if not company:
            return ""

        # Strip again after removing '@' so "@ acme" becomes "ACME", not " ACME".
        return company.strip().lstrip('@').strip().upper()

    def _user_record(self, user_data: Dict[str, Any]) -> Dict[str, Any]:
        """Reduce a full user payload to the columns written to users.csv."""
        hireable = user_data['hireable']
        return {
            'login': user_data['login'],
            'name': user_data['name'] or "",
            'company': self.clean_company_name(user_data.get('company')),
            'location': user_data['location'] or "",
            'email': user_data['email'] or "",
            # Only None is replaced; an explicit False stays False.
            'hireable': hireable if hireable is not None else False,
            'bio': user_data['bio'] or "",
            'public_repos': user_data['public_repos'],
            'followers': user_data['followers'],
            'following': user_data['following'],
            'created_at': user_data['created_at']
        }

    @staticmethod
    def _repo_record(username: str, repo: Dict[str, Any]) -> Dict[str, Any]:
        """Reduce a repository payload to the columns written to repositories.csv."""
        return {
            'login': username,  # Adding owner's login as required
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'] or "",
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            # The licence *key* (e.g. "mit") is stored under this column name.
            'license_name': repo['license']['key'] if repo.get('license') else ""
        }

    def search_users(self, location: str, min_followers: int) -> List[Dict]:
        """
        Search for GitHub users in a specific location with minimum followers.

        Each search hit is followed by a request for the full profile, so
        this issues one extra request per user.

        Args:
            location: Location to match; multi-word values are quoted.
            min_followers: Inclusive lower bound on the follower count.

        Returns:
            One cleaned record per user, in search-result order, limited to
            the first SEARCH_RESULT_CAP matches.
        """
        users = []
        page = 1

        # A multi-word location must be quoted, otherwise only its first word
        # is bound to the ``location:`` qualifier.
        needs_quotes = any(ch.isspace() for ch in location)
        location_term = f'"{location}"' if needs_quotes else location
        query = f"location:{location_term} followers:>={min_followers}"
        url = f"{self.base_url}/search/users"

        while True:
            self.logger.info(f"Fetching users page {page}")

            params = {
                'q': query,
                'per_page': self.PER_PAGE,
                'page': page
            }
            response = self._make_request(url, params)

            items = response['items']
            if not items:
                break

            for user in items:
                user_data = self._make_request(user['url'])
                users.append(self._user_record(user_data))

            # Stop at the search cap instead of requesting a page that the
            # API would reject.
            if page * self.PER_PAGE >= self.SEARCH_RESULT_CAP:
                if response.get('total_count', 0) > self.SEARCH_RESULT_CAP:
                    self.logger.warning(
                        f"Search matched {response['total_count']} users; "
                        f"only the first {self.SEARCH_RESULT_CAP} are available"
                    )
                break

            page += 1

        return users

    def get_user_repositories(self, username: str, max_repos: int = 500) -> List[Dict]:
        """
        Get repositories for a specific user.

        Args:
            username: Login of the repository owner.
            max_repos: Maximum number of repositories to return.

        Returns:
            Up to ``max_repos`` repository records, most recently pushed first.
        """
        repos = []
        page = 1
        url = f"{self.base_url}/users/{username}/repos"

        while len(repos) < max_repos:
            self.logger.info(f"Fetching repositories for {username}, page {page}")

            params = {
                'sort': 'pushed',
                'direction': 'desc',
                'per_page': self.PER_PAGE,
                'page': page
            }
            response = self._make_request(url, params)

            if not response:
                break

            repos.extend(self._repo_record(username, repo) for repo in response)

            # A short page means there is nothing further to fetch.
            if len(response) < self.PER_PAGE:
                break

            page += 1

        return repos[:max_repos]

def main():
    """Scrape Seattle users with at least 200 followers and their repositories.

    Prompts for a GitHub token, writes the user records to ``users.csv`` and
    the repository records to ``repositories.csv``, then prints a summary.
    Returns early, writing nothing, when no token is entered.
    """
    # Read the token without echoing it, so the secret does not end up in
    # terminal scrollback or saved notebook output.
    token = getpass.getpass("Enter your GitHub token: ").strip()
    if not token:
        print("Token is required. Exiting...")
        return

    # Initialize scraper
    scraper = GitHubScraper(token)

    # Search for users in Seattle with >= 200 followers
    users = scraper.search_users(location='Seattle', min_followers=200)

    # Save users to CSV
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)

    # Get repositories for each user
    all_repos = []
    for user in users:
        all_repos.extend(scraper.get_user_repositories(user['login']))

    # Save repositories to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)

    print(f"Scraped {len(users)} users and {len(all_repos)} repositories")


if __name__ == "__main__":
    main()

In [28]:
import csv

# Collect login and follower count for every user whose location mentions
# Seattle (case-insensitive), reading users.csv as UTF-8.
with open('users.csv', 'r', encoding='utf-8') as file:
    users_in_seattle = [
        {'login': row['login'], 'followers': int(row['followers'])}
        for row in csv.DictReader(file)
        if 'seattle' in row['location'].strip().lower()
    ]

# Order a copy by follower count, largest first
top_users = list(users_in_seattle)
top_users.sort(key=lambda entry: entry['followers'], reverse=True)

# Keep the logins of the five most-followed users
top_5_logins = [entry['login'] for entry in top_users[:5]]

# Emit them as one comma-separated line
print(','.join(top_5_logins))


vczh,bradfitz,munificent,tenderlove,koush


In [29]:
# Parse the registration timestamps in place so they sort chronologically
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Oldest accounts first; keep the first five logins
oldest_first = users_df.sort_values('created_at')
earliest_5_users = oldest_first['login'].head(5).tolist()

print(','.join(earliest_5_users))


topfunky,nex3,beccasaurus,eric,grantr


In [30]:
repos_df = pd.read_csv('repositories.csv')

# Repositories without a licence are left out of the tally
known_licenses = repos_df['license_name'].dropna()
license_counts = known_licenses.value_counts().iloc[:3]

print(','.join(license_counts.index))


mit,apache-2.0,other


In [31]:
# Ignore missing or empty company fields. A missing company is NaN when the
# frame was loaded from users.csv, but an empty string when the frame holds
# the scraper's records directly (clean_company_name returns "" for a missing
# company), so both forms are excluded before counting.
companies = users_df['company'].dropna().astype(str).str.strip()
company_counts = companies[companies != ''].value_counts()

# Get the company with the most users
majority_company = company_counts.idxmax()

print(majority_company)


MICROSOFT


In [32]:
# Tally repositories per language, leaving out rows with no language
known_languages = repos_df['language'].dropna()
language_counts = known_languages.value_counts()

# The label with the largest tally is the most popular language
most_popular_language = language_counts.idxmax()

print(most_popular_language)


JavaScript


In [33]:
# Users whose account was created on or after 1 January 2020
joined_since_2020 = users_df['created_at'] >= '2020-01-01'
users_after_2020 = users_df[joined_since_2020]

# Their logins
logins_after_2020 = users_after_2020['login'].tolist()

# Repositories owned by those users
owned_by_recent_user = repos_df['login'].isin(logins_after_2020)
repos_after_2020 = repos_df[owned_by_recent_user]

# Language tallies, most common first
language_counts_after_2020 = repos_after_2020['language'].value_counts()

# Position 1 is the runner-up (raises IndexError with fewer than two languages)
second_most_popular_language = language_counts_after_2020.index[1]

print(second_most_popular_language)


Python


In [34]:
# Mean star count of the repositories in each language
stars = repos_df['stargazers_count']
avg_stars_by_language = stars.groupby(repos_df['language']).mean()

# Language whose repositories have the highest mean star count
language_highest_avg_stars = avg_stars_by_language.idxmax()

print(language_highest_avg_stars)


Haml


In [35]:
# leader_strength = followers / (1 + following); the +1 avoids dividing by zero
following_plus_one = 1 + users_df['following']
users_df['leader_strength'] = users_df['followers'] / following_plus_one

# Rank users from strongest to weakest and keep the first five logins
ranked_users = users_df.sort_values('leader_strength', ascending=False)
top_5_leader_strength = ranked_users['login'].head(5).tolist()

print(','.join(top_5_leader_strength))


awslabs,mission-peace,karan,cmuratori,nex3


In [36]:
# Correlation between follower count and public repository count
follower_counts = users_df['followers']
public_repo_counts = users_df['public_repos']
correlation = follower_counts.corr(public_repo_counts)

print(f'{correlation:.3f}')


0.200


In [37]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Single-feature design matrix (public repos) and the target (followers)
X = users_df[['public_repos']].to_numpy()
y = users_df['followers'].to_numpy()

# Fit an ordinary least-squares line
reg_model = LinearRegression()
reg_model.fit(X, y)

# The only coefficient is the slope: extra followers per additional public repo
slope = reg_model.coef_[0]

print(f'{slope:.3f}')


2.435


In [49]:
# Correlation between the "projects enabled" and "wiki enabled" flags
projects_enabled = repos_df['has_projects']
wiki_enabled = repos_df['has_wiki']
projects_wiki_corr = projects_enabled.corr(wiki_enabled)

print(f'{projects_wiki_corr:.3f}')


0.311


In [47]:
import pandas as pd

def analyze_following_difference(users_csv_path='users.csv'):
    """Compare how many accounts hireable and other users follow.

    Loads the users CSV, averages the ``following`` column separately for
    rows where ``hireable`` is True and for all remaining rows, prints the
    group sizes and averages, and returns the hireable average minus the
    other average, rounded to three decimals.
    """
    users = pd.read_csv(users_csv_path)

    # Split the rows once; everything that is not exactly True counts as
    # non-hireable.
    hireable_rows = users[users['hireable'] == True]
    other_rows = users[users['hireable'] != True]

    hireable_following = hireable_rows['following'].mean()
    non_hireable_following = other_rows['following'].mean()

    # Debug output
    print(f"Number of hireable users: {len(hireable_rows)}")
    print(f"Number of non-hireable users: {len(other_rows)}")
    print(f"Average following for hireable users: {hireable_following:.3f}")
    print(f"Average following for non-hireable users: {non_hireable_following:.3f}")

    return round(hireable_following - non_hireable_following, 3)

# Run the comparison against users.csv and report the rounded difference
result = analyze_following_difference(users_csv_path='users.csv')
print(f"\nDifference in average following: {result:.3f}")

Number of hireable users: 117
Number of non-hireable users: 405
Average following for hireable users: 69.923
Average following for non-hireable users: 56.625

Difference in average following: 13.298


In [46]:
# Length of each bio in characters; a missing bio counts as length 0
users_df['bio_length'] = users_df['bio'].apply(
    lambda bio: 0 if pd.isnull(bio) else len(str(bio))
)

# Correlation between bio length and follower count
bio_lengths = users_df['bio_length']
bio_followers_corr = bio_lengths.corr(users_df['followers'])

print(f'{bio_followers_corr:.3f}')


Number of users with bios: 351
Bio length range: 8 to 160
Followers range: 200 to 17512
R-squared: 0.001

Regression slope: 1.040


In [41]:
# Parse repository creation timestamps in place
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# weekday numbers Monday as 0, so 5 and 6 are Saturday and Sunday
created_on_weekend = repos_df['created_at'].dt.weekday >= 5
repos_on_weekends = repos_df[created_on_weekend]

# Five owners with the most weekend-created repositories
weekend_repo_counts = repos_on_weekends['login'].value_counts().iloc[:5]

print(','.join(weekend_repo_counts.index))


nolanlawson,homebysix,ingydotnet,anvaka,moznion


In [45]:
import pandas as pd

def analyze_email_sharing(users_csv_path='users.csv'):
    """Compare how often hireable and other users publish an email address.

    Loads the users CSV, marks each row as having an email when the field is
    neither missing nor empty, prints per-group counts and fractions, and
    returns the hireable fraction minus the other fraction, rounded to three
    decimals. A group with no rows contributes a fraction of 0.
    """
    users = pd.read_csv(users_csv_path)

    # True when an address is present (not NaN and not the empty string)
    emails = users['email']
    users['has_email'] = emails.notna() & (emails != '')

    # Anything that is not exactly True counts as non-hireable
    hireable_mask = users['hireable'] == True
    non_hireable_mask = users['hireable'] != True

    def email_fraction(mask):
        # Share of the selected rows with an email; 0 for an empty selection
        if not mask.any():
            return 0
        return users[mask]['has_email'].mean()

    hireable_email_fraction = email_fraction(hireable_mask)
    non_hireable_email_fraction = email_fraction(non_hireable_mask)

    # Debug output
    hireable_with_email = users[hireable_mask]['has_email'].sum()
    others_with_email = users[non_hireable_mask]['has_email'].sum()
    print(f"Total users: {len(users)}")
    print(f"Hireable users with email: {hireable_with_email}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {others_with_email}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return round(hireable_email_fraction - non_hireable_email_fraction, 3)

# Run the email-sharing comparison against users.csv and report the result
result = analyze_email_sharing(users_csv_path='users.csv')
print(f"\nFinal result: {result:.3f}")

Total users: 522
Hireable users with email: 71/117
Non-hireable users with email: 210/405
Hireable fraction: 0.607
Non-hireable fraction: 0.519

Final result: 0.088


In [44]:
import csv
from collections import Counter

# Frequency of each surname, taken as the last whitespace-separated word of the name
surname_counter = Counter()

# Stream users.csv and tally the surname of every user that has a name
with open('users.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        name = row.get('name', '').strip()
        if not name:
            # Missing names are ignored
            continue
        surname_counter[name.split()[-1]] += 1

if not surname_counter:
    print("No names found.")
else:
    # Highest frequency seen, and every surname that reaches it (alphabetical)
    max_count = max(surname_counter.values())
    most_common_surnames = sorted(
        surname for surname, count in surname_counter.items() if count == max_count
    )
    print(f"{', '.join(most_common_surnames)}: {max_count}")


Wang: 6
