In [5]:
import requests
import pandas as pd
import time
import logging
from typing import List, Dict, Any

class GitHubScraper:
    def __init__(self, token: str):
        """
        Initialize the GitHub scraper with your API token.
        
        Args:
            token (str): GitHub Personal Access Token
        """
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'
        
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _make_request(self, url: str, params: dict = None) -> Dict:
        """
        Make a request to the GitHub API with rate limit handling.
        """
        while True:
            response = requests.get(url, headers=self.headers, params=params)
            
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                self.logger.warning(f"Rate limit hit. Sleeping for {sleep_time} seconds")
                time.sleep(sleep_time)
            else:
                self.logger.error(f"Error {response.status_code}: {response.text}")
                response.raise_for_status()

    def clean_company_name(self, company: str) -> str:
        """
        Clean up company names according to specifications.
        """
        if not company:
            return ""
        
        # Strip whitespace and @ symbol
        cleaned = company.strip().lstrip('@')
        
        # Convert to uppercase
        return cleaned.upper()

    def search_users(self, location: str, min_followers: int) -> List[Dict]:
        """
        Search for GitHub users in a specific location with minimum followers.
        """
        users = []
        page = 1
        
        while True:
            self.logger.info(f"Fetching users page {page}")
            
            query = f"location:{location} followers:>={min_followers}"
            params = {
                'q': query,
                'per_page': 100,
                'page': page
            }
            
            url = f"{self.base_url}/search/users"
            response = self._make_request(url, params)
            
            if not response['items']:
                break
                
            for user in response['items']:
                user_data = self._make_request(user['url'])
                
                # Extract only the required fields with exact matching names
                cleaned_data = {
                    'login': user_data['login'],
                    'name': user_data['name'] if user_data['name'] else "",
                    'company': self.clean_company_name(user_data.get('company')),
                    'location': user_data['location'] if user_data['location'] else "",
                    'email': user_data['email'] if user_data['email'] else "",
                    'hireable': user_data['hireable'] if user_data['hireable'] is not None else False,
                    'bio': user_data['bio'] if user_data['bio'] else "",
                    'public_repos': user_data['public_repos'],
                    'followers': user_data['followers'],
                    'following': user_data['following'],
                    'created_at': user_data['created_at']
                }
                
                users.append(cleaned_data)
                
            page += 1
            
        return users

    def get_user_repositories(self, username: str, max_repos: int = 500) -> List[Dict]:
        """
        Get repositories for a specific user.
        """
        repos = []
        page = 1
        
        while len(repos) < max_repos:
            self.logger.info(f"Fetching repositories for {username}, page {page}")
            
            params = {
                'sort': 'pushed',
                'direction': 'desc',
                'per_page': 100,
                'page': page
            }
            
            url = f"{self.base_url}/users/{username}/repos"
            response = self._make_request(url, params)
            
            if not response:
                break
                
            for repo in response:
                # Extract only the required fields with exact matching names
                repo_data = {
                    'login': username,  # Adding owner's login as required
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo['language'] if repo['language'] else "",
                    'has_projects': repo['has_projects'],
                    'has_wiki': repo['has_wiki'],
                    'license_name': repo['license']['key'] if repo.get('license') else ""
                }
                
                repos.append(repo_data)
                
            if len(response) < 100:
                break
                
            page += 1
            
        return repos[:max_repos]

def main():
    # Get GitHub token
    token = input("Enter your GitHub token: ").strip()
    if not token:
        print("Token is required. Exiting...")
        return

    # Initialize scraper
    scraper = GitHubScraper(token)
    
    # Search for users in Delhi with >100 followers
    users = scraper.search_users(location='melbourne', min_followers=100)
    
    # Save users to CSV
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)
    
    # Get repositories for each user
    all_repos = []
    for user in users:
        repos = scraper.get_user_repositories(user['login'])
        all_repos.extend(repos)
    
    # Save repositories to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)
    
    print(f"Scraped {len(users)} users and {len(all_repos)} repositories")
    
    # Create README.md
    with open('README.md', 'w') as f:
        f.write(f"""# GitHub Users in melbourne

This repository contains data about GitHub users in melbourne with over 100 followers and their repositories.

## Files

1. `users.csv`: Contains information about {len(users)} GitHub users in melbourne with over 100 followers
2. `repositories.csv`: Contains information about {len(all_repos)} public repositories from these users
3. `gitscrap.py`: Python script used to collect this data

## Data Collection

- Data collected using GitHub API
- Date of collection: {time.strftime('%Y-%m-%d')}
- Only included users with 100+ followers
- Up to 500 most recently pushed repositories per user
""")

if __name__ == "__main__":
    main()

Enter your GitHub token:  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


2024-10-30 15:42:48,589 - INFO - Fetching users page 1
2024-10-30 15:44:10,326 - INFO - Fetching users page 2
2024-10-30 15:45:31,300 - INFO - Fetching users page 3
2024-10-30 15:46:54,275 - INFO - Fetching users page 4
2024-10-30 15:47:25,423 - INFO - Fetching users page 5
2024-10-30 15:47:26,260 - INFO - Fetching repositories for mosh-hamedani, page 1
2024-10-30 15:47:27,212 - INFO - Fetching repositories for TheCherno, page 1
2024-10-30 15:47:28,314 - INFO - Fetching repositories for haileys, page 1
2024-10-30 15:47:29,884 - INFO - Fetching repositories for haileys, page 2
2024-10-30 15:47:31,796 - INFO - Fetching repositories for haileys, page 3
2024-10-30 15:47:33,216 - INFO - Fetching repositories for haileys, page 4
2024-10-30 15:47:34,659 - INFO - Fetching repositories for rstacruz, page 1
2024-10-30 15:47:36,603 - INFO - Fetching repositories for rstacruz, page 2
2024-10-30 15:47:39,087 - INFO - Fetching repositories for rstacruz, page 3
2024-10-30 15:47:40,519 - INFO - Fetchi

Scraped 337 users and 29237 repositories


In [1]:
import requests

GITHUB_TOKEN = "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"  
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_top_users_in_melbourne():
    users = []
    query = "location:melbourne+followers:>100"
    page = 1
    per_page = 100

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        
        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

    
    detailed_users = []
    for user in users:
        user_detail_url = user['url']
        user_detail_response = requests.get(user_detail_url, headers=HEADERS)
        if user_detail_response.status_code == 200:
            user_detail = user_detail_response.json()
            detailed_users.append({
                'login': user_detail['login'],
                'followers': user_detail['followers']
            })
        else:
            print(f"Error fetching user details for {user['login']}: {user_detail_response.json()}")

    # Sort users by followers count in descending order and get the top 5
    top_users = sorted(detailed_users, key=lambda x: x['followers'], reverse=True)[:5]
    return [user['login'] for user in top_users]

# Main process
if __name__ == "__main__":
    top_users = get_top_users_in_melbourne()
    print(", ".join(top_users))


mosh-hamedani, TheCherno, haileys, rstacruz, jesseduffield


In [3]:
#q2.
import requests
from datetime import datetime

GITHUB_TOKEN = "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"  
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_earliest_users_in_melbourne():
    users = []
    query = "location:melbourne+followers:>100"
    page = 1
    per_page = 100

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        
        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

  
    detailed_users = []
    for user in users:
        user_detail_url = user['url']
        user_detail_response = requests.get(user_detail_url, headers=HEADERS)
        if user_detail_response.status_code == 200:
            user_detail = user_detail_response.json()
            detailed_users.append({
                'login': user_detail['login'],
                'created_at': datetime.strptime(user_detail['created_at'], "%Y-%m-%dT%H:%M:%SZ")
            })
        else:
            print(f"Error fetching user details for {user['login']}: {user_detail_response.json()}")

    # Sort users by created_at date in ascending order and get the earliest 5
    earliest_users = sorted(detailed_users, key=lambda x: x['created_at'])[:5]
    return [user['login'] for user in earliest_users]

# Main process
if __name__ == "__main__":
    earliest_users = get_earliest_users_in_melbourne()
    print(", ".join(earliest_users))


toolmantim, crafterm, dgoodlad, Sutto, mdub


In [4]:
#q3.
import requests
from collections import Counter

GITHUB_TOKEN = "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"  
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_users_in_melbourne():
    users = []
    query = "location:melbourne+followers:>100"
    page = 1
    per_page = 100

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)

        if response.status_code != 200:
            print("Error fetching users:", response.json())
            break

        data = response.json()
        users.extend([user['login'] for user in data['items']])

        if len(data['items']) < per_page:
            break

        page += 1

    return users

def get_repositories_licenses(users):
    license_counter = Counter()

    for username in users:
        page = 1
        while True:
            url = f"https://api.github.com/users/{username}/repos?per_page=100&page={page}"
            response = requests.get(url, headers=HEADERS)
            if response.status_code != 200:
                print(f"Error fetching repos for {username}:", response.json())
                break

            repos = response.json()
            if not repos:
                break

           
            for repo in repos:
                license_name = repo['license']['key'] if repo.get('license') else None
                if license_name:
                    license_counter[license_name] += 1

            if len(repos) < 100:
                break

            page += 1

    
    top_licenses = [license for license, _ in license_counter.most_common(3)]
    return top_licenses


if __name__ == "__main__":
    users = get_users_in_melbourne()
    top_licenses = get_repositories_licenses(users)
    print(", ".join(top_licenses))


mit, other, apache-2.0


In [13]:
#q4import pandas as pd
from collections import Counter


users_df = pd.read_csv('users.csv')


company_counts = Counter(users_df['company'].dropna().str.strip().str.upper())

most_common_company = company_counts.most_common(1)


if most_common_company:
    print(f"The company with the majority of developers is: {most_common_company[0][0]}")
else:
    print("No company information available for majority analysis.")


The company with the majority of developers is: MONASH UNIVERSITY


In [7]:
#q5.
import pandas as pd
from collections import Counter


repos_df = pd.read_csv('repositories.csv')


language_counts = Counter(repos_df['language'].dropna())


most_common_language = language_counts.most_common(1)


if most_common_language:
    print(f"The most popular programming language is: {most_common_language[0][0]}")
else:
    print("No programming language information available for analysis.")


The most popular programming language is: JavaScript


In [8]:
#q6.
import pandas as pd
from collections import Counter


users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')


users_after_2020 = users_df[pd.to_datetime(users_df['created_at']) > '2020-01-01']


repos_after_2020 = repos_df[repos_df['login'].isin(users_after_2020['login'])]


language_counts = Counter(repos_after_2020['language'].dropna())


second_most_common_language = language_counts.most_common(2)


if len(second_most_common_language) > 1:
    print(f"The second most popular programming language is: {second_most_common_language[1][0]}")
else:
    print("Not enough data to determine the second most popular programming language.")


The second most popular programming language is: JavaScript


In [9]:
#q7.
import pandas as pd


repos_df = pd.read_csv('repositories.csv')

repos_df = repos_df.dropna(subset=['language', 'stargazers_count'])


average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()


top_language = average_stars_per_language.idxmax()
top_average_stars = average_stars_per_language.max()

print(f"The language with the highest average stars per repository is: {top_language} with an average of {top_average_stars:.2f} stars.")


The language with the highest average stars per repository is: D with an average of 2521.00 stars.


In [10]:
#q8.
import pandas as pd

# Load the user data
users_df = pd.read_csv('users.csv')


users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])


top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)


top_leader_logins = top_leaders['login'].tolist()


print(", ".join(top_leader_logins))


mosh-hamedani, binarythistle, TheCherno, TuPayChain, rogerclarkmelbourne


In [12]:
#q9.
import pandas as pd


users_df = pd.read_csv('users.csv')


correlation = users_df['followers'].corr(users_df['public_repos'])


print(f"The correlation between the number of followers and public repositories is: {correlation:.3f}")


The correlation between the number of followers and public repositories is: 0.189


In [13]:
#q10.
import pandas as pd
import statsmodels.api as sm


users_df = pd.read_csv('users.csv')

X = users_df['public_repos']
y = users_df['followers']


X = sm.add_constant(X)


model = sm.OLS(y, X).fit()


summary = model.summary()


followers_per_repo = model.params['public_repos']
print(f"Estimated additional followers per additional public repository: {followers_per_repo:.3f}")


print(summary)


Estimated additional followers per additional public repository: 2.257
                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     12.46
Date:                Wed, 30 Oct 2024   Prob (F-statistic):           0.000475
Time:                        21:11:40   Log-Likelihood:                -2854.4
No. Observations:                 337   AIC:                             5713.
Df Residuals:                     335   BIC:                             5720.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------

In [34]:
#q11.
import pandas as pd


repos = pd.read_csv('repositories.csv')


repos['has_projects'] = repos['has_projects'].astype(int)
repos['has_wiki'] = repos['has_wiki'].astype(int)


correlation = repos['has_projects'].corr(repos['has_wiki'])


print(f"Correlation between having projects and having wiki: {correlation:.3f}")


Correlation between having projects and having wiki: 0.379


In [42]:
#q12.
import pandas as pd


users = pd.read_csv('users.csv')


average_hireable_following = users.loc[users['hireable'] == True, 'following'].mean()


average_non_hireable_following = users.loc[users['hireable'] == False, 'following'].mean()


difference = average_hireable_following - average_non_hireable_following


print(f"Difference in average following: {difference:.3f}")


Difference in average following: -46.186


In [32]:
#q13.
import pandas as pd
import numpy as np
import statsmodels.api as sm


users = pd.read_csv('users.csv')


users_with_bios = users[users['bio'].notnull() & (users['bio'].str.strip() != '')]


users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


X = users_with_bios['bio_word_count']
y = users_with_bios['followers']


X = sm.add_constant(X)


model = sm.OLS(y, X).fit()


slope = model.params['bio_word_count']


print(f"Slope of followers on bio word count: {slope:.3f}")


Slope of followers on bio word count: 7.708


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


In [23]:
#14.
import pandas as pd


repos = pd.read_csv('repositories.csv')


repos['created_at'] = pd.to_datetime(repos['created_at'])


repos['day_of_week'] = repos['created_at'].dt.dayofweek
weekend_repos = repos[repos['day_of_week'].isin([5, 6])]

user_repo_counts = weekend_repos['login'].value_counts()


top_users = user_repo_counts.head(5).index.tolist()


print(", ".join(top_users))


wolfeidau, karkranikhil, roachhd, plutext, rstacruz


In [2]:
#q15.
import pandas as pd


users = pd.read_csv('users.csv')


hireable_with_email = users[(users['hireable'] == True) & (users['email'].notna()) & (users['email'] != '')]
fraction_hireable_with_email = len(hireable_with_email) / len(users[users['hireable'] == True])


non_hireable_with_email = users[(users['hireable'] == False) & (users['email'].notna()) & (users['email'] != '')]
fraction_non_hireable_with_email = len(non_hireable_with_email) / len(users[users['hireable'] == False])

difference = fraction_hireable_with_email - fraction_non_hireable_with_email


print(f"Difference in email sharing (hireable - non-hireable): {difference:.3f}")


Difference in email sharing (hireable - non-hireable): 0.056


In [3]:
#q16.
import pandas as pd
from collections import Counter


users = pd.read_csv('users.csv')


surnames = users['name'].dropna().apply(lambda x: x.strip().split()[-1])


surname_counts = Counter(surnames)


max_count = max(surname_counts.values())


most_common_surnames = sorted([surname for surname, count in surname_counts.items() if count == max_count])


print(f"Most common surname(s): {', '.join(most_common_surnames)}")


Most common surname(s): Jackson, Wang
