In [2]:
# Cell 1: Import Libraries
import json
import time
from getpass import getpass
from urllib.parse import quote

import requests

In [9]:
# Cell 2: Authentication
# Read the token interactively so it never appears in the notebook source.
# Pasted tokens often carry stray whitespace or a trailing newline; strip it so
# the value placed in the Authorization header is exactly the token.
GITHUB_TOKEN = getpass('Enter your GitHub Personal Access Token: ').strip()
if not GITHUB_TOKEN:
    # Fail here rather than on the first API call, where the cause is less obvious.
    raise ValueError('No GitHub token entered; re-run this cell and paste a token.')

Enter your GitHub Personal Access Token: ··········


In [10]:
# Cell 3: Set up headers
# Request headers shared by the helper functions in this notebook: token
# authentication, the GitHub JSON media type, and a pinned REST API version.
headers = dict([
    ('Authorization', f'token {GITHUB_TOKEN}'),
    ('Accept', 'application/vnd.github+json'),
    ('X-GitHub-Api-Version', '2022-11-28'),
])

In [11]:
# Cell 4: Function to search repositories
def search_repositories(query, per_page=30, page=1, timeout=30):
    """Fetch one page of results from GitHub's repository search endpoint.

    Args:
        query: Search string sent as the ``q`` parameter.
        per_page: Number of results requested per page.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = 'https://api.github.com/search/repositories'
    params = {
        'q': query,
        'per_page': per_page,
        'page': page
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()


In [12]:
# Cell 5: Test search_repositories function
# Smoke test: run a single search and pretty-print the raw JSON payload.
query = 'machine learning'
result = search_repositories(query=query)
print(json.dumps(result, indent=2))


{
  "total_count": 715589,
  "incomplete_results": false,
  "items": [
    {
      "id": 21872392,
      "node_id": "MDEwOlJlcG9zaXRvcnkyMTg3MjM5Mg==",
      "name": "awesome-machine-learning",
      "full_name": "josephmisiti/awesome-machine-learning",
      "private": false,
      "owner": {
        "login": "josephmisiti",
        "id": 246302,
        "node_id": "MDQ6VXNlcjI0NjMwMg==",
        "avatar_url": "https://avatars.githubusercontent.com/u/246302?v=4",
        "gravatar_id": "",
        "url": "https://api.github.com/users/josephmisiti",
        "html_url": "https://github.com/josephmisiti",
        "followers_url": "https://api.github.com/users/josephmisiti/followers",
        "following_url": "https://api.github.com/users/josephmisiti/following{/other_user}",
        "gists_url": "https://api.github.com/users/josephmisiti/gists{/gist_id}",
        "starred_url": "https://api.github.com/users/josephmisiti/starred{/owner}{/repo}",
        "subscriptions_url": "https://api.g

In [13]:
# Cell 6: Function to list commits
def list_commits(owner, repo, per_page=30, page=1, timeout=30):
    """Fetch one page of commits for a repository.

    Args:
        owner: Account that owns the repository.
        repo: Repository name.
        per_page: Number of commits requested per page.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f'https://api.github.com/repos/{owner}/{repo}/commits'
    params = {
        'per_page': per_page,
        'page': page
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [14]:
# Cell 7: Test list_commits function
# Any GitHub username / repository name pair can be substituted here.
owner, repo = 'octocat', 'Hello-World'
commits = list_commits(owner=owner, repo=repo)
print(json.dumps(commits, indent=2))

[
  {
    "sha": "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d",
    "node_id": "MDY6Q29tbWl0MTI5NjI2OTo3ZmQxYTYwYjAxZjkxYjMxNGY1OTk1NWE0ZTRkNGU4MGQ4ZWRmMTFk",
    "commit": {
      "author": {
        "name": "The Octocat",
        "email": "octocat@nowhere.com",
        "date": "2012-03-06T23:06:50Z"
      },
      "committer": {
        "name": "The Octocat",
        "email": "octocat@nowhere.com",
        "date": "2012-03-06T23:06:50Z"
      },
      "message": "Merge pull request #6 from Spaceghost/patch-1\n\nNew line at end of file.",
      "tree": {
        "sha": "b4eecafa9be2f2006ce1b709d6857b07069b4608",
        "url": "https://api.github.com/repos/octocat/Hello-World/git/trees/b4eecafa9be2f2006ce1b709d6857b07069b4608"
      },
      "url": "https://api.github.com/repos/octocat/Hello-World/git/commits/7fd1a60b01f91b314f59955a4e4d4e80d8edf11d",
      "comment_count": 86,
      "verification": {
        "verified": false,
        "reason": "unsigned",
        "signature": null,
   

In [15]:
# Cell 8: Function to get repository contents
def get_repository_contents(owner, repo, path='', timeout=30):
    """Fetch the contents listing for a path inside a repository.

    Args:
        owner: Account that owns the repository.
        repo: Repository name.
        path: Raw (not percent-encoded) path inside the repository. An empty
            string requests the repository root.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f'https://api.github.com/repos/{owner}/{repo}/contents'
    # Surrounding slashes would produce '//' or a dangling '/' in the URL.
    relative_path = path.strip('/')
    if relative_path:
        # Percent-encode each segment ('/' is kept) so characters such as '#'
        # or '?' in a file name are not parsed as a URL fragment or query.
        url = f'{url}/{quote(relative_path)}'
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [16]:
# Cell 9: Test get_repository_contents function
# Any GitHub username / repository name pair can be substituted here.
owner, repo = 'octocat', 'Hello-World'
contents = get_repository_contents(owner=owner, repo=repo)
print(json.dumps(contents, indent=2))

[
  {
    "name": "README",
    "path": "README",
    "sha": "980a0d5f19a64b4b30a87d4206aade58726b60e3",
    "size": 13,
    "url": "https://api.github.com/repos/octocat/Hello-World/contents/README?ref=master",
    "html_url": "https://github.com/octocat/Hello-World/blob/master/README",
    "git_url": "https://api.github.com/repos/octocat/Hello-World/git/blobs/980a0d5f19a64b4b30a87d4206aade58726b60e3",
    "download_url": "https://raw.githubusercontent.com/octocat/Hello-World/master/README",
    "type": "file",
    "_links": {
      "self": "https://api.github.com/repos/octocat/Hello-World/contents/README?ref=master",
      "git": "https://api.github.com/repos/octocat/Hello-World/git/blobs/980a0d5f19a64b4b30a87d4206aade58726b60e3",
      "html": "https://github.com/octocat/Hello-World/blob/master/README"
    }
  }
]


In [17]:
# Cell 10: Function to fetch all pages
def fetch_all_pages(url, params=None, timeout=30):
    """Follow ``Link: rel="next"`` pagination and collect every result.

    Args:
        url: URL of the first page.
        params: Query parameters for the first request only. Later requests
            use the ``next`` URL from the response, which is requested as-is.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A flat list of results gathered from all pages. For a JSON object with
        an ``items`` key, the entries of ``items`` are collected; for a JSON
        array, its elements are collected; any other JSON object is collected
        as a single entry.

    Raises:
        requests.exceptions.HTTPError: If any response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    items = []
    while url:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if isinstance(data, dict):
            if 'items' in data:
                items.extend(data['items'])
            else:
                # Extending with a dict would add its keys, not the object itself.
                items.append(data)
        else:
            items.extend(data)

        next_link = response.links.get('next')
        url = next_link['url'] if next_link else None
        # The next URL is requested without re-sending the original params.
        params = None
    return items

In [18]:
# Cell 11: Test fetch_all_pages function
# Walk every page of a repository search and report how many results came back.
query = 'machine learning'
url = 'https://api.github.com/search/repositories'
params = dict(q=query, per_page=100)
all_repos = fetch_all_pages(url, params=params)
print(f'Total repositories fetched: {len(all_repos)}')

Total repositories fetched: 1000


In [19]:
# Cell 12: Function to check rate limit
def check_rate_limit(resource='core', timeout=30):
    """Report the remaining quota and reset time for one rate-limit resource.

    Args:
        resource: Key under ``resources`` in the ``/rate_limit`` response.
            Defaults to ``'core'``; pass e.g. ``'search'`` to inspect the quota
            GitHub tracks separately for the search endpoints.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(remaining, reset_timestamp)`` tuple, where ``remaining`` is the
        value reported by the API and ``reset_timestamp`` is the reset time
        formatted as ``YYYY-MM-DD HH:MM:SS`` in the local timezone.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        KeyError: If ``resource`` is not present in the response.
    """
    url = 'https://api.github.com/rate_limit'
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    bucket = response.json()['resources'][resource]
    remaining = bucket['remaining']
    # 'reset' is passed to time.localtime, i.e. treated as epoch seconds.
    reset_timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(bucket['reset']))
    return remaining, reset_timestamp

In [20]:
# Cell 13: Test check_rate_limit function
# Show the remaining quota and when it resets, one fact per line.
remaining, reset_time = check_rate_limit()
print(
    f'Requests remaining: {remaining}',
    f'Rate limit resets at: {reset_time}',
    sep='\n',
)

Requests remaining: 4994
Rate limit resets at: 2024-11-26 17:39:02


In [21]:
# Cell 14: Safe request function with error handling
def safe_request(url, params=None, timeout=30):
    """GET ``url`` and return the response, or ``None`` if anything fails.

    Failures are reported with ``print`` rather than raised, so callers only
    need to check for ``None``.

    Args:
        url: URL to request.
        params: Optional query parameters.
        timeout: Seconds to wait for the server before giving up. A timeout is
            reported like any other failure and yields ``None``.

    Returns:
        The ``requests`` response when the status is not 4xx/5xx, else ``None``.
    """
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response
    except requests.exceptions.HTTPError as http_err:
        # Read the status from the exception itself. Compare against None
        # explicitly: a requests Response for a 4xx/5xx status is falsy.
        failed_response = http_err.response
        status_code = failed_response.status_code if failed_response is not None else None
        if status_code == 401:
            print('Error 401: Unauthorized. Check your authentication token.')
        elif status_code == 403:
            print('Error 403: Forbidden. You might have hit the rate limit.')
            # The rate-limit lookup is itself a request and can fail; an
            # exception raised here would otherwise escape this function.
            try:
                remaining, reset_time = check_rate_limit()
            except Exception as rate_err:
                print(f'Could not check the rate limit: {rate_err}')
            else:
                print(f'Requests remaining: {remaining}. Rate limit resets at: {reset_time}')
        elif status_code == 404:
            print('Error 404: Not Found. Check the URL and parameters.')
        else:
            print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'An error occurred: {err}')
    return None

In [22]:
# Cell 15: Test safe_request function
# safe_request returns None on failure, so handle the failure case first.
url = 'https://api.github.com/repos/octocat/Hello-World/commits'
response = safe_request(url)
if not response:
    print('Failed to retrieve data.')
else:
    commits = response.json()
    print(json.dumps(commits, indent=2))

[
  {
    "sha": "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d",
    "node_id": "MDY6Q29tbWl0MTI5NjI2OTo3ZmQxYTYwYjAxZjkxYjMxNGY1OTk1NWE0ZTRkNGU4MGQ4ZWRmMTFk",
    "commit": {
      "author": {
        "name": "The Octocat",
        "email": "octocat@nowhere.com",
        "date": "2012-03-06T23:06:50Z"
      },
      "committer": {
        "name": "The Octocat",
        "email": "octocat@nowhere.com",
        "date": "2012-03-06T23:06:50Z"
      },
      "message": "Merge pull request #6 from Spaceghost/patch-1\n\nNew line at end of file.",
      "tree": {
        "sha": "b4eecafa9be2f2006ce1b709d6857b07069b4608",
        "url": "https://api.github.com/repos/octocat/Hello-World/git/trees/b4eecafa9be2f2006ce1b709d6857b07069b4608"
      },
      "url": "https://api.github.com/repos/octocat/Hello-World/git/commits/7fd1a60b01f91b314f59955a4e4d4e80d8edf11d",
      "comment_count": 86,
      "verification": {
        "verified": false,
        "reason": "unsigned",
        "signature": null,
   

In [23]:
# Cell 16: Complete data extraction example
def extract_data(query, max_repos=5):
    """Search repositories and print commit/content counts for the first few.

    Args:
        query: Search string passed to the repository search endpoint.
        max_repos: How many of the found repositories to inspect.

    Returns:
        None. All results are reported with ``print``.
    """
    remaining, reset_time = check_rate_limit()
    if remaining == 0:
        # This function returns immediately and does not sleep, so the message
        # tells the reader when to retry instead of claiming to wait.
        print(f'Rate limit exceeded. Try again after {reset_time}.')
        return

    print(f"Searching repositories for query: '{query}'")
    url = 'https://api.github.com/search/repositories'
    params = {'q': query, 'per_page': 100}
    # fetch_all_pages downloads every available page, so the count printed
    # below is the number of results actually retrieved.
    repositories = fetch_all_pages(url, params)
    print(f"Total repositories found: {len(repositories)}")

    for repo in repositories[:max_repos]:
        owner = repo['owner']['login']
        repo_name = repo['name']
        print(f"\nRepository: {owner}/{repo_name}")

        print("Fetching commits...")
        commits_response = safe_request(f'https://api.github.com/repos/{owner}/{repo_name}/commits')
        if commits_response:
            commits = commits_response.json()
            print(f"Number of commits fetched: {len(commits)}")
        else:
            print("Failed to fetch commits.")

        print("Fetching repository contents...")
        contents_response = safe_request(f'https://api.github.com/repos/{owner}/{repo_name}/contents')
        if contents_response:
            contents = contents_response.json()
            print(f"Number of items in repository root: {len(contents)}")
        else:
            print("Failed to fetch contents.")

# Run the end-to-end example for a sample query.
extract_data(query='machine learning')


Searching repositories for query: 'machine learning'
Total repositories found: 1000

Repository: josephmisiti/awesome-machine-learning
Fetching commits...
Number of commits fetched: 30
Fetching repository contents...
Number of items in repository root: 9

Repository: wepe/MachineLearning
Fetching commits...
Number of commits fetched: 30
Fetching repository contents...
Number of items in repository root: 12

Repository: Jack-Cherish/Machine-Learning
Fetching commits...
Number of commits fetched: 30
Fetching repository contents...
Number of items in repository root: 12

Repository: lawlite19/MachineLearning_Python
Fetching commits...
Number of commits fetched: 30
Fetching repository contents...
Number of items in repository root: 13

Repository: udacity/machine-learning
Fetching commits...
Number of commits fetched: 30
Fetching repository contents...
Number of items in repository root: 5
