# GitHub API Homework

This notebook demonstrates interaction with the GitHub API to:
1. Search repositories
2. Retrieve commits
3. Access repository contents

In [12]:
# Install required packages: pandas (used below to tabulate API results) and
# requests (used by the API client for HTTP calls).
# NOTE(review): `%pip install` targets the running kernel's environment, and
# pinning versions would make a re-run reproducible — consider both.
!pip install pandas requests



In [13]:
import getpass
import json
import logging
import os
import sys
import time

import pandas as pd
import requests

# Set up a notebook-level logger that writes timestamped messages to the
# default stream of logging.StreamHandler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more handler each time and print every
# message once per execution. Attach a handler only when none is present.
if not logger.handlers:
    logger.addHandler(stream_handler)

## GitHub API Client Class Implementation

Here is the implementation of a GitHub API client class that handles requests, authentication, rate limiting, and pagination.

In [14]:
class GitHubAPIClient:
    """Small GitHub REST API client built on one shared ``requests`` session.

    The client sends authenticated GET requests, follows the ``Link`` header
    for pagination, waits out rate limits and retries the request afterwards,
    and stops the program with ``sys.exit(1)`` when a request finally fails.
    """

    # how many times a single request is retried after waiting out a rate limit
    MAX_RATE_LIMIT_RETRIES = 3
    # extra seconds added to a reset-based wait to absorb clock skew
    RATE_LIMIT_BUFFER_SECONDS = 1

    def __init__(self, auth_token, timeout=30):
        """Create the session and install the default request headers.

        Args:
            auth_token: token sent as ``Authorization: Bearer <token>``.
            timeout: seconds passed to every request so a stalled connection
                cannot hang the notebook indefinitely.
        """
        self.base_url = 'https://api.github.com'
        self.timeout = timeout
        self.headers = {
            'Accept': 'application/vnd.github.v3+json',
            'Authorization': f'Bearer {auth_token}',
            'X-GitHub-Api-Version': '2022-11-28'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    @staticmethod
    def handle_rate_limits(response):
        """Sleep until the rate limit lifts when ``response`` reports one.

        A response counts as rate limited when its status is 403 or 429 and it
        either carries a numeric ``Retry-After`` header or reports
        ``X-RateLimit-Remaining: 0``. A 403 without those headers (for example
        a permission error) is left to the normal error handling.

        Args:
            response: the response object to inspect.

        Returns:
            True when the call waited and the request should be retried,
            False otherwise.
        """
        if response.status_code not in (403, 429):
            return False

        sleep_time = None
        retry_after = response.headers.get('Retry-After')
        if retry_after is not None:
            try:
                sleep_time = max(float(retry_after), 0.0)
            except (TypeError, ValueError):
                sleep_time = None  # not a number of seconds; fall back to the reset time

        if sleep_time is None:
            try:
                # a missing header must not be mistaken for "0 requests left"
                remaining = int(response.headers.get('X-RateLimit-Remaining', -1))
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
            except (TypeError, ValueError):
                return False
            if remaining != 0:
                return False
            sleep_time = (max(reset_time - time.time(), 0)
                          + GitHubAPIClient.RATE_LIMIT_BUFFER_SECONDS)

        logger.warning(f'Rate limit exceeded. Waiting for {sleep_time:.2f} seconds')
        time.sleep(sleep_time)
        return True

    def paginate_request(self, url, params=None, max_results=None, items_key=None):
        """Fetch ``url`` and every following page advertised in the ``Link`` header.

        Args:
            url: first URL to request.
            params: query parameters for the first request only; follow-up
                URLs taken from the ``Link`` header are requested without them.
            max_results: upper bound on the number of results, or None for all.
            items_key: for endpoints that wrap each page in an object, the key
                holding the page's result list (``'items'`` for search). When
                given, ``max_results`` counts the entries of that list instead
                of the page objects, and the last page's list is trimmed.

        Returns:
            A list. List responses are flattened into it; object responses are
            appended as-is, one per page.

        Raises:
            SystemExit: when a request returns a status other than 200 (after
                rate-limit retries are used up).
        """
        all_results = []
        collected = 0  # results counted against max_results so far
        retries = 0
        while url and (max_results is None or collected < max_results):
            logger.debug('Retrieving from ' + str(url))
            response = self.session.get(url, params=params, timeout=self.timeout)

            # after waiting out a rate limit, repeat the same request instead
            # of treating the 403/429 as a fatal error
            if retries < self.MAX_RATE_LIMIT_RETRIES and self.handle_rate_limits(response):
                retries += 1
                continue

            # error handling for unsuccessful responses
            if response.status_code != 200:
                error_message = self.handle_error(response)
                logger.error(f'Error sending request to {url} with params {params}: {error_message}')
                sys.exit(1)
            retries = 0

            data = response.json()
            remaining = None if max_results is None else max_results - collected
            if isinstance(data, list):
                if remaining is not None:
                    data = data[:remaining]
                all_results.extend(data)
                collected += len(data)
            elif (items_key is not None and isinstance(data, dict)
                  and isinstance(data.get(items_key), list)):
                if remaining is not None and len(data[items_key]) > remaining:
                    # copy so the trimmed list does not alter the parsed payload
                    data = dict(data)
                    data[items_key] = data[items_key][:remaining]
                all_results.append(data)
                collected += len(data[items_key])
            else:
                all_results.append(data)
                collected += 1

            # get next page url in links header; it already carries the query string
            url = response.links.get('next', {}).get('url')
            params = None
        return all_results

    def search_repositories(self, query, sort='stars', order='desc', per_page=100, max_results=None):
        """Search for repositories matching ``query``.

        Args:
            query: search expression sent as the ``q`` parameter.
            sort: field to sort by.
            order: ``'desc'`` or ``'asc'``.
            per_page: page size; lowered to ``max_results`` when that is smaller.
            max_results: maximum number of repositories to return in total.

        Returns:
            A list of search page objects; the repositories are under each
            page's ``'items'`` key.
        """
        if max_results is not None:
            per_page = max(1, min(per_page, max_results))
        url = f'{self.base_url}/search/repositories'
        params = {
            'q': query,
            'sort': sort,
            'order': order,
            'per_page': per_page
        }
        return self.paginate_request(url, params, max_results=max_results, items_key='items')

    def get_repository_commits(self, owner, repo, per_page=100, max_results=None):
        """Get commits from a repository.

        Args:
            owner: account that owns the repository.
            repo: repository name.
            per_page: page size; lowered to ``max_results`` when that is smaller
                so no more commits are downloaded than will be returned.
            max_results: maximum number of commits to return.

        Returns:
            A list of commit objects.
        """
        if max_results is not None:
            per_page = max(1, min(per_page, max_results))
        url = f'{self.base_url}/repos/{owner}/{repo}/commits'
        params = {
            'per_page': per_page
        }
        return self.paginate_request(url, params, max_results=max_results)

    def get_repository_contents(self, owner, repo, path=''):
        """Get contents of a repository at a specific path.

        Args:
            owner: account that owns the repository.
            repo: repository name.
            path: path inside the repository; empty for the root. A leading
                slash is dropped so the URL does not contain ``//``.

        Returns:
            A list of content entries (a single-object response is wrapped in
            a one-element list).
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/contents/{path.lstrip('/')}"
        return self.paginate_request(url)

    @staticmethod
    def handle_error(response):
        """Build a readable message for an unsuccessful response.

        The message combines a summary for the status code with the
        ``message`` field of the response body when there is one, so that,
        for example, a 403 caused by missing permissions can be told apart
        from one caused by rate limiting.

        Args:
            response: the failed response.

        Returns:
            The error message string.
        """
        error_handlers = {
            401: 'Authentication error. Please check your token.',
            403: 'Rate limit exceeded or permission denied.',
            404: 'Resource not found. Verify the URL.',
            422: 'Validation failed. Parameters are incorrect.'
        }
        try:
            payload = response.json()
        except ValueError:  # cases where the response body isn't JSON
            error_details = 'Response body is not valid JSON.'
        else:
            if isinstance(payload, dict):
                error_details = payload.get('message', 'No additional details provided.')
            else:
                error_details = 'No additional details provided.'

        summary = error_handlers.get(response.status_code,
                                     f'Unknown error: {response.status_code}.')
        return f'{summary} Error details: {error_details}'

## Helper Functions for Data Display

These functions format and display the data we retrieve from the API using pandas.

In [25]:
def display_repos(repos):
    """Print a preview table of the repositories held in search result pages.

    ``repos`` is an iterable of page objects; each page must provide its
    repositories under the ``'items'`` key. A page without that key is logged
    as an error and the program exits with status 1. Up to ten rows are
    printed, with the columns Repository, Stars and Description.
    """
    rows = []
    for page in repos:
        # a page without 'items' means the search gave us nothing usable
        if 'items' not in page:
            logger.error('No repositories found')
            sys.exit(1)
        # keep only the fields shown in the preview, for every repository on the page
        for entry in page['items']:
            rows.append({
                'Repository': entry['full_name'],
                'Stars': entry['stargazers_count'],
                'Description': entry['description'],
            })

    table = pd.DataFrame(rows)
    display_options = ('display.max_rows', None,
                       'display.max_columns', None,
                       'display.max_colwidth', 30)
    with pd.option_context(*display_options):
        preview = str(table.head(10))
        print('Sample repositories found:\n' + preview)


def display_commits(commits):
    """Print a sample of commits and return all of them as a DataFrame.

    Args:
        commits: list of commit objects, each with a ``'sha'`` and a
            ``'commit'`` object holding ``'author'``, ``'message'`` and ``'url'``.

    Returns:
        A DataFrame with the columns SHA, Author, Message, Date and URL (one
        row per commit), or None when ``commits`` is empty.
    """
    if not commits:
        logger.error('No commits found')
        return None
    # get relevant fields
    commit_data = []
    for commit in commits:
        details = commit['commit']
        # the author object can be missing or null; fall back to empty values
        author = details.get('author') or {}
        commit_data.append({
            'SHA': commit['sha'],
            'Author': author.get('name'),
            'Message': details['message'][:50],  # show the first 50 characters of the message
            'Date': author.get('date'),
            'URL': details['url']
        })

    df = pd.DataFrame(commit_data)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 30):
        print('Sample commits found:\n' + str(df.head()))
    # hand the frame back so callers can keep or display it
    return df


def display_contents(contents):
    """Print a sample of repository contents and return all of them as a DataFrame.

    Args:
        contents: list of content entries, each with ``'name'``, ``'type'``,
            ``'size'`` and ``'path'``.

    Returns:
        A DataFrame with the columns Name, Type, Size and Path (one row per
        entry), or None when ``contents`` is empty.
    """
    if not contents:
        logger.error('No repository contents found')
        return None
    # get relevant fields
    contents_data = [
        {
            'Name': item['name'],
            'Type': item['type'],
            'Size': item['size'],
            'Path': item['path']
        }
        for item in contents
    ]

    df = pd.DataFrame(contents_data)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 30):
        print('Sample contents found:\n' + str(df.head()))
    # hand the frame back so callers can keep or display it
    return df

## Testing the API Client

Now let's test our implementation. First, set up your Personal access token (classic) at https://github.com/settings/tokens.

In [20]:
# GitHub Personal Access Token: read it from the GITHUB_TOKEN environment
# variable, or prompt for it without echoing, so the secret is never written
# into the notebook or its saved outputs.
auth_token = os.environ.get('GITHUB_TOKEN') or getpass.getpass('GitHub personal access token: ')

# initialize client
client = GitHubAPIClient(auth_token)

### 1. Search Repositories
Let's search for 10 popular repositories with more than a thousand stars

In [26]:
# query the search endpoint for repositories with more than 1000 stars,
# most-starred first
max_repos = 10
repos = client.search_repositories(query='stars:>1000', sort='stars', max_results=max_repos)

# print a preview table of what came back
display_repos(repos)


Sample repositories found:
                      Repository   Stars                    Description
0      freeCodeCamp/freeCodeCamp  406587  freeCodeCamp.org's open-so...
1  EbookFoundation/free-progr...  340919  :books: Freely available p...
2           sindresorhus/awesome  336135  😎 Awesome lists about all ...
3        public-apis/public-apis  319326  A collective list of free ...
4  codecrafters-io/build-your...  316001  Master programming by recr...
5  jwasham/coding-interview-u...  307541  A complete computer scienc...
6  kamranahmedse/developer-ro...  300256  Interactive roadmaps, guid...
7  donnemartin/system-design-...  278355  Learn how to design large-...
8                 996icu/996.ICU  269994  Repo for counting stars an...
9                 facebook/react  229948  The library for web and na...


### 2. Get Repository Commits
Let's get 10 commits from the first repository we found

In [27]:
# take the first repository of the first search page as the sample
sample_repo = repos[0]['items'][0]
owner = sample_repo['owner']['login']
repo_name = sample_repo['name']

# fetch commits of that repository
max_commits = 10
commits = client.get_repository_commits(owner=owner, repo=repo_name, max_results=max_commits)

# print a preview and keep whatever the helper returns
commits_df = display_commits(commits)
commits_df

Sample commits found:
                             SHA                 Author  \
0  027473267a78ec12c4b30a3c8d...                  Zaira   
1  a334b21252eb1d43b15bce5946...                    Tom   
2  25de45da462403ea6a7dc28daf...  Oliver Eyton-Williams   
3  d80e3b0c8f98f74a5689d17b82...        Jessica Wilkins   
4  195e3e662b683f0051a37424e5...             Supravisor   

                         Message                  Date  \
0  feat(curriculum): adding c...  2024-12-05T16:06:40Z   
1  fix(curriculum): b1 englis...  2024-12-05T12:53:37Z   
2  fix(client): remove video ...  2024-12-05T08:55:00Z   
3  fix(curriculum): replace 1...  2024-12-04T19:06:20Z   
4  fix(curriculum): Typical W...  2024-12-04T18:39:02Z   

                             URL  
0  https://api.github.com/rep...  
1  https://api.github.com/rep...  
2  https://api.github.com/rep...  
3  https://api.github.com/rep...  
4  https://api.github.com/rep...  


### 3. Get Repository Contents
Let's retrieve the contents of the repository

In [28]:
# list the contents at the root of the sample repository
contents = client.get_repository_contents(owner=owner, repo=repo_name)

# print a preview and keep whatever the helper returns
contents_df = display_contents(contents)
contents_df

Sample contents found:
             Name  Type  Size            Path
0   .dockerignore  file   177   .dockerignore
1   .editorconfig  file   241   .editorconfig
2   .eslintignore  file   269   .eslintignore
3  .eslintrc.json  file  3434  .eslintrc.json
4  .gitattributes  file   232  .gitattributes


### Save Results
Let's save all our results to a JSON file

In [29]:
# bundle everything fetched above into one mapping
results = dict(
    repositories=repos,
    sample_commits=commits,
    contents=contents,
)

# write the bundle to disk as indented JSON
with open('github_api_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print('Results saved to github_api_results.json')

Results saved to github_api_results.json
