In [2]:
import zipfile
import os

# Define the path to the zip file and the extraction directory.
# The archive is looked up relative to the current working directory, so its
# contents are extracted there as well rather than into a user-specific
# absolute path that only exists on one machine.
zip_filename = 'Content.zip'  # Adjust the filename if different
extract_path = os.getcwd()  # Adjust as needed

# Only try to open the archive when it is an actual file; a directory that
# happens to carry the same name is reported as "not found" instead of crashing.
if os.path.isfile(zip_filename):
    print(f'Unzipping {zip_filename}...')
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print('Unzipping completed.')
else:
    print(f'{zip_filename} not found. Please ensure it is in the current directory.')


Unzipping Content.zip...
Unzipping completed.


In [3]:
import os

# Print the current working directory.
# Relative paths used in this notebook (e.g. 'Content.zip') resolve against it.
current_directory = os.getcwd()
print(f'Current Working Directory: {current_directory}')


Current Working Directory: /Users/artemdiachuk/Data-Source-API-Analyst-Test-2


In [4]:
import sys
import os

# Base path to the extracted Content directory, resolved against the current
# working directory so the notebook is not tied to one user's home folder.
base_path = os.path.join(os.getcwd(), 'Content')  # Adjust as needed

# Define paths for subdirectories
utils_path = os.path.join(base_path, 'utils')
extraction_path = os.path.join(base_path, 'extraction')
processing_path = os.path.join(base_path, 'processing')

# Add subdirectories to sys.path for module imports. Entries that are already
# present are skipped so re-running this cell does not pile up duplicates.
for module_dir in (utils_path, extraction_path, processing_path):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

print('Python path updated with Content subdirectories.')


Python path updated with Content subdirectories.


In [5]:
import requests
import json
import time
import os
from getpass import getpass

from authentication import get_github_token, create_headers
from error_handling import safe_request
from rate_limiting import check_rate_limit, wait_if_rate_limited
from data_extraction import search_repositories, list_commits, get_repository_contents, save_response
from pagination import fetch_all_pages
from data_processing import (
    load_raw_response,
    save_cleaned_data,
    clean_repository_data,
    clean_commit_data,
    clean_content_data
)

print('All modules imported successfully.')


All modules imported successfully.


In [6]:
# Make sure each of these folders exists under base_path, reporting whether it
# had to be created or was already there.
directories = ['saved_responses', 'cleaned_data']

for directory in directories:
    dir_path = os.path.join(base_path, directory)
    if os.path.exists(dir_path):
        print(f'Directory already exists: {dir_path}')
        continue
    os.makedirs(dir_path)
    print(f'Created directory: {dir_path}')


Directory already exists: /Users/artemdiachuk/Data-Source-API-Analyst-Test-2/Content/saved_responses
Created directory: /Users/artemdiachuk/Data-Source-API-Analyst-Test-2/Content/cleaned_data


In [7]:
# Obtain the GitHub token through the project's authentication helper
# (imported from the `authentication` module in an earlier cell).
GITHUB_TOKEN = get_github_token()

# `headers` is a notebook-level name: every API helper defined below passes it
# along with its request.
headers = create_headers(GITHUB_TOKEN)

print('Authentication headers created.')

Authentication headers created.


In [8]:
import pandas as pd

def save_to_csv(data, filename, output_dir='cleaned_data'):
    """
    Save cleaned data to a CSV file.

    Parameters
    ----------
    data
        Anything ``pd.DataFrame`` accepts; this notebook passes lists of dicts.
    filename : str
        Name of the file without the ``.csv`` extension.
    output_dir : str, optional
        Directory the file is written to. Defaults to ``'cleaned_data'``
        (relative to the current working directory), which is the location
        that was previously hard-coded.

    Returns
    -------
    None
        The CSV is written to disk and the target path is printed.
    """
    csv_path = os.path.join(output_dir, f'{filename}.csv')

    # Create the directory of the final path (not just output_dir), because
    # `filename` may itself carry a sub-directory. Skip the call when there is
    # no directory component: os.makedirs('') raises.
    target_dir = os.path.dirname(csv_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    pd.DataFrame(data).to_csv(csv_path, index=False)
    print(f'Data saved to {csv_path}')


In [9]:
def search_repositories_clean(query, per_page=30):
    """
    Run a repository search and persist the result.

    Calls ``search_repositories`` with the notebook-level ``headers``, prints
    how many entries came back, then stores the result through both
    ``save_cleaned_data`` and ``save_to_csv`` under the name
    ``cleaned_search_repositories``.

    Returns whatever ``search_repositories`` returned, unchanged.
    """
    print(f"Searching repositories for query: '{query}'")
    repositories = search_repositories(query, headers, per_page=per_page)
    found = len(repositories)
    print(f"Total repositories found: {found}")

    # Same base name for both output formats
    output_name = 'cleaned_search_repositories'
    save_cleaned_data(repositories, output_name)
    save_to_csv(repositories, output_name)

    return repositories


In [10]:
# Free-text search term handed to search_repositories_clean
query = 'machine learning'

# per_page is left at the helper's default of 30
cleaned_repos = search_repositories_clean(query)

# Preview only the first two entries to keep the cell output short
print(json.dumps(cleaned_repos[:2], indent=2))


Searching repositories for query: 'machine learning'
Total repositories found: 30
Data saved to cleaned_data/cleaned_search_repositories.csv
[
  {
    "id": 21872392,
    "name": "awesome-machine-learning",
    "full_name": "josephmisiti/awesome-machine-learning",
    "html_url": "https://github.com/josephmisiti/awesome-machine-learning",
    "description": "A curated list of awesome Machine Learning frameworks, libraries and software.",
    "owner": "josephmisiti",
    "stargazers_count": 66127,
    "forks_count": 14656,
    "language": "Python"
  },
  {
    "id": 27595858,
    "name": "MachineLearning",
    "full_name": "wepe/MachineLearning",
    "html_url": "https://github.com/wepe/MachineLearning",
    "description": "Basic Machine Learning and Deep Learning",
    "owner": "wepe",
    "stargazers_count": 5253,
    "forks_count": 3171,
    "language": "Python"
  }
]


In [11]:
def list_commits_clean(owner, repo, per_page=30):
    """
    Fetch the commits of ``owner/repo`` and persist the result.

    Calls ``list_commits`` with the notebook-level ``headers``, prints how many
    commits came back, then stores the result through both
    ``save_cleaned_data`` and ``save_to_csv`` under the name
    ``cleaned_commits_<owner>_<repo>``.

    Returns whatever ``list_commits`` returned, unchanged.
    """
    print(f"Listing commits for repository: {owner}/{repo}")
    commits = list_commits(owner, repo, headers, per_page=per_page)
    fetched = len(commits)
    print(f"Number of commits fetched: {fetched}")

    # Same base name for both output formats
    output_name = f'cleaned_commits_{owner}_{repo}'
    save_cleaned_data(commits, output_name)
    save_to_csv(commits, output_name)

    return commits


In [12]:
# Repository whose commit history is listed
owner = 'Artem9908'
repo = 'Data-Source-API-Analyst-Test'

# per_page is left at the helper's default of 30
cleaned_commits = list_commits_clean(owner, repo)

# Preview only the first two commits to keep the cell output short
print(json.dumps(cleaned_commits[:2], indent=2))


Listing commits for repository: Artem9908/Data-Source-API-Analyst-Test
Number of commits fetched: 5
Data saved to cleaned_data/cleaned_commits_Artem9908_Data-Source-API-Analyst-Test.csv
[
  {
    "sha": "07a1692fac98942890beb3658b49595f49c3265c",
    "author": "Artem9908",
    "date": "2024-11-27T12:42:13Z",
    "message": "Update README.md",
    "url": "https://github.com/Artem9908/Data-Source-API-Analyst-Test/commit/07a1692fac98942890beb3658b49595f49c3265c"
  },
  {
    "sha": "a71ccca9514c20e8628a95e6a88ae264f0835b21",
    "author": "Artem Diachuk",
    "date": "2024-11-26T17:42:19Z",
    "message": "Add content to Content folder files and update README",
    "url": "https://github.com/Artem9908/Data-Source-API-Analyst-Test/commit/a71ccca9514c20e8628a95e6a88ae264f0835b21"
  }
]


In [13]:
def get_repository_contents_clean(owner, repo, path=''):
    """
    Fetch the contents of ``owner/repo`` at ``path`` and persist the result.

    An empty ``path`` is reported as the repository root. The result of
    ``get_repository_contents`` (called with the notebook-level ``headers``)
    is stored through both ``save_cleaned_data`` and ``save_to_csv`` under
    ``cleaned_contents_<owner>_<repo>_<path>``, with every ``/`` in ``path``
    replaced by ``_``.

    Returns whatever ``get_repository_contents`` returned, unchanged.
    """
    if not path:
        print(f"Fetching root contents for {owner}/{repo}")
    else:
        print(f"Fetching contents for {owner}/{repo} at path: '{path}'")

    items = get_repository_contents(owner, repo, headers, path)
    fetched = len(items)
    print(f"Number of items fetched: {fetched}")

    # Flatten the path so it is usable inside a single file name
    output_name = f'cleaned_contents_{owner}_{repo}_{path.replace("/", "_")}'
    save_cleaned_data(items, output_name)
    save_to_csv(items, output_name)

    return items


In [14]:
# Repository whose contents are listed
owner = 'Artem9908'
repo = 'Data-Source-API-Analyst-Test'

# An empty path makes get_repository_contents_clean report the repository root
path = ''

contents = get_repository_contents_clean(owner, repo, path)

# Preview only the first two entries to keep the cell output short
print(json.dumps(contents[:2], indent=2))


Fetching root contents for Artem9908/Data-Source-API-Analyst-Test
Number of items fetched: 3
Data saved to cleaned_data/cleaned_contents_Artem9908_Data-Source-API-Analyst-Test_.csv
[
  {
    "name": "Content",
    "path": "Content",
    "type": "dir",
    "download_url": null
  },
  {
    "name": "Postman_Collection",
    "path": "Postman_Collection",
    "type": "dir",
    "download_url": null
  }
]


In [15]:
def check_rate_limit_status():
    """
    Print and return the current rate-limit status.

    Calls ``check_rate_limit`` with the notebook-level ``headers`` and unpacks
    its result into a remaining-request count and a reset time. The reset time
    is passed to ``time.localtime`` and formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns a ``(remaining, reset_timestamp)`` tuple, where ``reset_timestamp``
    is the formatted string rather than the raw value from ``check_rate_limit``.
    """
    remaining, reset_epoch = check_rate_limit(headers)
    reset_struct = time.localtime(reset_epoch)
    reset_timestamp = time.strftime('%Y-%m-%d %H:%M:%S', reset_struct)
    print(f'Requests remaining: {remaining}')
    print(f'Rate limit resets at: {reset_timestamp}')
    return (remaining, reset_timestamp)

In [16]:
# Note: `reset_time` here receives the formatted timestamp string returned by
# check_rate_limit_status, not a numeric epoch value.
remaining, reset_time = check_rate_limit_status()

Requests remaining: 4998
Rate limit resets at: 2024-11-28 14:57:36


In [17]:
# Fetch the commit list of a public sample repository through safe_request
# and preview the first two entries of the JSON payload.
url = 'https://api.github.com/repos/octocat/Hello-World/commits'
response = safe_request(url, headers)

if not response:
    print('Failed to retrieve data.')
else:
    commits = response.json()
    print(json.dumps(commits[:2], indent=2))


[
  {
    "sha": "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d",
    "node_id": "MDY6Q29tbWl0MTI5NjI2OTo3ZmQxYTYwYjAxZjkxYjMxNGY1OTk1NWE0ZTRkNGU4MGQ4ZWRmMTFk",
    "commit": {
      "author": {
        "name": "The Octocat",
        "email": "octocat@nowhere.com",
        "date": "2012-03-06T23:06:50Z"
      },
      "committer": {
        "name": "The Octocat",
        "email": "octocat@nowhere.com",
        "date": "2012-03-06T23:06:50Z"
      },
      "message": "Merge pull request #6 from Spaceghost/patch-1\n\nNew line at end of file.",
      "tree": {
        "sha": "b4eecafa9be2f2006ce1b709d6857b07069b4608",
        "url": "https://api.github.com/repos/octocat/Hello-World/git/trees/b4eecafa9be2f2006ce1b709d6857b07069b4608"
      },
      "url": "https://api.github.com/repos/octocat/Hello-World/git/commits/7fd1a60b01f91b314f59955a4e4d4e80d8edf11d",
      "comment_count": 86,
      "verification": {
        "verified": false,
        "reason": "unsigned",
        "signature": null,
   

In [31]:
def extract_data(query, max_repos=5):
    """
    Complete data extraction process:
    1. Search repositories based on query.
    2. For each of the first ``max_repos`` repositories, fetch commits and
       root contents.

    Progress messages and result counts are printed by the helpers
    (``search_repositories_clean``, ``list_commits_clean``,
    ``get_repository_contents_clean``), so this function only prints the
    per-repository heading instead of repeating each of those lines.

    Parameters
    ----------
    query : str
        Search term passed to ``search_repositories_clean``.
    max_repos : int, optional
        How many of the search results are processed (slice upper bound).

    Returns
    -------
    None
        Nothing is returned; the helpers called here save their own results.
        The function returns early when no requests remain in the current
        rate-limit window.
    """
    remaining, reset_time = check_rate_limit(headers)
    if remaining == 0:
        reset_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(reset_time))
        print(f'Rate limit exceeded. Please wait until {reset_at}.')
        return

    # The helper already prints the query and the number of results.
    cleaned_repositories = search_repositories_clean(query, per_page=100)

    for repo in cleaned_repositories[:max_repos]:
        owner = repo['owner']
        repo_name = repo['name']
        print(f"\nRepository: {owner}/{repo_name}")

        # Each helper prints what it fetches and how many items came back.
        list_commits_clean(owner, repo_name)
        get_repository_contents_clean(owner, repo_name)


In [32]:
# Run the end-to-end extraction; only the first five search results are
# processed for commits and contents.
extract_data('machine learning', max_repos=5)

In [33]:
# Re-check the rate limit after the extraction run to see how many requests
# are left. `reset_time` receives the formatted timestamp string.
remaining, reset_time = check_rate_limit_status()

print("Data extraction process completed.")