# GitHub Repository Scraping Script

In [None]:
import csv
import datetime
import getpass
import os
import time
# Imported after `import datetime` so that the name `datetime` refers to the class.
from datetime import datetime, timedelta

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [3]:
# GitHub API base URL
base_url = 'https://api.github.com'

# GitHub username (informational; authentication below uses only the access token)
username = 'Yours'

# GitHub access token: taken from the GITHUB_TOKEN environment variable, or
# prompted for without echo, so the secret is never stored in the notebook.
access_token = os.environ.get('GITHUB_TOKEN') or getpass.getpass('Enter your GitHub access token: ')

# User input for the start date
start_date_str = input("Enter the start date (YYYY-MM-DD): ")
start_date = datetime.strptime(start_date_str.strip(), "%Y-%m-%d").date()

# User input for the end date
end_date_str = input("Enter the end date (YYYY-MM-DD): ")
end_date = datetime.strptime(end_date_str.strip(), "%Y-%m-%d").date()

# A reversed range would otherwise produce an empty CSV and still report success
if end_date < start_date:
    raise ValueError(f'End date {end_date} is earlier than start date {start_date}.')

# User input for number of repositories per day
# NOTE: GitHub's search API serves at most 1,000 results per query, so larger
# values are effectively capped at 1,000 per day.
repositories_per_day = int(input("Enter the number of repositories per day: "))
if repositories_per_day <= 0:
    raise ValueError('The number of repositories per day must be a positive integer.')

# Output CSV file name
output_file = 'repositories.csv'

# Number of days to scrape; both the start and the end date are included
days_to_scrape = (end_date - start_date).days + 1

# Initialize the repositories list (one dict per scraped repository)
repositories = []

# Retry mechanism for API requests: up to 3 retries with exponential backoff
# on rate-limit (429) and transient server errors (5xx)
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

# Authenticate with access token
headers = {'Authorization': f'token {access_token}'}

# Seconds to wait for a single response before giving up on the request
REQUEST_TIMEOUT = 30
# Maximum number of attempts for one page while GitHub reports a rate limit
MAX_RATE_LIMIT_WAITS = 5


def fetch_search_page(session, url, params, request_headers):
    """Fetch one page of search results, waiting out GitHub rate limits.

    Args:
        session: The `requests.Session` used to send the request.
        url: Search endpoint URL.
        params: Query-string parameters for this page.
        request_headers: HTTP headers (including authentication) to send.

    Returns:
        The `requests.Response` of the first non-rate-limited, successful call.

    Raises:
        requests.HTTPError: If GitHub answers with an error status that is not
            a rate limit (e.g. bad credentials or an invalid query).
        RuntimeError: If the request is still rate limited after
            MAX_RATE_LIMIT_WAITS attempts.
    """
    for _ in range(MAX_RATE_LIMIT_WAITS):
        response = session.get(url, params=params, headers=request_headers, timeout=REQUEST_TIMEOUT)

        # A 403/429 counts as a rate limit only when the rate-limit headers say so;
        # any other error must surface instead of being silently skipped.
        rate_limited = response.status_code in (403, 429) and (
            'Retry-After' in response.headers
            or response.headers.get('X-RateLimit-Remaining') == '0'
        )
        if not rate_limited:
            response.raise_for_status()
            return response

        # Prefer the explicit Retry-After delay; otherwise wait for the reset time
        retry_after = response.headers.get('Retry-After', '')
        reset_at = response.headers.get('X-RateLimit-Reset', '')
        if retry_after.isdigit():
            wait_seconds = int(retry_after)
        elif reset_at.isdigit():
            wait_seconds = max(int(reset_at) - time.time(), 0)
        else:
            wait_seconds = 60  # no usable header: fall back to a fixed pause
        print(f'Rate limit reached; waiting {wait_seconds:.0f} seconds before retrying...')
        time.sleep(wait_seconds + 1)

    raise RuntimeError(f'Still rate limited after {MAX_RATE_LIMIT_WAITS} attempts: {url}')


search_url = f'{base_url}/search/repositories'

# Iterate over each day (start and end date inclusive)
current_date = start_date
while current_date <= end_date:
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Fetch repositories using pagination
    page = 1
    repositories_fetched = 0
    while repositories_fetched < repositories_per_day:
        # Repositories created on the current day, most-starred first
        params = {
            'q': f'created:{formatted_date}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': 100,
            'page': page,
        }

        response = fetch_search_page(http, search_url, params, headers)
        items = response.json().get('items', [])

        # Take only as many items as are still needed for this day
        remaining = repositories_per_day - repositories_fetched
        wanted_items = items[:remaining]
        for item in wanted_items:
            repositories.append({
                'name': item['name'],
                'url': item['html_url'],
                'description': item['description'],
                'stars': item['stargazers_count'],
                'created_at': item['created_at'],
                'language': item.get('language', ''),
                'forks': item['forks'],
                'watchers': item['watchers'],
                'open_issues': item['open_issues'],
                'owner': item['owner']['login']
            })
        repositories_fetched += len(wanted_items)

        # Stop when GitHub has no further page (or returned nothing for this one)
        if not items or 'next' not in response.links:
            break
        page += 1

    current_date += timedelta(days=1)

# Persist every collected repository as one CSV row, with a header line first
fieldnames = [
    'name',
    'url',
    'description',
    'stars',
    'created_at',
    'language',
    'forks',
    'watchers',
    'open_issues',
    'owner',
]
with open(output_file, mode='w', encoding='utf-8', newline='') as csv_handle:
    csv_writer = csv.DictWriter(csv_handle, fieldnames=fieldnames)
    csv_writer.writeheader()
    for repository_row in repositories:
        csv_writer.writerow(repository_row)

print('Repositories scraped and saved successfully!')

Enter the start date (YYYY-MM-DD): 2022-06-15
Enter the end date (YYYY-MM-DD): 2023-06-15
Enter the number of repositories per day: 400
Repositories scraped and saved successfully!


In [5]:
import pandas as pd

In [6]:
# Load the scraped repositories back from disk and preview the first rows
df = pd.read_csv('repositories.csv')
df.head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
0,notepad--,https://github.com/cxasm/notepad--,一个支持windows/linux/mac的文本编辑器，目标是做中国人自己的编辑器，来自中国。,2890,2022-06-15T02:50:15Z,C++,114,2890,112,cxasm
1,pocket-casts-android,https://github.com/Automattic/pocket-casts-and...,Pocket Casts Android 🎧,2243,2022-06-15T12:41:42Z,Kotlin,165,2243,255,Automattic
2,nvim-basic-ide,https://github.com/LunarVim/nvim-basic-ide,🪨 This is my attempt at a basic stable startin...,1653,2022-06-15T23:06:53Z,Lua,437,1653,1,LunarVim
3,sismo-hub,https://github.com/sismo-core/sismo-hub,,891,2022-06-15T16:44:53Z,TypeScript,182,891,6,sismo-core
4,Antenna,https://github.com/wuba/Antenna,Antenna是58同城安全团队打造的一款辅助安全从业人员验证网络中多种漏洞是否存在以及可利...,675,2022-06-15T06:42:25Z,JavaScript,72,675,3,wuba


In [7]:
# Report the dataset dimensions as (rows, columns)
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(146400, 10)
