# Github Repositories Scraping 

In this project,
we use the standard GitHub API to retrieve information about GitHub repositories, in order to analyze development trends and identify the most used languages over the last three years.

## Import The Necessary Libraries 

In [None]:
import pandas as pd 
import csv
import logging
import requests
import time
import signal
import sys
import os.path
import concurrent.futures
import threading
from datetime import datetime, timedelta
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from requests.exceptions import RetryError
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor


## Function To Fetch Github Repositories Using Github API

#### Divide date range function is used to divide the given date range into smaller intervals :


In [None]:
def divide_date_range(start_date, end_date, interval):
    """Split ``[start_date, end_date]`` into consecutive, non-overlapping windows.

    Every window is a ``(window_start, window_end)`` tuple whose bounds are
    both meant to be inclusive: a window ends at most ``interval`` days after
    it starts, and the following window begins one day after the previous one
    ended.

    Args:
        start_date: First ``datetime`` of the range.
        end_date: Last ``datetime`` of the range (included in the last window).
        interval: Maximum number of days between the two bounds of a window.

    Returns:
        A list of ``(window_start, window_end)`` tuples; empty when
        ``start_date`` is after ``end_date``.

    Raises:
        ValueError: If ``interval`` is negative (the cursor would never
            advance past ``end_date``).
    """
    if interval < 0:
        raise ValueError("interval must be a non-negative number of days")

    intervals = []
    current_date = start_date
    window_length = timedelta(days=interval)
    one_day = timedelta(days=1)

    # "<=" (not "<") so that a window starting exactly on end_date is still
    # emitted; otherwise the last day of the range would be silently skipped.
    while current_date <= end_date:
        next_date = min(current_date + window_length, end_date)
        intervals.append((current_date, next_date))
        current_date = next_date + one_day

    return intervals

#### Fetch repositories function is used to retrieve the repositories created within a given date range from the GitHub search API :

In [None]:
def fetch_repositories(start_date, end_date, token, repositories_limit):
    """Fetch repositories created between two dates from the GitHub search API.

    Pages through ``/search/repositories`` (sorted by stars, descending) until
    ``repositories_limit`` items have been collected, the API returns an empty
    page, or a page keeps failing.

    Args:
        start_date: ``datetime`` whose date is the lower bound of ``created:``.
        end_date: ``datetime`` whose date is the upper bound of ``created:``.
        token: GitHub token sent as a ``Bearer`` authorization header.
        repositories_limit: Stop once at least this many items were collected.

    Returns:
        The list of raw repository dictionaries (the ``items`` of each page).
        Whole pages are appended, so the list may be slightly longer than
        ``repositories_limit``.
    """
    logging.info("Fetch Repositories Called ")
    base_url = "https://api.github.com/search/repositories"
    headers = {"Authorization": f"Bearer {token}"}
    per_page = 100
    max_attempts = 5  # consecutive failures tolerated for a single page
    page = 1
    failed_attempts = 0
    repositories = []

    # Transport-level retries (with backoff) for throttling and server errors.
    retries = Retry(total=5, backoff_factor=0.2, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount(base_url, HTTPAdapter(max_retries=retries))

    query = f"created:{start_date.date()}..{end_date.date()}"

    while len(repositories) < repositories_limit:
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page,
        }

        response = session.get(base_url, headers=headers, params=params)

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            failed_attempts += 1
            logging.error(f"Failed to retrieve repositories from page {page}: {err}")

            # Client errors other than rate limiting (bad token, invalid
            # query, page out of range) will not succeed on a retry.
            status_code = response.status_code
            permanent = 400 <= status_code < 500 and status_code not in (403, 429)
            if permanent or failed_attempts >= max_attempts:
                logging.error(f"Giving up on page {page} after {failed_attempts} failed attempt(s)")
                break

            # Real exponential backoff: a fresh Retry object reports a
            # backoff time of 0, which would turn this into a busy loop.
            time.sleep(2 ** failed_attempts)
        else:
            failed_attempts = 0
            logging.info(f"Raised Status  = {response.status_code} ")

            data = response.json()
            items = data.get("items", [])
            repositories.extend(items)

            total_count = min(data.get("total_count", 0), repositories_limit)
            logging.info(f"Processed page {page}/{total_count // per_page + 1} | Retrieved repositories: {len(repositories)}")

            if not items:
                # Fewer matches than the limit: without this check the loop
                # would request empty pages forever.
                logging.info("No more repositories available for this date range")
                break

            if len(repositories) >= repositories_limit or end_date <= start_date:
                logging.info(f"We Gonna Stop Now , Byy !!!!!")
                break

            page += 1

        # Wait for the quota to reset, but only when the API actually reported
        # the rate-limit headers (a missing header is not "zero remaining").
        remaining_header = response.headers.get("X-RateLimit-Remaining")
        reset_header = response.headers.get("X-RateLimit-Reset")

        if remaining_header is not None and reset_header is not None and int(remaining_header) == 0:
            sleep_time = max(int(reset_header) - time.time(), 0) + 1
            logging.info(f"Rate limit reached. Sleeping for {sleep_time} seconds...")
            time.sleep(sleep_time)

    return repositories


In [None]:
# Get repository information:
def get_repository_info(repository):
    """Flatten one repository dictionary into the columns kept for the CSV.

    Args:
        repository: Mapping holding the repository fields read below; the
            ``owner`` entry is itself a mapping with ``login`` and ``type``.

    Returns:
        A new dict whose keys are the CSV column names.

    Raises:
        KeyError: If one of the looked-up fields is missing.
    """
    # (output column, path of keys to follow inside ``repository``)
    field_paths = (
        ("id", ("id",)),
        ("url", ("url",)),
        ("name", ("name",)),
        ("owner", ("owner", "login")),
        ("ownertype", ("owner", "type")),
        ("created_at", ("created_at",)),
        ("updated_at", ("updated_at",)),
        ("pushed_at", ("pushed_at",)),
        ("language", ("language",)),
        ("has_issues", ("has_issues",)),
        ("stargazers_count", ("stargazers_count",)),
        ("open_issues_count", ("open_issues_count",)),
        ("description", ("description",)),
        ("archive_url", ("archive_url",)),
        ("forks", ("forks",)),
        ("topics", ("topics",)),
        ("license", ("license",)),
        ("allow_forking", ("allow_forking",)),
        # Column name keeps the historical "contributers" spelling.
        ("contributers_url", ("contributors_url",)),
    )

    flattened = {}
    for column, path in field_paths:
        value = repository
        for key in path:
            value = value[key]
        flattened[column] = value
    return flattened

In [None]:
# Save repositories to CSV file
def save_repositories_to_csv(repositories, filename):
    """Write the given repositories to ``filename`` as a UTF-8 CSV file.

    The file is overwritten. A header row is always written, followed by one
    row per repository as produced by ``get_repository_info``.

    Args:
        repositories: Iterable of raw repository dictionaries.
        filename: Path of the CSV file to create.
    """
    columns = [
        "id", "url", "name", "owner", "ownertype",
        "created_at", "updated_at", "pushed_at",
        "language", "has_issues", "stargazers_count", "open_issues_count",
        "description", "archive_url", "forks", "topics", "license",
        "allow_forking", "contributers_url",
    ]

    with open(filename, mode="w", newline="", encoding="utf-8") as csv_handle:
        csv_writer = csv.DictWriter(csv_handle, fieldnames=columns)
        csv_writer.writeheader()
        # Rows are flattened lazily, one repository at a time.
        csv_writer.writerows(get_repository_info(entry) for entry in repositories)

        logging.info(f"Repositories saved to {filename}")

#### Get repositories within date range is used to fetch repositories in a given date range :


In [None]:
def get_repositories_within_date_range(start_date, end_date, token, filename, interval):
    """Collect repositories for every sub-interval of a date range.

    The range is split with ``divide_date_range`` and each window is fetched
    with ``fetch_repositories``. On ``KeyboardInterrupt`` the repositories
    gathered so far are saved to ``filename`` before returning.

    Args:
        start_date: Beginning of the range.
        end_date: End of the range.
        token: GitHub token forwarded to ``fetch_repositories``.
        filename: CSV path used to save progress on interruption.
        interval: Window length in days, forwarded to ``divide_date_range``.

    Returns:
        The list of all repositories collected.
    """
    repositories_limit = 950 # define a repo limit , in order to avoid the github api rate limit :
    collected = []
    windows = divide_date_range(start_date, end_date, interval)
    window_total = len(windows)

    try:
        for position, (window_start, window_end) in enumerate(windows, start=1):
            logging.info(f"Processing interval {position}/{window_total}: {window_start.date()} to {window_end.date()} ---------------------------------------------------------Enjoy-----------------------------")

            collected.extend(fetch_repositories(window_start, window_end, token, repositories_limit))

            # Below the threshold: move straight on to the next window.
            if len(collected) < repositories_limit:
                continue

            logging.info(f"Reached the repositories limit {repositories_limit}. Sleeping for 20 seconds --------- !")
            time.sleep(20)

    except KeyboardInterrupt:
        logging.info("Keyboard interruption detected. Saving progress and exiting --------- !!")
        save_repositories_to_csv(collected, filename)  # Save progress before exiting

    return collected

#### Usage Function 

In [None]:
def main():
    """Fetch repositories created in a fixed date range and save them as CSV.

    The GitHub token is read from the ``GITHUB_TOKEN`` environment variable
    instead of being embedded in the source. The token that used to be
    hard-coded here must be considered leaked and should be revoked.

    Raises:
        SystemExit: If ``GITHUB_TOKEN`` is not set or is blank.
    """
    # Configure logging
    logging.basicConfig(
        filename='repository_fetch.log',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        filemode='w'  # Overwrite the log file each run
    )

    # Set the start and end dates for the range
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2022, 5, 1)

    # Credentials come from the environment; surrounding whitespace would
    # corrupt the Authorization header, so it is stripped.
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if not token:
        logging.error("The GITHUB_TOKEN environment variable is not set. Exiting...")
        raise SystemExit("Set the GITHUB_TOKEN environment variable to a valid GitHub token.")

    # Set the filename for saving the CSV file
    filename = "Raw_Repositories.csv"

    # Set the interval for fetching repositories (in days)
    interval = 30

    repositories = get_repositories_within_date_range(start_date, end_date, token, filename, interval)

    if repositories:
        save_repositories_to_csv(repositories, filename)
    else:
        logging.info("No repositories found within the specified date range. Exiting...")


if __name__ == "__main__":
    main()

# Get More Repository Information

## Used Languages URL 

#### ThreadPoolExecutor

In [None]:
import os


def load_tokens(env_var="GITHUB_TOKENS"):
    """Return the GitHub tokens listed in the ``env_var`` environment variable.

    The variable holds a comma-separated list of tokens. Surrounding
    whitespace and empty entries are ignored; an unset variable yields an
    empty list.
    """
    raw_value = os.environ.get(env_var, "")
    return [entry.strip() for entry in raw_value.split(",") if entry.strip()]


# Access tokens are credentials and must not be committed with the code: the
# tokens previously listed here should be considered leaked and be revoked.
tokens = load_tokens()

if not tokens:
    print("No GitHub tokens configured: set the GITHUB_TOKENS environment variable (comma-separated).")

def fetch_languages_url(repo_url, token):
    """Request a repository resource and extract its ``languages_url`` field.

    Args:
        repo_url: URL that is requested with a GET.
        token: GitHub token sent in the ``Authorization`` header.

    Returns:
        A ``(languages_url, remaining_requests, reset_time)`` tuple.
        ``languages_url`` is ``None`` unless the response status is 200 and
        the JSON body contains that key. When the remaining quota is 0 (or the
        header is absent) ``reset_time`` is replaced by "now + 1 second".
    """
    reply = requests.get(repo_url, headers={'Authorization': f'token {token}'})

    quota_left = int(reply.headers.get('X-RateLimit-Remaining', 0))
    quota_reset = int(reply.headers.get('X-RateLimit-Reset', 0))

    if quota_left == 0:
        # Pretend the quota resets in one second so that callers retry soon.
        quota_reset = time.time() + 1

    found_url = None
    if reply.status_code == 200:
        payload = reply.json()
        if 'languages_url' in payload:
            found_url = payload['languages_url']

    return found_url, quota_left, quota_reset

def update_csv_file(csv_file, output_csv_file):
    """Copy ``csv_file`` to ``output_csv_file`` with a ``languages_url`` column.

    Rows are processed by a small thread pool. For each row the repository
    URL (column ``url``) is resolved with ``fetch_languages_url``; when the
    current token has no quota left the next token of ``tokens`` is tried,
    each token at most once per row. A row whose lookup fails is still written
    (with an empty ``languages_url``) so that no input row is lost.

    Args:
        csv_file: Path of the input CSV; must contain a ``url`` column.
        output_csv_file: Path of the CSV to create (overwritten).
    """
    import threading

    with open(csv_file, 'r') as file:
        print('Opening the input file...')
        reader = csv.DictReader(file)
        headers = reader.fieldnames + ['languages_url']
        rows = list(reader)

    with open(output_csv_file, 'w', newline='') as file:
        print('Opening the output file...')
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()

        # The writer and the token cursor are shared by every worker thread,
        # so all accesses go through this lock.
        lock = threading.Lock()
        token_index = 0

        def process_row(index, row):
            nonlocal token_index

            repo_url = row['url']
            languages_url = None
            tokens_count = len(tokens)

            for attempt in range(1, tokens_count + 1):
                with lock:
                    used_index = token_index
                languages_url, remaining_requests, _reset_time = fetch_languages_url(repo_url, tokens[used_index])

                if languages_url is not None or remaining_requests > 0:
                    break

                if attempt == tokens_count:
                    print('Rate limit exceeded for all tokens. Recording the row without a languages URL...')
                    break

                with lock:
                    # Rotate only if another worker has not already done so.
                    if token_index == used_index:
                        token_index = (token_index + 1) % tokens_count
                    next_index = token_index
                print(f'Rate limit exceeded. Switching to the next token (Attempt {attempt + 1})...')
                print(f'Switching to token index: {next_index}')

            row['languages_url'] = languages_url
            with lock:
                writer.writerow(row)

            # Log progress
            if languages_url is not None:
                print(f'Success: Scraped languages URL for repository: {repo_url}')
            else:
                print(f'Failure: Failed to scrape languages URL for repository: {repo_url}')

            # ``index`` is passed explicitly: reading the submitting loop's
            # variable from a worker would observe a later row's index.
            if (index + 1) % 950 == 0:
                print('Sleeping for 1 minute...')
                time.sleep(60)  # Sleep for 1 minute after every 950 processed repos

        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(process_row, index, row) for index, row in enumerate(rows)]

        # All futures are finished here; surface errors instead of hiding them.
        failed_rows = 0
        for future in futures:
            error = future.exception()
            if error is not None:
                failed_rows += 1
                print(f'Error while processing a repository: {error!r}')

        if failed_rows:
            print(f'Finished with {failed_rows} repositories that could not be processed.')
        else:
            print('All repositories processed successfully.')

# Example usage: add the languages_url column to FirstData.csv, writing the
# result to SecondData.csv.
input_csv_file, output_csv_file = 'FirstData.csv', 'SecondData.csv'
update_csv_file(csv_file=input_csv_file, output_csv_file=output_csv_file)

#### Iterating Over Tokens

In [None]:
import os


def load_tokens(env_var="GITHUB_TOKENS"):
    """Return the GitHub tokens listed in the ``env_var`` environment variable.

    The variable holds a comma-separated list of tokens. Surrounding
    whitespace and empty entries are ignored; an unset variable yields an
    empty list.
    """
    raw_value = os.environ.get(env_var, "")
    return [entry.strip() for entry in raw_value.split(",") if entry.strip()]


# Access tokens are credentials and must not be committed with the code: the
# tokens previously listed here should be considered leaked and be revoked.
tokens = load_tokens()

if not tokens:
    print("No GitHub tokens configured: set the GITHUB_TOKENS environment variable (comma-separated).")


def fetch_languages_url(repo_url , token):
    """Request a repository resource and extract its ``languages_url`` field.

    A single request is issued per attempt (the previous version repeated the
    same request up to three times, wasting rate-limit quota). Connection
    errors are retried in a loop after a 200 second pause.

    Args:
        repo_url: URL that is requested with a GET.
        token: GitHub token sent in the ``Authorization`` header.

    Returns:
        A ``(languages_url, remaining_requests)`` tuple. ``languages_url`` is
        ``None`` when the status is not 200, when the field is missing, or
        when the user chose to continue after a keyboard interruption.

    Raises:
        SystemExit: If the user declines to continue after an interruption.
    """
    import sys

    headers = {'Authorization': f'token {token}'}
    # Defined up front so the interruption handler can always report a value.
    remaining_requests = 0

    while True:
        try:
            response = requests.get(repo_url, headers=headers)
            remaining_requests = int(response.headers.get('X-RateLimit-Remaining', 0))

            if remaining_requests < 5:
                # The caller rotates the token based on the returned quota.
                print(f'No more remaining_requests  : {remaining_requests} , we gonna change the token ' )

            if response.status_code == 200:
                repo_data = response.json()

                if 'languages_url' in repo_data:
                    return repo_data['languages_url'], remaining_requests

                print(" The field is not found within the api provided information ")

            return None , remaining_requests

        except KeyboardInterrupt:

            print(" Keyorad Intteruption !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ")
            answer = str(input("Do you want complete scraping , Y / n ")).strip().lower()

            # ``answer == 'Y' or 'y' or 'yes'`` was always true; compare
            # against the accepted answers instead (empty input means yes).
            if answer in ('', 'y', 'yes'):
                print("The Programm Will Continue Scraping ------------> ")
                return None, remaining_requests

            sys.exit(0)

        except requests.exceptions.RequestException:
            print('Connection error occurred. Retrying after 200 seconds...')
            time.sleep(200)
            # Loop again instead of recursing, so that a long outage cannot
            # exhaust the recursion limit.


def update_csv_file(csv_file, output_csv_file, start_index):
    """Append rows of ``csv_file`` (from ``start_index``) enriched with ``languages_url``.

    The output is opened in append mode so that an interrupted run can be
    resumed; the header row is written only when the output file is new or
    empty. Tokens of ``tokens`` are rotated whenever the quota reported by
    ``fetch_languages_url`` drops below 5.

    Args:
        csv_file: Path of the input CSV; must contain a ``url`` column.
        output_csv_file: Path of the CSV that rows are appended to.
        start_index: Zero-based index of the first input row to process.
    """
    import os

    with open(csv_file, 'r') as file:
        print('Opening the input file in read mode  >>>>>>>>>>>>>>>>>>>>>>')
        reader = csv.DictReader(file)
        headers = reader.fieldnames + ['languages_url']
        rows = list(reader)

    # Without this a freshly created output file would have no header row.
    needs_header = not os.path.exists(output_csv_file) or os.path.getsize(output_csv_file) == 0

    with open(output_csv_file, 'a', newline='') as file:
        print('Opening the output file i append mode  >>>>>>>>>>>>>>>>>>>>>>')
        writer = csv.DictWriter(file, fieldnames = headers)
        if needs_header:
            writer.writeheader()

        switch_count = 0
        token_index = 0

        for index, row in enumerate(rows[start_index:], start = start_index):
            repo_url = row['url']
            token = tokens[token_index]

            languages_url, remaining_requests = fetch_languages_url(repo_url, token)

            row['languages_url'] = languages_url
            writer.writerow(row)
            file.flush()  # keep the file usable as a resume point

            if languages_url is not None:
                print(f'Success: Scraped languages URL for repository at index {index}')
            else:
                print(f'Failure: Failed to scrape languages URL for repository at index {index}')

            if (index + 1) % 950 == 0:
                print('Sleeping for 1 minute to avoid day rate limit issues  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                time.sleep(60)

            # handling the rate limit problem: an unknown quota (None) is
            # treated like an exhausted one.
            if remaining_requests is None or remaining_requests < 5:
                switch_count += 1
                print(f'Rate limit exceeded. Switching to the next token (Attempt {switch_count})...')
                token_index = (token_index + 1) % len(tokens)
                print(f'Switching to token index: {token_index}')

        print('All repositories processed successfully.')


# Sequential run: the first row to process is asked interactively so that an
# interrupted scrape can be resumed.
input_csv_file, output_csv_file = 'FirstData.csv', 'SecondData.csv'
start_index = int(input("Enter the index of the beginning: "))
update_csv_file(csv_file=input_csv_file, output_csv_file=output_csv_file, start_index=start_index)

## Used Languages Percentage 

#### ThreadPoolExecutor

In [None]:
import os


def load_tokens(env_var="GITHUB_TOKENS"):
    """Return the GitHub tokens listed in the ``env_var`` environment variable.

    The variable holds a comma-separated list of tokens. Surrounding
    whitespace and empty entries are ignored; an unset variable yields an
    empty list.
    """
    raw_value = os.environ.get(env_var, "")
    return [entry.strip() for entry in raw_value.split(",") if entry.strip()]


# Access tokens are credentials and must not be committed with the code: the
# tokens previously listed here should be considered leaked and be revoked.
tokens = load_tokens()

if not tokens:
    print("No GitHub tokens configured: set the GITHUB_TOKENS environment variable (comma-separated).")


def fetch_languages(languages_url, token):
    """Download the language statistics served at ``languages_url``.

    When the response reports no remaining quota, the function sleeps until
    the advertised reset time and repeats the request once.

    A ``KeyboardInterrupt`` is no longer swallowed: returning ``None`` from
    the handler made the caller's three-value unpacking fail with a
    ``TypeError``.

    Args:
        languages_url: URL that is requested with a GET.
        token: GitHub token sent in the ``Authorization`` header.

    Returns:
        A ``(languages_data, remaining_requests, reset_time)`` tuple;
        ``languages_data`` is the decoded JSON body on status 200 and ``None``
        otherwise.
    """
    headers = {'Authorization': f'token {token}'}

    def request_once():
        """Perform one GET and read the rate-limit headers (0 when absent)."""
        reply = requests.get(languages_url, headers=headers)
        remaining = int(reply.headers.get('X-RateLimit-Remaining', 0))
        reset = int(reply.headers.get('X-RateLimit-Reset', 0))
        return reply, remaining, reset

    response, remaining_requests, reset_time = request_once()

    if remaining_requests == 0:
        sleep_time = max(reset_time - time.time(), 0)
        if sleep_time > 0:
            print(f'Sleeping for {sleep_time} seconds due to rate limit...')
            time.sleep(sleep_time)

        response, remaining_requests, reset_time = request_once()

    if response.status_code == 200:
        return response.json(), remaining_requests, reset_time

    return None, remaining_requests, reset_time


# Function to update the CSV file with the used languages and their percentages
def update_csv_file(csv_file, output_csv_file):
    """Copy ``csv_file`` to ``output_csv_file`` with a ``used_languages`` column.

    Rows are processed by a two-thread pool. For each row the statistics at
    ``languages_url`` are fetched with ``fetch_languages`` and stored as
    ``"<language>: <percentage>%"`` entries joined by ``", "``. Every input
    row is written; a failed lookup yields an empty ``used_languages``.

    Args:
        csv_file: Path of the input CSV; must contain ``languages_url``.
        output_csv_file: Path of the CSV to create (overwritten).
    """
    import threading

    with open(csv_file, 'r') as file:
        print('Opening the input file ------------------ ')
        reader = csv.DictReader(file)
        headers = reader.fieldnames + ['used_languages']
        rows = list(reader)

    with open(output_csv_file, 'w', newline='') as file:
        print('Opening the output file ------------------ ')
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()

        # Shared between the workers, hence guarded by the lock.
        lock = threading.Lock()
        token_index = 0
        retry_count = 0

        def format_languages(languages_data):
            """Render byte counts per language as percentages of the total."""
            total_bytes = sum(languages_data.values())
            if total_bytes == 0:
                return ''
            return ', '.join(
                f'{language}: {(bytes_count / total_bytes) * 100:.2f}%'
                for language, bytes_count in languages_data.items()
            )

        def process_row(index, row):
            nonlocal token_index, retry_count

            with lock:
                used_index = token_index

            languages_data, remaining_requests, _reset_time = fetch_languages(row['languages_url'], tokens[used_index])

            if languages_data is not None:
                row['used_languages'] = format_languages(languages_data)
            else:
                row['used_languages'] = ''

            with lock:
                writer.writerow(row)

            if languages_data is not None:
                print(f'Success: Scraped used languages for repository at index {index}')
            else:
                print(f'Failure: Failed to scrape used languages for repository at index {index}')

            if (index + 1) % 950 == 0:
                print('Sleeping for 1 minute...')
                time.sleep(60)  # Sleep for 1 minute after every 950 processed repos

            if remaining_requests == 0:
                with lock:
                    # Rotate only if another worker has not already done so.
                    rotated = token_index == used_index
                    if rotated:
                        token_index = (token_index + 1) % len(tokens)
                        retry_count += 1
                        attempt, next_index = retry_count, token_index
                if rotated:
                    print(f'Switching to token {next_index + 1}...')
                    print(f'Rate limit reached. Retrying in 2 minutes (Attempt {attempt})...')
                    print('Sleeping for 2 minutes...')
                    time.sleep(120)  # Sleep for 2 minutes before retrying

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = [executor.submit(process_row, index, row) for index, row in enumerate(rows)]

        # All futures are finished here; surface errors instead of hiding them.
        failed_rows = 0
        for future in futures:
            error = future.exception()
            if error is not None:
                failed_rows += 1
                print(f'Error while processing a repository: {error!r}')

        if failed_rows:
            print(f'Finished with {failed_rows} repositories that could not be processed.')
        else:
            print('All repositories processed successfully.')


# Threaded run: add the used_languages column to the cleaned repository list.
input_csv_file, output_csv_file = 'SecondCleannedRepositories.csv', 'GithubRepoData.csv'
update_csv_file(csv_file=input_csv_file, output_csv_file=output_csv_file)

#### Iterating Over Tokens

In [None]:
import os


def load_tokens(env_var="GITHUB_TOKENS"):
    """Return the GitHub tokens listed in the ``env_var`` environment variable.

    The variable holds a comma-separated list of tokens. Surrounding
    whitespace and empty entries are ignored; an unset variable yields an
    empty list.
    """
    raw_value = os.environ.get(env_var, "")
    return [entry.strip() for entry in raw_value.split(",") if entry.strip()]


# Access tokens are credentials and must not be committed with the code: the
# tokens previously listed here should be considered leaked and be revoked.
tokens = load_tokens()

if not tokens:
    print("No GitHub tokens configured: set the GITHUB_TOKENS environment variable (comma-separated).")


def fetch_languages(languages_url, token):
    """Download the language statistics served at ``languages_url``.

    A single request is issued per attempt (the previous version repeated the
    same request up to three times, wasting rate-limit quota). Connection
    errors are retried in a loop after a 200 second pause.

    Args:
        languages_url: URL that is requested with a GET.
        token: GitHub token sent in the ``Authorization`` header.

    Returns:
        A ``(languages_data, remaining_requests)`` tuple; ``languages_data``
        is the decoded JSON body on status 200 and ``None`` otherwise (also
        when the user chose to continue after a keyboard interruption).

    Raises:
        SystemExit: If the user declines to continue after an interruption.
    """
    import sys

    headers = {'Authorization': f'token {token}'}
    # Defined up front so the interruption handler can always report a value.
    remaining_requests = 0

    while True:
        try:
            response = requests.get(languages_url, headers=headers)
            remaining_requests = int(response.headers.get('X-RateLimit-Remaining', 0))

            if remaining_requests < 5 :
                # The caller rotates the token based on the returned quota.
                print(f'No more remaining_requests  : {remaining_requests} , we gonna change the token ' )

            if response.status_code == 200:
                return response.json(), remaining_requests

            return None, remaining_requests

        except KeyboardInterrupt:

            print(" Keyorad Intteruption !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ")
            answer = str(input("Do you want complete scraping , Y / n ")).strip().lower()

            # ``answer == 'Y' or 'y' or 'yes'`` was always true; compare
            # against the accepted answers instead (empty input means yes).
            if answer in ('', 'y', 'yes'):
                print("The Programm Will Continue Scraping ------------> ")
                return None, remaining_requests

            sys.exit(0)

        except requests.exceptions.RequestException:
            print('Connection error occurred. Retrying after 200 seconds...')
            time.sleep(200)
            # Loop again instead of recursing, so that a long outage cannot
            # exhaust the recursion limit.


# Function to update the CSV file with the used languages and their percentages : 

def update_csv_file(csv_file, output_csv_file , start_index):
    """Append rows of ``csv_file`` (from ``start_index``) enriched with ``used_languages``.

    The output is opened in append mode so that an interrupted run can be
    resumed; the header row is written only when the output file is new or
    empty. Every processed row is written, with an empty ``used_languages``
    when the lookup failed, so that output rows stay aligned with input
    indices. Tokens of ``tokens`` are rotated whenever the quota reported by
    ``fetch_languages`` drops below 5.

    Args:
        csv_file: Path of the input CSV; must contain ``languages_url``.
        output_csv_file: Path of the CSV that rows are appended to.
        start_index: Zero-based index of the first input row to process.
    """
    import os

    with open(csv_file, 'r') as file:
        print('Opening the input file ------------------ ')
        reader = csv.DictReader(file)
        headers = reader.fieldnames + ['used_languages']
        rows = list(reader)

    # Without this a freshly created output file would have no header row.
    needs_header = not os.path.exists(output_csv_file) or os.path.getsize(output_csv_file) == 0

    with open(output_csv_file, 'a', newline='') as file:
        print('Opening the output file ------------------ ')
        writer = csv.DictWriter(file, fieldnames = headers)
        if needs_header:
            writer.writeheader()

        switch_count = 0
        token_index = 0

        for index, row in enumerate(rows[start_index:], start = start_index):
            languages_url = row['languages_url']
            token = tokens[token_index]

            languages_data, remaining_requests = fetch_languages(languages_url, token)

            if languages_data is not None:
                total_bytes = sum(languages_data.values())
                # Guard against a zero total, which would divide by zero.
                row['used_languages'] = ', '.join(
                    f'{language}: {(bytes_count / total_bytes) * 100:.2f}%'
                    for language, bytes_count in languages_data.items()
                ) if total_bytes else ''
                print(f'Success: Scraped Used languages By Percentage for repository at index {index}')
            else:
                row['used_languages'] = ''
                print(f'Failure: Failed to scrape used languages for repository at index {index}')

            writer.writerow(row)
            file.flush()  # keep the file usable as a resume point

            if (index + 1) % 950 == 0:
                print('Sleeping for 1 minute to avoid day rate limit issues  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                time.sleep(60)

            # An unknown quota (None) is treated like an exhausted one.
            if remaining_requests is None or remaining_requests < 5:
                switch_count += 1
                print(f'Rate limit exceeded. Switching to the next token (Attempt {switch_count})...')
                token_index = (token_index + 1) % len(tokens)
                print(f'Switching to token index: {token_index}')
                time.sleep(60)

        print('All repositories processed successfully.')
                
                
                
# Sequential run: the first row to process is asked interactively so that an
# interrupted scrape can be resumed.
input_csv_file, output_csv_file = 'Data2.csv', 'ThirdData.csv'
start_index = int(input("Enter the index of the beginning: "))
update_csv_file(csv_file=input_csv_file, output_csv_file=output_csv_file, start_index=start_index)

## Get The Top 5 Contributors

#### ThreadPoolExecutor

In [None]:
import requests
import csv
import time
from concurrent.futures import ThreadPoolExecutor

import os


def load_tokens(env_var="GITHUB_TOKENS"):
    """Return the GitHub tokens listed in the ``env_var`` environment variable.

    The variable holds a comma-separated list of tokens. Surrounding
    whitespace and empty entries are ignored; an unset variable yields an
    empty list.
    """
    raw_value = os.environ.get(env_var, "")
    return [entry.strip() for entry in raw_value.split(",") if entry.strip()]


# Access tokens are credentials and must not be committed with the code: the
# tokens previously listed here should be considered leaked and be revoked.
tokens = load_tokens()

if not tokens:
    print("No GitHub tokens configured: set the GITHUB_TOKENS environment variable (comma-separated).")


def fetch_contributors(contributors_url, token):
    """Fetch the first five contributors listed at ``contributors_url``.

    When no quota remains, the request is repeated after the ``Retry-After``
    delay (1 second when the header is absent). A 403 answer is retried with
    exponential backoff, but only a bounded number of times: the previous
    unbounded loop never ended when the 403 was permanent.

    A ``KeyboardInterrupt`` is no longer swallowed: returning ``None`` from
    the handler made the caller's three-value unpacking fail with a
    ``TypeError``.

    Args:
        contributors_url: URL that is requested with a GET.
        token: GitHub token sent in the ``Authorization`` header.

    Returns:
        A ``(contributors, remaining_requests, reset_time)`` tuple.
        ``contributors`` is a list of dicts with the keys ``username``,
        ``contributions`` and ``type`` on status 200, ``None`` otherwise.
    """
    headers = {'Authorization': f'token {token}'}
    max_forbidden_retries = 5

    def request_once():
        """Perform one GET and read the rate-limit headers (0 when absent)."""
        reply = requests.get(contributors_url, headers=headers)
        remaining = int(reply.headers.get('X-RateLimit-Remaining', 0))
        reset = int(reply.headers.get('X-RateLimit-Reset', 0))
        return reply, remaining, reset

    response, remaining_requests, reset_time = request_once()

    if remaining_requests == 0:
        retry_after = int(response.headers.get('Retry-After', 1))
        print(f'Rate limit reached. Retrying after {retry_after} seconds...')
        time.sleep(retry_after)
        response, remaining_requests, reset_time = request_once()

    retry_count = 0

    while response.status_code == 403 and retry_count < max_forbidden_retries:
        print(f'Rate limit reached. Retrying after exponential backoff...')
        time.sleep(2 ** retry_count)  # Exponential backoff
        response, remaining_requests, reset_time = request_once()
        retry_count += 1

    if response.status_code == 200:
        # Extract contributor details of the first five entries
        contributors = [
            {
                'username': contributor['login'],
                'contributions': contributor['contributions'],
                'type': contributor['type'],
            }
            for contributor in response.json()[:5]
        ]
        return contributors, remaining_requests, reset_time

    return None, remaining_requests, reset_time

def update_csv_file(csv_file, output_csv_file):
    """Copy ``csv_file`` to ``output_csv_file`` with a ``top_contributors`` column.

    Rows are processed by a two-thread pool. For each row the contributors at
    ``contributers_url`` are fetched with ``fetch_contributors`` and stored as
    ``"<username> (<type>) - <n> contributions"`` entries joined by ``", "``.
    Every input row is written; a failed lookup yields an empty value.

    Args:
        csv_file: Path of the input CSV; must contain the ``contributers_url``
            and ``url`` columns.
        output_csv_file: Path of the CSV to create (overwritten).
    """
    import threading

    # The input is only read, so it does not need to be opened for update.
    with open(csv_file, 'r') as file:
        print(f'Opening the input file...')
        reader = csv.DictReader(file)
        headers = reader.fieldnames + ['top_contributors']
        rows = list(reader)

    with open(output_csv_file, 'w+', newline='') as file:
        print(f'Creating the output file...')
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()

        # Shared between the workers, hence guarded by the lock.
        lock = threading.Lock()
        token_index = 0
        retry_count = 0

        def process_row(index, row):
            nonlocal token_index, retry_count

            with lock:
                used_index = token_index

            contributors_data, remaining_requests, _reset_time = fetch_contributors(row['contributers_url'], tokens[used_index])

            row['top_contributors'] = ''

            if contributors_data is not None:
                top_contributors = [f"{contributor['username']} ({contributor['type']}) - {contributor['contributions']} contributions" for contributor in contributors_data]
                row['top_contributors'] = ', '.join(top_contributors)

            with lock:
                writer.writerow(row)

            # Log progress
            repo_url = row['url']
            if contributors_data is not None:
                print(f'Success: Scraped top contributors for repository at index {index}: {repo_url}')
            else:
                print(f'Failure: Failed to scrape top contributors for repository at index {index}: {repo_url}')

            if (index + 1) % 950 == 0:
                print('Sleeping for 1 minute...')
                time.sleep(60)  # Sleep for 1 minute after every 950 requests

            if remaining_requests == 0:
                with lock:
                    # Rotate only if another worker has not already done so.
                    rotated = token_index == used_index
                    if rotated:
                        token_index = (token_index + 1) % len(tokens)
                        retry_count += 1
                        attempt, next_index = retry_count, token_index
                if rotated:
                    print(f'Rate limit exceeded. Switching to the next token (Attempt {attempt + 1})...')
                    print(f'Switching to token index: {next_index}')
                    print('Sleeping for 2 minutes...')
                    time.sleep(120)  # Sleep for 2 minutes before switching to the next token

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = [executor.submit(process_row, index, row) for index, row in enumerate(rows)]

        # All futures are finished here; surface errors instead of hiding them.
        failed_rows = 0
        for future in futures:
            error = future.exception()
            if error is not None:
                failed_rows += 1
                print(f'Error while processing a repository: {error!r}')

        if failed_rows:
            print(f'Finished with {failed_rows} repositories that could not be processed.')
        else:
            print('All repositories processed successfully.')

# Threaded run: add the top_contributors column to the cleaned repository list.
input_csv_file, output_csv_file = 'SecondCleannedRepositories.csv', 'finalData.csv'
update_csv_file(csv_file=input_csv_file, output_csv_file=output_csv_file)


#### Iterating Over Tokens

In [None]:
import os


def load_tokens(env_var="GITHUB_TOKENS"):
    """Return the GitHub tokens listed in the ``env_var`` environment variable.

    The variable holds a comma-separated list of tokens. Surrounding
    whitespace and empty entries are ignored; an unset variable yields an
    empty list.
    """
    raw_value = os.environ.get(env_var, "")
    return [entry.strip() for entry in raw_value.split(",") if entry.strip()]


# Access tokens are credentials and must not be committed with the code: the
# tokens previously listed here should be considered leaked and be revoked.
tokens = load_tokens()

if not tokens:
    print("No GitHub tokens configured: set the GITHUB_TOKENS environment variable (comma-separated).")


def fetch_contributors(contributors_url, token):
    """Fetch the first five contributors listed at ``contributors_url``.

    A single request is issued per attempt (the previous version repeated the
    same request up to three times, wasting rate-limit quota). Connection
    errors are retried in a loop after a 200 second pause; the previous
    handler called ``fetch_languages`` with an undefined ``languages_url``.

    Args:
        contributors_url: URL that is requested with a GET.
        token: GitHub token sent in the ``Authorization`` header.

    Returns:
        A ``(contributors, remaining_requests)`` tuple. ``contributors`` is a
        list of dicts with the keys ``username``, ``contributions`` and
        ``type`` on status 200, ``None`` otherwise (also when the user chose
        to continue after a keyboard interruption).

    Raises:
        SystemExit: If the user declines to continue after an interruption.
    """
    import sys

    headers = {'Authorization': f'token {token}'}
    # Defined up front so the interruption handler can always report a value.
    remaining_requests = 0

    while True:
        try:
            response = requests.get(contributors_url, headers=headers)
            remaining_requests = int(response.headers.get('X-RateLimit-Remaining', 0))

            if remaining_requests < 5:
                # The caller rotates the token based on the returned quota.
                print(f'No more remaining_requests  : {remaining_requests} , we gonna change the token ' )

            if response.status_code == 200:
                # Extract contributor details of the first five entries
                contributors = [
                    {
                        'username': contributor['login'],
                        'contributions': contributor['contributions'],
                        'type': contributor['type'],
                    }
                    for contributor in response.json()[:5]
                ]
                return contributors, remaining_requests

            return None, remaining_requests

        except KeyboardInterrupt:

            print(" Keyorad Intteruption !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ")
            answer = str(input("Do you want complete scraping , Y / n ")).strip().lower()

            # ``answer == 'Y' or 'y' or 'yes'`` was always true; compare
            # against the accepted answers instead (empty input means yes).
            if answer in ('', 'y', 'yes'):
                print("The Programm Will Continue Scraping ------------> ")
                return None, remaining_requests

            sys.exit(0)

        except requests.exceptions.RequestException:
            print('Connection error occurred. Retrying after 200 seconds...')
            time.sleep(200)
            # Loop again (same URL, same token) instead of recursing.


def update_csv_file(csv_file, output_csv_file , start_index):
    """Append rows of ``csv_file`` (from ``start_index``) enriched with ``top_contributors``.

    The output is opened in append mode so that an interrupted run can be
    resumed; the header row is written only when the output file is new or
    empty. Tokens of ``tokens`` are rotated whenever the quota reported by
    ``fetch_contributors`` drops below 5.

    Args:
        csv_file: Path of the input CSV; must contain ``contributers_url``.
        output_csv_file: Path of the CSV that rows are appended to.
        start_index: Zero-based index of the first input row to process.
    """
    import os

    # The input is only read, so it does not need to be opened for update.
    with open(csv_file, 'r') as file:
        print(f'Opening the input file >>>>>>>>>>>>>>>>>')
        reader = csv.DictReader(file)
        headers = reader.fieldnames + ['top_contributors']
        rows = list(reader)

    # Without this a freshly created output file would have no header row.
    needs_header = not os.path.exists(output_csv_file) or os.path.getsize(output_csv_file) == 0

    with open(output_csv_file, 'a', newline='') as file:
        print(f'Openning the output file <<<<<<<<<<<<<<<')
        writer = csv.DictWriter(file, fieldnames=headers)
        if needs_header:
            writer.writeheader()

        switch_count = 0
        token_index = 0

        for index, row in enumerate(rows[start_index:], start = start_index):
            contributors_url = row['contributers_url']
            token = tokens[token_index]

            contributors_data, remaining_requests = fetch_contributors(contributors_url, token)

            if contributors_data is not None:
                top_contributors = [f"{contributor['username']} ({contributor['type']}) - {contributor['contributions']} contributions" for contributor in contributors_data]
                row['top_contributors'] = ', '.join(top_contributors)
                # Messages used to mention "used languages" (copy-paste slip).
                print(f'Success: Scraped top contributors for repository at index {index}')
            else:
                row['top_contributors'] = ''
                print(f'Failure: Failed to scrape top contributors for repository at index {index}')

            writer.writerow(row)
            file.flush()  # keep the file usable as a resume point

            if (index + 1) % 950 == 0:
                print('Sleeping for 1 minute to avoid day rate limit issues  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                time.sleep(60)

            # An unknown quota (None) is treated like an exhausted one.
            if remaining_requests is None or remaining_requests < 5:
                switch_count += 1
                print(f'Rate limit exceeded. Switching to the next token (Attempt {switch_count})...')
                token_index = (token_index + 1) % len(tokens)
                print(f'Switching to token index: {token_index}')

        print('All repositories processed successfully.')

# Sequential run: the first row to process is asked interactively so that an
# interrupted scrape can be resumed.
input_csv_file, output_csv_file = 'ThirdData.csv', 'finalData.csv'
start_index = int(input("Enter the index of the beginning: "))
update_csv_file(csv_file=input_csv_file, output_csv_file=output_csv_file, start_index=start_index)