# In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import json
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import time


def get_most_used_language(owner, repo, api_token, timeout=30):
    """Return the dominant language of a GitHub repository.

    Queries ``/repos/{owner}/{repo}/languages`` and picks the key with the
    largest value in the mapping the endpoint returns.

    Args:
        owner: Repository owner (user or organisation) as used in the URL.
        repo: Repository name.
        api_token: GitHub token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The language name on success, ``"Not Available"`` when the endpoint
        returns an empty mapping, or a human-readable error string for
        non-200 responses. Callers that store the result should be aware
        that the error strings are returned in place of a language.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds ``timeout``.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/languages"
    headers = {"Authorization": f"token {api_token}"}

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole extraction run.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code == 200:
        languages = response.json()
        if not languages:
            return "Not Available"
        # The key with the largest value is the most used language.
        return max(languages, key=languages.get)

    if response.status_code == 403:
        # Treated as a rate-limit error by this script.
        return "Rate limit exceeded. Consider using a different API token."

    return f"Failed to retrieve languages for {owner}/{repo}. Status Code: {response.status_code}"



def fetch_commit_details(github_commit_link, api_tokens, timeout=30):
    """Fetch a commit's details from the GitHub REST API given its web link.

    The owner, repository and commit SHA are taken positionally from the
    link: fourth-from-last, third-from-last and last path segment, i.e. a
    link shaped like ``.../{owner}/{repo}/commit/{sha}``.

    Args:
        github_commit_link: Web URL of the commit.
        api_tokens: Sequence of GitHub tokens; one is chosen at random for
            this request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A tuple ``(commit_details, api_url, project, owner, repo, api_token)``
        when the API answers with status 200 (``project`` is the repository
        name), otherwise ``None``. Any exception raised while parsing the
        link, choosing a token or performing the request is reported on
        stdout and also results in ``None``.
    """
    try:
        # Normalise the link so surrounding whitespace, a "#fragment", a
        # "?query" or a trailing slash does not shift the positional
        # segments used below.
        link = github_commit_link.strip()
        link = link.split('#', 1)[0].split('?', 1)[0].rstrip('/')

        parts = link.split('/')
        owner = parts[-4]
        repo = parts[-3]
        commit_sha = parts[-1]
        api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}"
        project = repo

        # Pick one token at random for this request (the list itself is
        # left untouched); raises IndexError when the list is empty.
        api_token = random.choice(api_tokens)
        headers = {"Authorization": f"token {api_token}"}

        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection.
        response = requests.get(api_url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"{response.status_code} - Failed to fetch commit details for {github_commit_link}")
            return None

        commit_details = response.json()
        return commit_details, api_url, project, owner, repo, api_token
    except Exception as e:
        # Best-effort boundary: one bad link must not abort the whole run.
        print(f"Error fetching commit details for {github_commit_link}: {e}")
        return None

def update_csv_with_commit_details(csv_file_path, api_tokens, new_csv_file_path):
    """Enrich a CSV of GitHub commit links with commit metadata.

    Reads ``csv_file_path`` (which must contain a ``github_commit_link``
    column), looks up every commit via ``fetch_commit_details`` and the
    repository language via ``get_most_used_language``, and writes the
    result to ``new_csv_file_path``.

    Rows whose commit details could not be fetched, or whose processing
    raised an exception, are dropped from the output so that it never
    contains blank or half-filled rows.

    Args:
        csv_file_path: Path of the input CSV.
        api_tokens: Sequence of GitHub tokens passed to
            ``fetch_commit_details``.
        new_csv_file_path: Path the enriched CSV is written to (without
            the index column).
    """
    df = pd.read_csv(csv_file_path)

    # Pre-create the output columns so rows can be filled in place.
    for column in ("language", "api_url", "author", "email",
                   "project", "commit_id", "commit_message", "files_changed"):
        df[column] = ""

    def process_row(index, row):
        """Fill the detail columns of one row; return its index on failure."""
        try:
            result = fetch_commit_details(row['github_commit_link'], api_tokens)
            if not result:
                return index  # Return the index so the caller can drop the row

            commit_details, api_url, project, owner, repo, api_token = result
            commit = commit_details['commit']

            # Each changed file is serialised as JSON and the pieces are
            # joined with a marker so they fit into a single CSV cell.
            files_changed = '<_**next**_>'.join(
                json.dumps(file) for file in commit_details['files']
            )

            # Compute every value before touching the DataFrame so that an
            # error part-way through cannot leave a partially filled row.
            values = {
                'language': get_most_used_language(owner, repo, api_token),
                'api_url': api_url,
                'author': commit['author']['name'],
                'email': commit['author']['email'],
                'project': project,
                'commit_id': commit_details['sha'],
                'commit_message': commit['message'],
                'files_changed': files_changed,
            }
            for column, value in values.items():
                df.at[index, column] = value
            return None
        finally:
            # Pause after every row, including failed ones, to avoid
            # hammering the API when requests are being rejected.
            time.sleep(0.3)

    drop_indices = []

    # NOTE(review): process_row writes into the shared DataFrame from the
    # worker thread; confirm that is safe before raising max_workers.
    with ThreadPoolExecutor(max_workers=1) as executor:  # Adjust the number of workers as needed
        futures = {executor.submit(process_row, index, row): index for index, row in df.iterrows()}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing rows"):
            index = futures[future]
            try:
                failed_index = future.result()
            except Exception as e:
                # Report which row failed and drop it rather than keeping
                # a row with empty detail columns.
                print(f"Error processing row {index}: {e}")
                drop_indices.append(index)
            else:
                if failed_index is not None:
                    drop_indices.append(failed_index)

    # Drop rows where the commit details could not be fetched or processed
    if drop_indices:
        df.drop(drop_indices, inplace=True)

    df.to_csv(new_csv_file_path, index=False)  # Save to a new CSV file

    # NOTE: the upload/notification calls below are commented out or absent;
    # only the status messages are printed.
    # Upload the file to MongoDB
    print("Uploading to MongoDB 🚀")
    # upload_csv_to_mongodb(new_csv_file_path)

    # Upload the file to AWS S3
    print("Uploading to AWS S3 🚀")

    # Send Slack notification
    # send_slack_message("Data Extraction and Upload Completed Successfully! 🎉")

# Example usage:
# GitHub API tokens; fetch_commit_details picks one at random per request.
# NOTE: while this list is empty, token selection fails for every row, so
# every row is dropped and the output CSV contains no data rows.
api_tokens = [

]

csv_file_path = 'all_github_commits_with_vulnerablity_2002-2024.csv'


new_csv_file_path ="all_commit_with_details_2002-24.csv"


# Run the pipeline only when executed directly (as a script or notebook
# cell), so importing this module does not trigger network requests or
# file writes.
if __name__ == "__main__":
    update_csv_with_commit_details(csv_file_path, api_tokens, new_csv_file_path)
    print("Data Extraction and Upload Completed Successfully! 🎉")