In [None]:
import json
import csv
import pandas
import os

This notebook takes the JSON files obtained during data fetching and outputs them as CSV files.



# Basic data 
In the section below you can assign the destination of the output files and the projects that you want to process.

In [None]:
# Names of the projects whose commit data has been collected.
project_list = ["numpy","networkx","OpenMetaData","tensorflow"]

# Input JSON file of each project; every path is echoed as it is registered.
raw_json_paths = {}
for project in project_list:
    raw_json_paths[project] = f"../Data/raw_json/{project}.json"
    print(raw_json_paths[project])

# Directory (with trailing slash) that receives each project's CSV files.
output_csv_path = {name: f"../Data/csv/{name}/" for name in project_list}

In [None]:
# Map every project to a short string code derived from its position in
# project_list: the index, a literal '0', then the index again.
project_code = {name: f"{position}0{position}" for position, name in enumerate(project_list)}
print(project_code)

# Extract Author info
This section outputs all necessary information about the authors and committers into `unique_authors.csv` and `unique_committers.csv`.

In [None]:
def load_commit_data(filename):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        filename (str): Path of the JSON file to read.

    Returns:
        The deserialized JSON document.
    """
    with open(filename, mode='r', encoding='utf-8') as json_in:
        parsed = json.load(json_in)
    return parsed

def extract_unique_users(commit_data):
    """Collect the distinct authors and committers referenced by a list of commits.

    Args:
        commit_data (list): Commit dicts. The 'author' and 'committer' entries
            are read; an entry that is missing or empty is skipped.

    Returns:
        tuple: (authors, committers). Each is a list of dicts with the keys
        'login', 'id' and 'url', one per distinct user id, ordered by first
        appearance.
    """
    def summarize(user):
        # Keep only the exported fields; 'url' is filled from 'html_url'.
        return {
            'login': user.get('login'),
            'id': user.get('id'),
            'url': user.get('html_url'),
        }

    users_by_role = {'author': {}, 'committer': {}}
    for entry in commit_data:
        for role, known_users in users_by_role.items():
            user = entry.get(role)
            if user:
                # Keyed by id: a user seen again replaces the earlier record
                # but keeps its original position.
                known_users[user['id']] = summarize(user)

    return list(users_by_role['author'].values()), list(users_by_role['committer'].values())

def save_to_csv(data, filename):
    """Write user records to a CSV file with the columns login, id and url.

    Args:
        data (iterable): Dicts whose keys are drawn from 'login', 'id', 'url'.
        filename (str): Path of the CSV file to create or overwrite.
    """
    columns = ['login', 'id', 'url']  # Header row of the CSV file
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_out:
        user_writer = csv.DictWriter(csv_out, fieldnames=columns)
        user_writer.writeheader()
        user_writer.writerows(data)
    print(f"Data saved to {filename}")

In [None]:
# Export the unique authors and committers of every project to CSV.
for project in project_list:
    input_json = raw_json_paths[project]  # Ensure this path is correct
    # Create the per-project output directory up front: open(..., 'w') does not
    # create missing parent directories, so a fresh checkout would fail here.
    os.makedirs(output_csv_path[project], exist_ok=True)
    output_authors_csv = output_csv_path[project]+"unique_authors.csv"
    output_committers_csv = output_csv_path[project]+"unique_committers.csv"
    # Load commit data from JSON file
    commit_data = load_commit_data(input_json)
    print(f"Loaded {len(commit_data)} Commits")
    # Show the available fields; skipped for a project without any commits,
    # where indexing the first element would raise IndexError.
    if commit_data:
        print(commit_data[0].keys())
    # Extract unique authors and committers
    authors, committers = extract_unique_users(commit_data)

    # Save the unique authors and committers to separate CSV files
    save_to_csv(authors, output_authors_csv)
    save_to_csv(committers, output_committers_csv)

            

# Commit_data
This section deals with commits; the outcome will be saved to `commits.csv`.

In [None]:
def save_commits_to_csv(json_file, csv_file):
    """Save commit data from a JSON file to a CSV file with fixed headers.

    One row is written per commit. Missing information is replaced by a
    sentinel instead of aborting the export:
      * author_id / committer_id: -1 when the commit has no linked user.
      * pr_id: -1 when no pull request is associated with the commit.
      * parent_sha: empty when the commit has no parent (a root commit).

    Only the first parent and the first pull request of a commit are recorded.

    NOTE(review): pr_id is written as the raw pull-request id here, while
    process_pull_requests/process_reviewers prefix it with the project code;
    confirm how these tables are meant to be joined.

    Args:
        json_file (str): Path to the input JSON file containing commit data.
        csv_file (str): Path to the output CSV file to create.
    """
    # Fixed CSV headers
    csv_headers = ['self_sha', 'author_id', 'parent_sha', 'pr_id','date','committer_id']

    # Load JSON data from the specified file
    with open(json_file, 'r', encoding='utf-8') as file:
        commits = json.load(file)

    # Open the CSV file for writing
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_headers)
        writer.writeheader()

        # Write each commit to the CSV file
        for commit in commits:
            # Each of these may be absent, null or empty in the fetched data.
            pull_requests = commit.get('pull_requests') or []
            parents = commit.get('parents') or []
            author = commit.get('author')
            committer = commit.get('committer')
            writer.writerow({
                'self_sha': commit['sha'],
                'author_id': author['id'] if author else -1,
                'parent_sha': parents[0]['sha'] if parents else '',
                'pr_id': pull_requests[0]['id'] if pull_requests else -1,
                'date': commit['commit']['author']['date'],
                'committer_id': committer['id'] if committer else -1
            })
    print(f'CSV file created successfully at {csv_file}')

In [None]:

# Write commits.csv into the output directory of every configured project.
for project in project_list:
    json_path = raw_json_paths[project]
    csv_output = f"{output_csv_path[project]}commits.csv"
    save_commits_to_csv(json_file=json_path, csv_file=csv_output)

# Pull Request Data


In [None]:
def process_pull_requests(json_filepath, csv_filepath,project_code):
    """Export the distinct pull requests attached to commits into a CSV file.

    Every commit's 'pull_requests' list is flattened; a pull request that shows
    up under several commits is written only once (first occurrence wins).

    Args:
        json_filepath (str): Path to the JSON file holding the commit list.
        csv_filepath (str): Path of the CSV file to create or overwrite.
        project_code (str): Prefix prepended to every pull-request id to form
            the 'pr_id' column.
    """
    print(f"Processing {csv_filepath}")
    # Load commits from JSON file
    with open(json_filepath, 'r', encoding='utf-8') as file:
        commits = json.load(file)

    pr_list = []
    for commit in commits:
        # The key may be absent or null; both mean "no pull request".
        pr_list.extend(commit.get('pull_requests') or [])

    # CSV headers
    csv_headers = ['pr_id', 'author', 'head_sha', 'base_sha', 'created_at', 'merged_at', 'closed_at','reviewer_count']
    id_set = set()

    # Write to the CSV file
    with open(csv_filepath, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_headers)
        writer.writeheader()
        for pr in pr_list:
            if pr['id'] in id_set:
                continue
            # A null/absent reviewer list counts as zero reviewers, matching
            # how process_reviewers treats it; a null user yields an empty author.
            reviewers = pr.get('requested_reviewers') or []
            user = pr.get('user') or {}
            writer.writerow({
                'pr_id': project_code + str(pr['id']),
                'author': user.get('login'),
                'head_sha': pr['head']['sha'],
                'base_sha': pr['base']['sha'],
                'created_at': pr['created_at'],
                'closed_at': pr['closed_at'],
                'merged_at': pr['merged_at'],
                'reviewer_count': len(reviewers)
            })
            id_set.add(pr['id'])  # Update the set to prevent duplicates

    print(f"Data written to {csv_filepath} successfully.")

# Example usage

In [None]:
# Write pull_requests.csv for every configured project, tagging ids with the project's code.
for project in project_list:
    json_path = raw_json_paths[project]
    csv_path = f"{output_csv_path[project]}pull_requests.csv"
    process_pull_requests(json_path, csv_path, project_code[project])

# Collecting Reviewers
This section writes the requested reviewers of each pull request to `reviewer.csv`.

In [None]:
def process_reviewers(json_filepath, csv_filepath,project_code):
    """Export the requested reviewers of each pull request into a CSV file.

    One row is written per (pull request, requested reviewer) pair. A pull
    request whose reviewers were already written is skipped when it appears
    again; pull requests without requested reviewers produce no rows.

    Args:
        json_filepath (str): Path to the JSON file holding the commit list.
        csv_filepath (str): Path of the CSV file to create or overwrite.
        project_code (str): Prefix prepended to every pull-request id to form
            the 'pr_id' column.
    """
    print(f"Processing {csv_filepath}")
    # Read the commit list from disk
    with open(json_filepath, mode='r', encoding='utf-8') as json_in:
        commit_records = json.load(json_in)

    # Flatten the pull requests of all commits that carry the key
    all_prs = [
        pull
        for record in commit_records
        if 'pull_requests' in record
        for pull in record['pull_requests']
    ]

    columns = ['pr_id','reviewer_id','reviewer_login']
    handled_ids = set()

    with open(csv_filepath, mode='w', newline='', encoding='utf-8') as csv_out:
        reviewer_writer = csv.DictWriter(csv_out, fieldnames=columns)
        reviewer_writer.writeheader()
        for pull in all_prs:
            if pull['id'] not in handled_ids:
                prefixed_id = project_code + str(pull['id'])
                reviewers = pull['requested_reviewers']
                if reviewers:
                    reviewer_writer.writerows(
                        {
                            'pr_id': prefixed_id,
                            'reviewer_id': person['id'],
                            'reviewer_login': person['login'],
                        }
                        for person in reviewers
                    )
                    # Remember the id so later copies of this PR are skipped
                    handled_ids.add(pull['id'])

    print(f"Data written to {csv_filepath} successfully.")


# Merge to single csv file

In [None]:
# Write reviewer.csv for every configured project, tagging PR ids with the project's code.
for project in project_list:
    json_path = raw_json_paths[project]
    csv_path = f"{output_csv_path[project]}reviewer.csv"
    process_reviewers(json_path, csv_path, project_code[project])

In [None]:
# CSV merging: for each table, stack the per-project CSV files into a single
# file of the same name under the Merged directory.
csv_name_list = ['commits.csv','pull_requests.csv','reviewer.csv','unique_authors.csv','unique_committers.csv']
out_path_base = "../Data/csv/Merged/"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_path_base, exist_ok=True)
for csv_name in csv_name_list:
    dfs = []
    for project in project_list:
        csv_file = output_csv_path[project]+csv_name
        if not os.path.exists(csv_file):
            # FileNotFoundError is still an Exception, so existing handlers keep working.
            raise FileNotFoundError(f"csv_file:{csv_file} Not found, Aborting")
        # Read every field as plain text: merging is a pure concatenation, and
        # type inference would strip leading zeros from prefixed ids (a pr_id
        # such as "000123" would come back as 123) and turn logins like "null"
        # or "NA" into missing values.
        df = pandas.read_csv(csv_file, dtype=str, keep_default_na=False)
        dfs.append(df)
    merged_df = pandas.concat(dfs, ignore_index=True)
    # index=False keeps the positional index out of the file; otherwise every
    # merged CSV gains an extra unnamed column.
    merged_df.to_csv(out_path_base+csv_name, index=False)
    print(f"File saved to {out_path_base+csv_name}")




In [None]:
# Sanity check: every commit SHA must occur only once in the merged commit
# table, i.e. no SHA clash between the merged projects.
out_path_base = "../Data/csv/Merged/"
commit_df = pandas.read_csv(out_path_base+"commits.csv")
if not commit_df['self_sha'].is_unique:
    # The column checked is the commit SHA (self_sha), so the message names it.
    raise Exception("SHA clash in project: duplicate self_sha values in merged commits.csv")
else:
    print("commit SHA all good")

In [None]:
# Report whether any pr_id occurs more than once in the merged pull-request table.
pull_request_df = pandas.read_csv('../Data/csv/Merged/pull_requests.csv')
if pull_request_df['pr_id'].is_unique:
    print("Pr_id all Good")
else:
    print("There are duplicate PR_id")

In [None]:
# Deduplicate the merged author table by user id and store it as authors.csv.
authors_df = pandas.read_csv('../Data/csv/Merged/unique_authors.csv')
if authors_df['id'].is_unique:
    print("No duplicates")
else:
    print("There are duplicated entries in orginal file")

# Always write authors.csv, also when nothing had to be dropped: the check
# below reads that file, and skipping the write would make it fail on a fresh
# run or silently validate a stale file from an earlier run.
df_unique = authors_df.drop_duplicates(subset='id',keep='first')
df_unique.to_csv('../Data/csv/Merged/authors.csv',index=False,header=True)

# Re-read the file that was just written and verify the ids are now unique.
authors_df = pandas.read_csv('../Data/csv/Merged/authors.csv')
if not authors_df['id'].is_unique:
    print("There are duplicated entries")
    duplicates = authors_df[authors_df.duplicated(subset='id', keep='first')]
    print(duplicates[:10])
    raise Exception("Still duplication after dropping")
else:
    print("File saved to authors.csv")


In [None]:
# Deduplicate the merged committer table by user id and store it as committers.csv.
committers_df = pandas.read_csv('../Data/csv/Merged/unique_committers.csv')
if committers_df['id'].is_unique:
    print("No duplicates")
else:
    print("There are duplicated entries in orginal file")

# Always write committers.csv, also when nothing had to be dropped: the check
# below reads that file, and skipping the write would make it fail on a fresh
# run or silently validate a stale file from an earlier run.
df_unique = committers_df.drop_duplicates(subset='id',keep='first')
df_unique.to_csv('../Data/csv/Merged/committers.csv',index=False,header=True)

# Re-read the file that was just written and verify the ids are now unique.
committers_df = pandas.read_csv('../Data/csv/Merged/committers.csv')
if not committers_df['id'].is_unique:
    print("There are duplicated entries")
    duplicates = committers_df[committers_df.duplicated(subset='id', keep='first')]
    print(duplicates[:10])
    raise Exception("Still duplication after dropping")
else:
    print("File saved to committers.csv")
