In [None]:
# Similar script to what was used for data-fetching
#!/usr/bin/env python3
import requests
import json
import base64
import logging
import io
import os
from urllib.parse import quote
from unidiff import PatchSet

# Show INFO-level progress messages produced by this script.
logging.basicConfig(level=logging.INFO)

# Connection settings. All three are blank placeholders and must be filled in
# before running: GITEA_URL prefixes every API URL (see base_api_url), OWNER and
# REPO are interpolated into the /repos/{owner}/{repo}/... paths.
GITEA_URL = ""
OWNER = ""
REPO = ""

# Replace with your actual API token.
# NOTE(review): avoid committing a real token; consider reading it from the environment.
API_TOKEN = ""

# Default headers for the JSON endpoints (token authentication, JSON responses).
headers = {
    "Authorization": f"token {API_TOKEN}",
    "Accept": "application/json"
}

# Root of the v1 REST API; every endpoint URL below is built on top of it.
base_api_url = f"{GITEA_URL}/api/v1"


def get_all_pull_requests(state="all", limit=50):
    """
    Collect every pull request of the configured repository.

    Walks the paginated pulls endpoint (``limit`` items per page, filtered by
    ``state``) until an empty page is returned. If a request fails, the error
    is logged and the pull requests gathered so far are returned.
    """
    repo_path = f"{quote(OWNER)}/{quote(REPO)}"
    query_prefix = f"{base_api_url}/repos/{repo_path}/pulls?state={state}&limit={limit}"

    collected = []
    page = 1
    while True:
        try:
            response = requests.get(f"{query_prefix}&page={page}", headers=headers, timeout=120)
            response.raise_for_status()
            batch = response.json()
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching pull requests on page {page}: {e}")
            return collected

        # An empty page marks the end of the listing.
        if not batch:
            return collected

        collected.extend(batch)
        logging.info(f"Fetched page {page} with {len(batch)} pull requests.")
        page += 1


def get_pr_data(pr_number):
    """Fetch the JSON description of one pull request; log and return None if the request fails."""
    endpoint = f"{base_api_url}/repos/{quote(OWNER)}/{quote(REPO)}/pulls/{pr_number}"
    try:
        response = requests.get(endpoint, headers=headers, timeout=120)
        response.raise_for_status()
        payload = response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"[PR #{pr_number}] Error fetching PR data: {e}")
        payload = None
    return payload


def get_pr_diff(pr_number):
    """Download the pull request's ``.diff`` as text; log and return None if the request fails."""
    # The diff is plain text, so this call asks for text/plain instead of JSON.
    diff_headers = {
        "Authorization": f"token {API_TOKEN}",
        "Accept": "text/plain"
    }
    endpoint = f"{base_api_url}/repos/{quote(OWNER)}/{quote(REPO)}/pulls/{pr_number}.diff"
    try:
        response = requests.get(endpoint, headers=diff_headers, timeout=120)
        response.raise_for_status()
        diff_text = response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"[PR #{pr_number}] Error fetching diff: {e}")
        diff_text = None
    return diff_text


def get_reviews(pr_number):
    """
    Return the parsed JSON list of reviews for a pull request.

    A failed request is logged and yields an empty list.
    """
    # NOTE(review): only a single request is made; if the server paginates this
    # endpoint, reviews beyond the first page are not fetched -- confirm.
    endpoint = f"{base_api_url}/repos/{quote(OWNER)}/{quote(REPO)}/pulls/{pr_number}/reviews"
    try:
        response = requests.get(endpoint, headers=headers, timeout=120)
        response.raise_for_status()
        reviews = response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"[PR #{pr_number}] Error fetching reviews: {e}")
        reviews = []
    return reviews


def get_review_comments(pr_number, review_id):
    """
    Return the parsed JSON list of comments that belong to one review of a pull request.

    A failed request is logged and yields an empty list.
    """
    repo_root = f"{base_api_url}/repos/{quote(OWNER)}/{quote(REPO)}"
    endpoint = f"{repo_root}/pulls/{pr_number}/reviews/{review_id}/comments"
    try:
        response = requests.get(endpoint, headers=headers, timeout=120)
        response.raise_for_status()
        comments = response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"[PR #{pr_number}] Error fetching review #{review_id} comments: {e}")
        comments = []
    return comments


def get_file_content_at_commit(file_path, commit_sha):
    """
    Return the decoded text of ``file_path`` at ``commit_sha``, or None.

    The contents endpoint is queried with ``ref=commit_sha`` and the base64
    ``content`` field of the JSON answer is decoded as UTF-8 (undecodable
    bytes are replaced).

    None is returned, after logging, when the request fails, the status is
    not 200, or the body cannot be parsed/decoded. It is returned silently
    when the answer carries no (or an empty) ``content`` field.
    """
    encoded_path = quote(file_path, safe="")
    url = f"{base_api_url}/repos/{quote(OWNER)}/{quote(REPO)}/contents/{encoded_path}?ref={commit_sha}"
    try:
        resp = requests.get(url, headers=headers, timeout=120)
        if resp.status_code == 404:
            logging.warning(f"[Commit {commit_sha}] File not found: {file_path}")
            return None
        if resp.status_code != 200:
            # Auth failures, rate limits and server errors are not "file not found";
            # report the real status so they can be told apart.
            logging.warning(
                f"[Commit {commit_sha}] Unexpected HTTP {resp.status_code} fetching file: {file_path}"
            )
            return None

        content_json = resp.json()
        # Only a JSON object can carry a 'content' field; anything else
        # (presumably a directory listing -- confirm against the API) is skipped.
        encoded_content = content_json.get('content') if isinstance(content_json, dict) else None
        if encoded_content:
            return base64.b64decode(encoded_content).decode('utf-8', errors='replace')
    except requests.exceptions.RequestException as e:
        logging.error(f"[Commit {commit_sha}] Error fetching file {file_path}: {e}")
    except ValueError as e:
        # A body that is not valid JSON may surface as a plain ValueError, and
        # malformed base64 raises binascii.Error, which is a ValueError subclass.
        logging.error(f"[Commit {commit_sha}] Error decoding file {file_path}: {e}")

    return None


def handle_pr(pr):
    """
    Turn one pull request into a list of JSON-serializable training entries.

    Steps:
      - Fetch the PR data to learn the base commit SHA
      - Fetch the .diff and parse it with unidiff
      - Collect all review comment bodies keyed by (file_path, line_number)
      - For every hunk that has at least one commented target line, emit
        {"oldf", "patch", "msg", "id", "y"}

    The pre-change file content ("oldf") is requested at most once per file,
    and only when one of its hunks actually has comments. Files added by the
    PR get an empty "oldf" without a request.

    Returns an empty list when the PR data or the diff cannot be fetched, or
    when no hunk has comments.
    """
    pr_number = pr["number"]
    pr_data = get_pr_data(pr_number)
    if not pr_data:
        return []

    # NOTE(review): assumes base.sha is the commit the diff was computed against -- confirm.
    base_sha = pr_data["base"]["sha"]
    diff_text = get_pr_diff(pr_number)
    if not diff_text:
        return []

    patch = PatchSet(io.StringIO(diff_text))

    # Collect review comments, grouped by the file and line they are attached to.
    comments_by_file_and_line = {}
    for review in get_reviews(pr_number):
        for c in get_review_comments(pr_number, review['id']):
            file_path = c.get('path')
            diff_line_number = c.get('line') or c.get('position')
            body = c.get('body', "")
            if file_path and diff_line_number is not None:
                comments_by_file_and_line.setdefault((file_path, diff_line_number), []).append(body)

    # Without comments no hunk can qualify, so skip the per-file work entirely.
    if not comments_by_file_and_line:
        return []

    results = []
    for patched_file in patch:
        source = patched_file.source_file
        target = patched_file.target_file
        old_file_path = source[2:] if source.startswith("a/") else source
        new_file_path = target[2:] if target.startswith("b/") else target

        # Fetched lazily: most files in a diff have no commented hunk at all.
        old_file_content = None

        for hunk in patched_file:
            all_comments = []
            for line in hunk:
                # Removed lines have target_line_no None and therefore never match.
                key = (new_file_path, line.target_line_no)
                all_comments.extend(comments_by_file_and_line.get(key, ()))

            if not all_comments:
                continue

            if old_file_content is None:
                if source == "/dev/null":
                    # The file is created by this PR; there is no previous version to fetch.
                    old_file_content = ""
                else:
                    old_file_content = get_file_content_at_commit(old_file_path, base_sha) or ""

            results.append({
                "oldf": old_file_content,
                "patch": str(hunk),  # includes the @@ ... @@ header and +/- lines
                "msg": "\n".join(all_comments),
                "id": pr_number,
                "y": 1
            })

    return results


def main():
    """Fetch all pull requests, process each one and write the resulting entries as JSON lines."""
    logging.info("Fetching all pull requests (state=all, limit=50) ...")
    pull_requests = get_all_pull_requests(state="all", limit=50)
    logging.info(f"Total PRs fetched: {len(pull_requests)}")

    # Placeholder output name; one JSON object is written per line.
    output_filename = ".jsonl"
    with open(output_filename, "w", encoding="utf-8") as outf:
        for pr in pull_requests:
            logging.info(f"Processing PR #{pr['number']} ...")
            for entry in handle_pr(pr):
                outf.write(json.dumps(entry, ensure_ascii=False) + "\n")

    logging.info(f"Done. Output written to {output_filename}")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


In [None]:
# Translate natural language
import json
from googletrans import Translator

def extract_and_translate_jsonl(
    input_jsonl_path=r"directory",
    translated_txt_path=r"directory"
):
    """
    Translate the "msg" field of each JSONL record to English, one output line per input line.

    The output text file stays line-aligned with the input:
      - blank input lines and records without a usable "msg" produce an empty line
      - lines that are not valid JSON produce a ``<JSON_DECODE_ERROR: ...>`` marker
      - a failed translation produces a ``<TRANSLATION_ERROR: ...>`` marker
        followed by the original message
    Line breaks inside a translated message are replaced by spaces so that
    every record occupies exactly one output line.

    Args:
        input_jsonl_path: path of the JSONL file to read.
        translated_txt_path: path of the text file to write.
    """
    translator = Translator()

    with open(input_jsonl_path, "r", encoding="utf-8") as infile, \
         open(translated_txt_path, "w", encoding="utf-8") as txtfile:

        line_count = 0

        for line in infile:
            line = line.strip()
            if not line:
                txtfile.write("\n")
                continue

            line_count += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                txtfile.write(f"<JSON_DECODE_ERROR: {str(e)}>\n")
                continue

            # A JSON line that is not an object has no "msg"; treat it like a missing one.
            msg = record.get("msg", "") if isinstance(record, dict) else ""

            if not msg:
                txtfile.write("\n")
                continue

            try:
                translation = translator.translate(msg, dest='en')
                if not translation or translation.text is None:
                    english_msg = msg
                else:
                    english_msg = translation.text
            except Exception as e:
                # Deliberately best-effort: keep going and leave a visible marker in the output.
                english_msg = f"<TRANSLATION_ERROR: {str(e)}> Original: {msg}"

            # Replace every kind of line break (not only "\n": also "\r\n", "\r",
            # U+2028, ...) so that a message can never spill over several output lines.
            english_msg = " ".join(english_msg.splitlines())

            txtfile.write(english_msg + "\n")

        print(f"Translation complete! Processed {line_count} JSON lines.")
        print("Check and edit the output in", translated_txt_path)

# Script entry point. Both paths are blank placeholders and must be filled in before running.
if __name__ == "__main__":
    extract_and_translate_jsonl(
        input_jsonl_path=r"",
        translated_txt_path=r""
    )


In [None]:
# script that compares if training data and test data might have the same objects
import json
import os

def build_identifier(obj, fields):
    """
    Build a tuple identifying ``obj`` from the values of ``fields``, in order.

    Returns None when a field is missing or when ``obj`` cannot be indexed by
    field name (for example a JSON list or number), so callers can skip it.
    """
    try:
        return tuple(obj[field] for field in fields)
    except (KeyError, TypeError):
        return None

def load_identifiers(file_path, fields):
    """
    Read a JSONL file and return the set of identifiers built from ``fields``.

    Blank lines are ignored, undecodable lines are reported on stdout, and
    objects for which no (non-empty) identifier can be built are skipped.
    """
    seen = set()
    with open(file_path, 'r', encoding='utf-8') as handle:
        for line_number, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    key = build_identifier(json.loads(text), fields)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON on line {line_number} in {file_path}: {e}")
                else:
                    if key:
                        seen.add(key)
    return seen

def find_duplicates(training_file, testing_file, output_file, fields):
    """
    Write every test record whose identifier also occurs in the training data.

    Args:
        training_file: JSONL file whose identifiers form the reference set.
        testing_file: JSONL file that is checked line by line against that set.
        output_file: JSONL file receiving the duplicated test lines; its parent
            directory is created when it does not exist yet.
        fields: names of the fields that together identify a record.

    Returns:
        The number of duplicated test records written to ``output_file``.
    """
    print("Loading training data identifiers...")
    training_identifiers = load_identifiers(training_file, fields)
    print(f"Total unique identifiers in training data: {len(training_identifiers)}")

    duplicates_count = 0

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("Comparing testing data against training data identifiers...")
    with open(testing_file, 'r', encoding='utf-8') as test_file, \
         open(output_file, 'w', encoding='utf-8') as dup_file:

        for line_number, line in enumerate(test_file, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number} in {testing_file}: {e}")
                continue

            identifier = build_identifier(obj, fields)
            if identifier and identifier in training_identifiers:
                dup_file.write(line + '\n')
                duplicates_count += 1

    return duplicates_count

def main():
    """Compare the test set against the training set and report how many records overlap."""
    # Blank placeholders; fill in before running.
    training_file_path = r""
    testing_file_path = r""
    duplicates_output_path = r""

    # Two records count as the same object when all of these fields match.
    compare_fields = ["oldf", "patch", "msg", "id"]

    duplicates_found = find_duplicates(
        training_file=training_file_path,
        testing_file=testing_file_path,
        output_file=duplicates_output_path,
        fields=compare_fields,
    )

    for message in (
        "Duplicate comparison completed.",
        f"Total duplicates found: {duplicates_found}",
        f"Duplicates have been written to: {duplicates_output_path}",
    ):
        print(message)


# Run the comparison only when executed as a script.
if __name__ == "__main__":
    main()


In [None]:
# Script to prepare the alpaca data format
import json
from tqdm import tqdm

# Input JSONL (records with 'patch' and 'msg') and output JSONL (Alpaca records);
# both are blank placeholders to fill in before running.
input_file_path = r''
output_file_path = r''

# Instruction placed verbatim in every converted record.
instruction_text = (
    "You are a powerful code reviewer model for the c# programming language. Your job is to suggest "
    "review comments in natural language. You are given a context "
    "regarding a diff hunk (code change) in programming language. "
    "You must output appropriate, contextual review comment for that code change."
)

# Convert line by line; lines that are not valid JSON are reported and skipped.
with open(input_file_path, 'r', encoding='utf-8') as infile, \
     open(output_file_path, 'w', encoding='utf-8') as outfile:

    for line in tqdm(infile, desc="Converting JSONL to Alpaca Format"):
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {e}")
        else:
            patch = data.get('patch', '').strip()
            msg = data.get('msg', '').strip()
            alpaca_record = {
                "instruction": instruction_text,
                "input": f"Diff Hunk:\n{patch}",
                "output": msg
            }
            outfile.write(json.dumps(alpaca_record, ensure_ascii=False) + '\n')

print(f"Conversion completed. Output saved to '{output_file_path}'.")


In [None]:
# Add two jsonl files into one and shuffle them
import json
import random
import os

def merge_and_shuffle_jsonl(
    input_file1,
    input_file2,
    output_file,
    seed=None
):
    """
    Concatenate the lines of two JSONL files, shuffle them and write the result.

    Args:
        input_file1: path of the first input file.
        input_file2: path of the second input file.
        output_file: path of the shuffled output file (overwritten).
        seed: optional seed for the module-level ``random`` generator, making
            the shuffle reproducible.

    Raises:
        FileNotFoundError: if either input file does not exist.
    """
    if seed is not None:
        random.seed(seed)
        print(f"Random seed set to {seed}.")

    for file_path in (input_file1, input_file2):
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"Input file '{file_path}' does not exist.")

    combined_lines = []
    for file_path in (input_file1, input_file2):
        print(f"Reading lines from '{file_path}'...")
        with open(file_path, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()
        print(f"Number of lines in '{file_path}': {len(lines)}")

        # Only the last line of a file can lack its newline. Once shuffled into
        # the middle it would be glued to the next record, so terminate it here.
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'
        combined_lines.extend(lines)

    print(f"Total lines after merging: {len(combined_lines)}")

    print("Shuffling the combined lines...")
    random.shuffle(combined_lines)

    print(f"Writing shuffled lines to '{output_file}'...")
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(combined_lines)

    print("Merging and shuffling completed successfully!")

def validate_jsonl(file_path, sample_size=5):
    """
    Check that the first ``sample_size`` lines of a file are valid JSON.

    Only a prefix of the file is inspected; later lines are not validated.

    Returns:
        False as soon as an invalid line is found (its number is printed),
        True otherwise. The success message reports how many lines were
        actually checked, which is fewer than ``sample_size`` for short files.
    """
    import json

    print(f"Validating JSON objects in '{file_path}'...")
    checked = 0
    with open(file_path, 'r', encoding='utf-8') as infile:
        for line_number, line in enumerate(infile, start=1):
            if checked >= sample_size:
                break
            try:
                json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Invalid JSON on line {line_number}: {e}")
                return False
            checked += 1
    print(f"First {checked} lines are valid JSON objects.")
    return True

# Script entry point: merge and shuffle two JSONL files, then spot-check the result.
if __name__ == "__main__":
    # Blank placeholders; fill in the two inputs and the output path before running.
    INPUT_FILE1 = r"" 
    INPUT_FILE2 = r""
    OUTPUT_FILE = r""

    SEED = 42  # example

    merge_and_shuffle_jsonl(
        input_file1=INPUT_FILE1,
        input_file2=INPUT_FILE2,
        output_file=OUTPUT_FILE,
        seed=SEED
    )

    # Only the first 10 lines of the output are parsed as a sanity check.
    is_valid = validate_jsonl(OUTPUT_FILE, sample_size=10)
    if is_valid:
        print(f"Validation passed for '{OUTPUT_FILE}'.")
    else:
        print(f"Validation failed for '{OUTPUT_FILE}'. Please check the file for errors.")


In [None]:
# Splits a file's lines into 4 parts for the cls task in CodeReviewer
import random
import os

def split_random_lines(
    input_file_path,
    output_file_paths,
    total_selected_lines,
    split_counts,
    seed=None
):
    """
    Pick random lines from a file and distribute them over several output files.

    ``total_selected_lines`` lines are sampled without replacement, then the
    first ``split_counts[0]`` of them go to ``output_file_paths[0]``, the next
    ``split_counts[1]`` to ``output_file_paths[1]``, and so on.

    Args:
        input_file_path: path of the file to sample from.
        output_file_paths: one output path per split (overwritten).
        total_selected_lines: how many lines to sample in total.
        split_counts: how many of the sampled lines each output file receives.
        seed: optional seed for the module-level ``random`` generator.

    Raises:
        ValueError: if the number of output files and split counts differ, if
            the split counts add up to more than ``total_selected_lines``, or
            if more lines are requested than the input file holds.
        FileNotFoundError: if the input file does not exist.
    """
    if len(output_file_paths) != len(split_counts):
        raise ValueError("The number of output files must match the number of split counts.")

    # Asking for more lines than were selected would silently produce short
    # (or empty) trailing files, so reject it up front.
    requested_total = sum(split_counts)
    if requested_total > total_selected_lines:
        raise ValueError(
            f"Split counts add up to {requested_total}, but only "
            f"{total_selected_lines} lines are selected."
        )

    if seed is not None:
        random.seed(seed)
    if not os.path.isfile(input_file_path):
        raise FileNotFoundError(f"Input file '{input_file_path}' does not exist.")

    print(f"Reading lines from '{input_file_path}'...")
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        all_lines = infile.readlines()

    # Only the last line can lack its newline. Terminate it so it cannot be
    # glued to the following line when it lands mid-file in an output.
    if all_lines and not all_lines[-1].endswith('\n'):
        all_lines[-1] += '\n'

    total_available = len(all_lines)
    print(f"Total lines available in input file: {total_available}")

    if total_selected_lines > total_available:
        raise ValueError(
            f"Requested {total_selected_lines} lines, but only {total_available} are available."
        )

    print(f"Selecting {total_selected_lines} random lines...")
    selected_lines = random.sample(all_lines, total_selected_lines)

    print("Splitting selected lines into output files...")
    current_index = 0
    for output_path, count in zip(output_file_paths, split_counts):
        subset = selected_lines[current_index:current_index + count]
        print(f"Writing {count} lines to '{output_path}'...")
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(subset)
        current_index += count

    print("Splitting completed successfully!")

# Script entry point: sample lines from INPUT_FILE and spread them over the output files.
if __name__ == "__main__":
    # Configuration
    INPUT_FILE = r""

    # Must list one path per entry of SPLIT_COUNTS; while it is empty the call
    # below raises ValueError because the two lengths differ.
    OUTPUT_FILES = [ # Set 4 dir files
    ]

    TOTAL_SELECTED_LINES = 19111

    # Lines per output file; these example values add up to TOTAL_SELECTED_LINES.
    SPLIT_COUNTS = [4778, 4778, 4777, 4778] # example

    SEED = 42  # example

    split_random_lines(
        input_file_path=INPUT_FILE,
        output_file_paths=OUTPUT_FILES,
        total_selected_lines=TOTAL_SELECTED_LINES,
        split_counts=SPLIT_COUNTS,
        seed=SEED
    )


In [None]:
# This script counts the JSON objects in a JSONL file whose "y" field equals 1
import json

def count_y_zero(file_path, target=1):
    """
    Count the JSON objects in a JSONL file whose "y" field equals ``target``.

    Despite its name, this function has always counted ``"y": 1`` records.
    The name is kept for backward compatibility; pass ``target=0`` to count
    the ``"y": 0`` records instead.

    Blank lines are ignored, lines that are not valid JSON are reported on
    stdout, and JSON values that are not objects are never counted.

    Args:
        file_path: path of the JSONL file to scan.
        target: value the "y" field must equal for a record to be counted.

    Returns:
        The number of matching records.
    """
    count = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
                continue
            # A JSON list/number/string has no .get(); skip it instead of crashing.
            if isinstance(obj, dict) and obj.get('y') == target:
                count += 1
    return count

# Script entry point: report how many records of the file have "y" equal to 1.
if __name__ == "__main__":
    # Blank placeholder; fill in the JSONL path before running.
    file_path = r""   
    total = count_y_zero(file_path)
    print(f"Total objects with \"y\": 1 -> {total}")


In [None]:
# prepare the jsonl into alpaca format for codeLlama
import json

# Read every record first, converting each {"patch", "msg"} object into an
# Alpaca-style {"instruction", "input", "output"} object.
new_data = []
with open("original_data.jsonl", "r", encoding="utf-8") as f:
    for old_obj in map(json.loads, f):
        new_data.append({
            "instruction": (
                "You are a powerful c# code reviewer model. Your job is to suggest "
                "a review comment in natural language. You are given a context "
                "regarding a diff hunk or code change in programming language. "
                "You must output appropriate, contextual review comment for that code change."
            ),
            "input": f"Diff Hunk:\n{old_obj['patch']}",
            "output": old_obj["msg"]
        })

# Then write the converted records, one JSON object per line.
with open("converted_alpaca_format.jsonl", "w", encoding="utf-8") as fout:
    fout.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in new_data)


In [None]:
import chardet

# Blank placeholder; fill in the path of the file whose encoding should be guessed.
input_file = r''

# The whole file is read as raw bytes and handed to chardet in one go.
with open(input_file, 'rb') as f:
    raw_bytes = f.read()
    result = chardet.detect(raw_bytes)
    encoding_name = result['encoding']
    print(f"Detected Encoding: {encoding_name}")


In [None]:
# Remove BOMs from a field of the JSON objects in a JSONL file
import json
import sys
import os
import unicodedata

def clean_string(s):
    """Strip any leading BOM characters (announcing it on stdout) and return the NFC-normalized string."""
    bom = '\ufeff'
    has_leading_bom = s.startswith(bom)
    if has_leading_bom:
        print("Found BOM at the start of the string. Removing it.")
    # lstrip removes every consecutive BOM at the start, not just the first one.
    without_bom = s.lstrip(bom) if has_leading_bom else s
    return unicodedata.normalize('NFC', without_bom)

def remove_boms_and_validate_utf8(input_file, output_file, field_to_process):
    """
    Copy a JSONL file while cleaning one string field of every record.

    Each non-blank line is parsed as JSON. If the record is an object that has
    ``field_to_process`` and the value is a string, the value is passed
    through ``clean_string``. Every parsed record is then written to
    ``output_file`` as one JSON line. Invalid JSON lines are reported on
    stderr and skipped.

    The input is decoded as UTF-8. A BOM at the very start of the file is
    dropped so that the first record still parses. Input that is not valid
    UTF-8 is reported on stderr and stops processing; lines written up to
    that point remain in ``output_file``.

    Args:
        input_file: path of the JSONL file to read.
        output_file: path of the cleaned JSONL file to write (overwritten).
        field_to_process: name of the field whose string value is cleaned.
    """
    try:
        if not os.path.exists(input_file):
            print(f"Error: The input file '{input_file}' does not exist.", file=sys.stderr)
            return

        # 'utf-8-sig' decodes like UTF-8 but skips a leading BOM; with plain
        # 'utf-8' the BOM stays glued to the first line and json.loads rejects it.
        with open(input_file, 'r', encoding='utf-8-sig') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:

            issues_found = 0
            for line_num, line in enumerate(infile, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    json_obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Warning: Skipping invalid JSON on line {line_num}: {e}", file=sys.stderr)
                    issues_found += 1
                    continue

                # Only JSON objects can carry the field; other values are copied unchanged.
                if isinstance(json_obj, dict) and field_to_process in json_obj:
                    original_value = json_obj[field_to_process]
                    if not isinstance(original_value, str):
                        print(f"Warning: The field '{field_to_process}' on line {line_num} is not a string.", file=sys.stderr)
                        issues_found += 1
                    else:
                        cleaned_value = clean_string(original_value)
                        if cleaned_value != original_value:
                            print(f"Info: Cleaned the field '{field_to_process}' on line {line_num}.", file=sys.stderr)
                            issues_found += 1
                        json_obj[field_to_process] = cleaned_value

                try:
                    json_line = json.dumps(json_obj, ensure_ascii=False)
                    outfile.write(json_line + '\n')
                except (TypeError, OverflowError) as e:
                    print(f"Warning: Could not serialize JSON object on line {line_num}: {e}", file=sys.stderr)
                    issues_found += 1
                    continue

            print(f"Processing complete. Modified JSONL written to '{output_file}'.")
            if issues_found > 0:
                print(f"Note: {issues_found} issues were found during processing. Check the warnings above for details.", file=sys.stderr)

    except UnicodeDecodeError as e:
        # Raised while reading when the input contains bytes that are not valid UTF-8.
        print(f"Error: The input file '{input_file}' is not valid UTF-8: {e}", file=sys.stderr)
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
    except IOError as e:
        print(f"I/O error: {e}", file=sys.stderr)


# Paths of the JSONL file to clean and of the cleaned copy to write.
# Blank placeholders (like the other cells); fill in before running. With an
# empty input path the call below just reports that the file does not exist.
input_file_path = r''
output_file_path = r''

field_to_process = 'oldf' # field that we need to clean

remove_boms_and_validate_utf8(input_file_path, output_file_path, field_to_process)
