<div class="alert alert-info">
    <center><b>Set up Notebook</b></center>
</div>

In [None]:
%pip install os tqdm psycopg2 dotenv re

In [2]:
import json
import os
import re
from typing import Any, Dict, List

import psycopg2
from dotenv import load_dotenv
from tqdm import tqdm

In [3]:
# Load variables from a local .env file (if any) into the process environment,
# then read the database password from it (None when the variable is absent).
load_dotenv()
DB_PASSWORD = os.environ.get('DB_PASSWORD')

<div class="alert alert-info">
    <center><b>Define the helper functions used in this notebook</b></center>
</div>

In [4]:
def db_conn(
    db: str,
    password: str,
    user: str,
    host: str = 'localhost',
    port: str = '5432',
) -> psycopg2.extensions.connection:
    """Connects to the specified database.

    Args:
        db (str) - The name of the database to connect to.
        password (str) - The password for the user to connect with.
        user (str) - The user to connect with.
        host (str) - The server host name. Defaults to 'localhost'.
        port (str) - The server port. Defaults to '5432'.

    Returns:
        psycopg2.extensions.connection: The connection object. It is returned
        open; the caller is responsible for closing it.

    Raises:
        Any exception raised by psycopg2.connect is propagated unchanged.
    """
    return psycopg2.connect(
        database=db,
        user=user,
        host=host,
        password=password,
        port=port,
    )

In [5]:
def add_file(file_name: str, sha: str, diff: str, conn: psycopg2.extensions.connection) -> None:
    """Adds a file version to the database.

    Inserts one row into the ``files`` table (columns name, sha, type, diff) and
    commits it. The ``type`` column receives the text after the last '.' of the
    file's base name (the whole base name when it contains no '.').

    Parameters:
        file_name (str) - the name (path, '/'-separated) of the file
        sha (str) - the sha of the commit
        diff (str) - the diff of the file
        conn (psycopg2.extensions.connection) - the connection to the database;
            it is left open so it can be reused for further inserts

    Returns:
        None

    Raises:
        Any exception raised by the insert or the commit is re-raised after the
        transaction has been rolled back.
    """
    # Look at the last path component only, so a '.' in a directory name
    # (e.g. "pkg.v2/README") does not leak path fragments into the type.
    base_name = file_name.rsplit('/', 1)[-1]
    file_type = base_name.rsplit('.', 1)[-1]

    try:
        # The with-block closes the cursor even when execute() raises.
        with conn.cursor() as cursor:
            cursor.execute(
                "INSERT INTO files (name, sha, type, diff) VALUES (%s, %s, %s, %s);",
                (file_name, sha, file_type, diff),
            )
        conn.commit()
    except Exception:
        # Roll back so the shared connection is not left in an aborted
        # transaction that would make every later statement fail.
        conn.rollback()
        raise

In [6]:
def get_altered_files(diff_text: str) -> List[str]:
    """Extracts the names of files altered in a git diff text.

    Only "diff --git" header lines whose a/ and b/ paths are identical are
    reported, so renamed or copied files (different paths on each side) are
    not included.

    Parameters:
        diff_text (str) - The git diff text.

    Returns:
        List[str] - A list of file names that were altered in the commit, in
        the order their headers appear in the diff.
    """
    # \1 forces the b/ path to repeat the a/ path. The trailing $ makes the
    # repeat cover the whole rest of the line; without it "a/foo b/foo.bak"
    # would report "foo", and a path containing " b/" would be cut short.
    pattern = r"^diff --git a/(.+?) b/\1$"

    return re.findall(pattern, diff_text, re.MULTILINE)

In [None]:
def parse_hunks(diff_text: str) -> List[Dict[str, Any]]:
    """Parses the hunks in a diff text into structured data.

    Parameters:
        diff_text (str): The diff text for a specific file.

    Returns:
        List[Dict[str, Any]]: A list of hunks, each represented as a dictionary
                              with 'header' (the "@@ ... @@" line, str) and
                              'lines' (the hunk's lines, List[str]) keys. The
                              list is empty when the text contains no hunk header.
    """
    # The capture group makes re.split keep the headers, so the result
    # alternates [preamble, header, body, header, body, ...].
    hunk_pattern = r"(^@@ .+? @@.*$)"  # Match hunk headers
    sections = re.split(hunk_pattern, diff_text, flags=re.MULTILINE)

    hunks = []
    for i in range(1, len(sections), 2):
        header = sections[i]
        body = sections[i + 1] if i + 1 < len(sections) else ""
        lines = body.splitlines()
        # The body starts with the newline that ended the header line; drop
        # only that empty first entry. A blanket strip() would also remove the
        # leading ' ' marker of the first context line.
        if lines and lines[0] == "":
            lines = lines[1:]
        hunks.append({"header": header, "lines": lines})

    return hunks

In [7]:
def get_diffs_by_file(diff_text: str) -> Dict[str, Dict[str, Any]]:
    """Extracts the diff content for each file in the git diff text, including types and hunks.

    Parameters:
        diff_text (str): The git diff text.

    Returns:
        Dict[str, Dict[str, Any]]: A dictionary where keys are file names (the a/ side
                                   path, i.e. the old name for a rename) and values are
                                   dictionaries containing:
                                   'type'   - one of "deleted", "created", "copied",
                                              "permissions changed", "modified", "renamed";
                                   'a_diff' - the text between the first "--- " marker and
                                              the following "+++ " marker ("" if absent);
                                   'b_diff' - everything after the first "+++ " marker
                                              ("" if absent);
                                   'hunks'  - the result of parse_hunks on the file's
                                              section, or [] when it has no "@@".
                                   If the same a/ path appears more than once, the last
                                   section wins.
    """
    # The capture group makes re.split keep every "diff --git" line, so the
    # result alternates [preamble, header, body, header, body, ...].
    split_pattern = r"(^diff --git a/.+? b/.+?$)"
    sections = re.split(split_pattern, diff_text, flags=re.MULTILINE)

    diffs_by_file: Dict[str, Dict[str, Any]] = {}
    for i in range(1, len(sections), 2):
        header = sections[i]
        body = sections[i + 1] if i + 1 < len(sections) else ""

        match = re.search(r"^diff --git a/(.+?) b/(.+?)$", header, re.MULTILINE)
        if not match:
            continue
        file_a = match.group(1)
        file_b = match.group(2)

        # Only the lines before the first "--- " file marker or "@@ " hunk
        # header describe the kind of change. Searching the whole body would
        # misclassify a file whose *content* mentions e.g. "deleted file mode".
        meta = re.split(r"^(?:--- |@@ )", body, maxsplit=1, flags=re.MULTILINE)[0]

        if "deleted file mode" in meta:
            file_type = "deleted"
        elif "new file mode" in meta:
            file_type = "created"
        elif "similarity index" in meta and "copy from" in meta:
            file_type = "copied"
        elif "old mode" in meta and "new mode" in meta:
            file_type = "permissions changed"
        else:
            file_type = "modified" if file_a == file_b else "renamed"

        # Split on the first marker only, so content lines that happen to
        # contain "--- " or "+++ " cannot truncate the extracted text.
        a_diff = ""
        b_diff = ""
        if '+++ ' in body:
            b_diff = body.split('+++ ', 1)[1]
            if '--- ' in body:
                a_diff = body.split('--- ', 1)[1].split('+++ ', 1)[0]

        # Storing file A as the key
        diffs_by_file[file_a] = {
            "type": file_type,
            "a_diff": a_diff,
            "b_diff": b_diff,
            "hunks": parse_hunks(body) if "@@" in body else [],
        }

    return diffs_by_file

<div class="alert alert-info">
    <center><b>Fetch all commits from the database; their diffs are used below to add the file versions they contain</b></center>
</div>

In [8]:
# Use the password loaded from the environment / .env instead of hard-coding a
# credential in the notebook; fail early with a clear message when it is missing.
if not DB_PASSWORD:
    raise RuntimeError("DB_PASSWORD is not set; define it in the environment or in a .env file")

# This connection stays open: the insert loop further down reuses and closes it.
conn = db_conn('code_samples', DB_PASSWORD, 'codesamples_user')

# The with-block closes the cursor even if the query fails.
with conn.cursor() as cursor:
    cursor.execute("SELECT sha, diff FROM commits;")
    # Each row is a (sha, diff) tuple.
    commits = cursor.fetchall()

<div class="alert alert-info">
    <center><b>Add the file versions found in every commit to the database</b></center>
</div>

In [9]:
# Walk every (sha, diff) row fetched above and store one `files` row per file
# touched by the commit. The connection is closed even if an insert fails.
try:
    for sha, commit_diff in tqdm(commits, desc="Adding files to database"):
        for file, file_diff in get_diffs_by_file(commit_diff).items():
            # get_diffs_by_file returns a dict per file, but add_file expects a
            # string and psycopg2 cannot adapt a plain dict, so serialise it.
            # NOTE(review): assumes the files.diff column accepts JSON text - confirm schema.
            add_file(file, sha, json.dumps(file_diff), conn)
finally:
    conn.close()

Adding files to database: 100%|██████████| 30988/30988 [04:39<00:00, 110.86it/s]
