In [None]:
%pip install GitPython pandas tqdm pathlib psycopg2

In [None]:
import gc
import itertools
import os
import pickle
import shutil
import subprocess
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime
from typing import List

import pandas as pd
import psutil
import psycopg2
import pytz
from git import Repo
from tqdm import tqdm

In [None]:
def db_conn():
    '''Open a new psycopg2 connection to the code-samples PostgreSQL database.

    Every setting can be overridden with the standard libpq environment
    variable of the same meaning (PGDATABASE, PGUSER, PGHOST, PGPASSWORD,
    PGPORT). When a variable is unset, the local development value that used
    to be hard-coded here is used, so existing setups keep working.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.'''
    # NOTE: the fallback password is a local development default only; set
    # PGPASSWORD instead of editing this file for any shared database.
    return psycopg2.connect(
        database=os.environ.get('PGDATABASE', 'code_samples'),
        user=os.environ.get('PGUSER', 'postgres'),
        host=os.environ.get('PGHOST', 'localhost'),
        password=os.environ.get('PGPASSWORD', 'codesamples'),
        port=os.environ.get('PGPORT', '5432'),
    )

In [None]:
def clone(gitUrl: str, repoDir: str, sample: str) -> None:
    '''Clone a git repository and check out all files in the repository

    The repository is cloned with ``--no-checkout``; the working tree is then
    populated with ``git reset --hard HEAD`` followed by ``git checkout -- .``.
    A checkout failure is reported with a message and does not raise.

    Args:
    gitUrl (str): URL of the git repository
    repoDir (str): Directory to clone the repository to
    sample (str): Name of the sample, used as the sub-path below repoDir

    Returns:
        None

    Raises:
        Exception: whatever ``Repo.clone_from`` raises. Before re-raising, the
            target directory is removed again if this call created it (or
            found it empty), so that a failed clone does not leave behind a
            directory that looks like a finished download.'''
    repo_path = os.path.join(repoDir, sample)
    # Only a directory that was missing or empty beforehand may be cleaned up;
    # pre-existing content must never be deleted by a failed clone.
    safe_to_remove = not os.path.isdir(repo_path) or not os.listdir(repo_path)
    os.makedirs(repo_path, exist_ok=True)

    try:
        repo = Repo.clone_from(gitUrl, repo_path, multi_options=["--no-checkout"])
    except Exception:
        if safe_to_remove:
            shutil.rmtree(repo_path, ignore_errors=True)
        raise

    try:
        repo.git.reset('--hard', 'HEAD')  # Reset the index and working tree to HEAD
        repo.git.checkout('--', '.')  # Restore every tracked file into the working tree
    except Exception as e:
        print(f"Error checking out files for {sample}: {e}")
    finally:
        # Release the git helper processes/handles held by the Repo object.
        repo.close()

In [None]:
def download(sample: str) -> None:
    '''Clone a GitHub repository unless its directory already exists

    A sample counts as already downloaded when ``repositories/<sample>`` is an
    existing directory; in that case nothing is done.

    Args:
    sample (str): Name of the sample in ``owner/name`` form

    Returns:
        None'''
    repoDir = "repositories/"
    already_present = os.path.isdir(f"{repoDir}{sample}")
    if not already_present:
        clone(f"https://github.com/{sample}.git", repoDir, sample)

In [None]:
@dataclass
class RawData:
    '''One commit record: where it came from, when, and what it changed.

    ``str()`` and ``repr()`` both render the sha, message, timestamp and diff
    on separate lines, each introduced by a dash.'''
    full_path: str       # path of the repository the commit was read from
    timestamp: datetime  # commit time
    sha: str             # commit hash
    message: str         # commit message
    diff: str            # patch text of the commit

    def __str__(self):
        remaining_fields = (self.message, self.timestamp, self.diff)
        rendered_tail = "".join(f"\n- {value}" for value in remaining_fields)
        return f"-{self.sha}" + rendered_tail

    # repr() intentionally mirrors str(); defining it here keeps @dataclass
    # from generating its own field-by-field repr.
    __repr__ = __str__

In [None]:
def save_data_in_batches(batch_data: List[RawData], batch_index: int):
    '''Pickle one batch of commit records into the current working directory

    Args:
    batch_data (List[RawData]): Records to store
    batch_index (int): Number used in the file name ``raw_data_batch_<index>.pkl``;
        an existing file with the same index is overwritten

    Returns:
        None'''
    batch_file_name = f'raw_data_batch_{batch_index}.pkl'
    with open(batch_file_name, 'wb') as batch_file:
        pickle.dump(batch_data, batch_file)

In [None]:
def check_memory():
    '''Return the resident set size (RSS) of the current process in MiB

    Returns:
        float: ``rss`` as reported by psutil, divided by 1024 twice'''
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1024 / 1024

In [None]:
def free_memory():
    '''Run a full garbage collection pass

    Returns:
        None'''
    # Generation 2 is the default for gc.collect(), i.e. a full collection.
    gc.collect(generation=2)

In [None]:
# Create the raw_data table if it does not exist yet. The cursor and the
# connection are released in `finally` blocks so that a failing statement
# does not leave a database session open in the kernel.
conn = db_conn()
try:
    cursor = conn.cursor()
    try:
        cursor.execute("""CREATE TABLE IF NOT EXISTS raw_data (
     full_path TEXT,
     timestamp TIMESTAMP,
     sha TEXT,
     message TEXT,
     diff TEXT
     );""")
        conn.commit()
    finally:
        cursor.close()
finally:
    conn.close()

In [None]:
def add_commit(commit: RawData):
    '''Insert one commit record into the raw_data table

    A new connection is opened for the insert and is always closed again,
    also when the statement or the commit fails. Closing without a commit
    discards the open transaction, so a failed insert leaves no partial row.

    Args:
    commit (RawData): Record whose full_path, timestamp, sha, message and
        diff are written as one row

    Returns:
        None'''
    conn = db_conn()
    try:
        cursor = conn.cursor()
        try:
            # Values are passed separately from the SQL text so the driver
            # handles quoting of commit messages and diffs.
            cursor.execute(
                """INSERT INTO raw_data (full_path, timestamp, sha, message, diff) VALUES (%s, %s, %s, %s, %s)""",
                (commit.full_path, commit.timestamp, commit.sha, commit.message, commit.diff),
            )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

In [None]:
# git expands %x1e / %x1f to the ASCII record / unit separator bytes. A header
# line is recognised by its leading record separator; patch lines never start
# with that byte because every diff line begins with its own marker.
_GIT_RECORD_SEP = '\x1e'
_GIT_FIELD_SEP = '\x1f'
_GIT_LOG_FORMAT = '--pretty=format:%x1e%H%x1f%ct%x1f%s'

# Batch files are named by index only, and several repositories are processed
# concurrently, so indices are drawn from one shared counter. They are unique
# for the lifetime of the kernel; re-defining this cell restarts them at 0.
_batch_indices = itertools.count()
_batch_indices_lock = threading.Lock()


def _next_batch_index() -> int:
    '''Return the next unused batch-file index (safe to call from any thread).'''
    with _batch_indices_lock:
        return next(_batch_indices)


def _iter_commit_records(stream):
    '''Yield ``(header_fields, patch_text)`` for each commit in a git-log byte stream.'''
    header_fields = None
    patch_lines = []
    for raw_line in stream:
        line = raw_line.decode('utf-8', errors='replace')
        if line.startswith(_GIT_RECORD_SEP):
            if header_fields is not None:
                yield header_fields, ''.join(patch_lines).strip('\n')
            # maxsplit=2 keeps a subject that itself contains the separator intact.
            header_fields = line[1:].rstrip('\r\n').split(_GIT_FIELD_SEP, 2)
            patch_lines = []
        elif header_fields is not None:
            patch_lines.append(line)
    if header_fields is not None:
        yield header_fields, ''.join(patch_lines).strip('\n')


def get_raw_data(repo_path: str, cutoff_date: datetime, batch_size: int = 100) -> int:
    '''Read the commit history of one repository and pickle it in batches

    Runs ``git log --patch`` up to ``cutoff_date``, turns every commit into a
    RawData record and hands the records to ``save_data_in_batches`` in groups
    of ``batch_size``. The log output is consumed line by line, so only the
    current batch is held in memory.

    Args:
    repo_path (str): Path of a checked-out repository (must contain ``.git``)
    cutoff_date (datetime): Only commits up to this moment are read
    batch_size (int): Number of commits per pickle file

    Returns:
        int: Number of commits that were stored. 0 for a non-Git directory or
        a repository without commits.'''
    if not os.path.exists(os.path.join(repo_path, '.git')):
        print(f"Skipping non-Git directory: {repo_path}")
        return 0

    commit_count = 0
    batch_data = []

    try:
        # Checks if the repo has any commits
        subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path)

        log_command = [
            "git", "log", _GIT_LOG_FORMAT, "--patch",
            # whole epoch seconds
            f"--until={int(cutoff_date.timestamp())}",
        ]
        # stderr is discarded instead of piped: an unread pipe can fill up
        # and block git while this loop is still waiting on stdout.
        with subprocess.Popen(
            log_command,
            cwd=repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        ) as process:
            for header_fields, diff in _iter_commit_records(process.stdout):
                try:
                    sha, timestamp, message = header_fields
                    commit_datetime = datetime.fromtimestamp(int(timestamp), tz=pytz.utc)
                except ValueError:
                    print(f"Skipping malformed entry: {header_fields}")
                    continue

                batch_data.append(RawData(
                    full_path=repo_path,
                    # Stored as an ISO-8601 string, as before.
                    timestamp=commit_datetime.isoformat(),
                    sha=sha,
                    message=message,
                    diff=diff,
                ))
                commit_count += 1

                if len(batch_data) >= batch_size:
                    save_data_in_batches(batch_data, _next_batch_index())
                    batch_data = []  # Reset batch

            exit_code = process.wait()

        # Save any remaining data in the last batch
        if batch_data:
            save_data_in_batches(batch_data, _next_batch_index())

        if exit_code != 0:
            print(f"Error processing {repo_path}: git log exited with status {exit_code}")

    except subprocess.CalledProcessError as e:
        print(f"Error processing {repo_path}: {e}")
    return commit_count

In [None]:
def get_all_repos_raw_data(parent_folder: str, cutoff_date: datetime = None, max_workers: int = 7) -> None:
    """
    Processes all repositories in a parent folder and gathers RawData for each commit.

    Repositories are expected two levels below ``parent_folder``
    (``<parent>/<owner>/<repo>``); only directories containing ``.git`` are
    processed. Each repository is handled by ``get_raw_data`` in a worker
    thread. A repository whose processing raises is reported and skipped, so
    one failure does not discard the rest of the run. The total of the values
    returned by ``get_raw_data`` is printed at the end.

    Args:
    parent_folder (str): The path to the folder containing all repositories.
    cutoff_date (datetime): Passed to ``get_raw_data`` for every repository.
        Defaults to 2024-09-19 00:00 UTC.
    max_workers (int): Number of worker threads. Defaults to 7.

    Returns:
    None
    """
    if cutoff_date is None:
        cutoff_date = datetime(2024, 9, 19, tzinfo=pytz.UTC)

    repo_paths = []

    for sub_dir in os.listdir(parent_folder):
        sub_dir_path = os.path.join(parent_folder, sub_dir)
        if os.path.isdir(sub_dir_path):
            for repo_dir in os.listdir(sub_dir_path):
                repo_dir_path = os.path.join(sub_dir_path, repo_dir)
                if os.path.isdir(repo_dir_path) and os.path.exists(os.path.join(repo_dir_path, '.git')):
                    repo_paths.append(repo_dir_path)

    counter = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_repo = {executor.submit(get_raw_data, repo, cutoff_date): repo for repo in repo_paths}
        for future in tqdm(as_completed(future_to_repo), total=len(future_to_repo), desc="Processing Repositories"):
            repo = future_to_repo[future]
            try:
                processed = future.result()
            except Exception as e:
                # future.result() re-raises whatever the worker raised.
                print(f"Error processing {repo}: {e}")
                continue
            # A worker that returns None contributes nothing to the total.
            counter += processed or 0

    free_memory()
    print(counter)

In [None]:
# Load the sample list (the first line of the CSV is skipped) and keep only
# the rows that have a repository URL.
repos = pd.read_csv('../code_samples.csv', skiprows=1).dropna(subset=['html_url'])

In [None]:
# Download every listed repository as <owner>/<name>. A repository that cannot
# be downloaded is reported and skipped so the remaining ones are still fetched.
for html_url, repo_name in tqdm(zip(repos['html_url'], repos['name']), total=len(repos), desc="Downloading Repositories"):
    # The owner is the second-to-last path segment of the URL; a trailing
    # slash would otherwise shift the segments by one.
    repo_ecosystem = html_url.rstrip('/').split('/')[-2]
    sample_name = f"{repo_ecosystem}/{repo_name}"
    try:
        download(sample_name)
    except Exception as e:
        print(f"Error downloading {sample_name}: {e}")

In [None]:
# Process every repository below ./repositories. get_all_repos_raw_data is
# annotated as returning None, so this name does not hold the commit data.
all_raw_data = get_all_repos_raw_data('repositories')

In [None]:
# Spot-check one stored record.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open batch files that were written by this notebook.
with open('raw_data_batch_1.pkl', 'rb') as p:
    batch_data = pickle.load(p)
print(batch_data[18])
    