In [1]:
%pip install GitPython pandas tqdm pathlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\oheit\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
from git import Repo
import os
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import pytz
from dataclasses import dataclass
import pickle
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
def clone(gitUrl: str, repoDir: str, sample: str) -> None:
    '''Clone a repository into ``repoDir/sample`` and populate its working tree.

    The clone is created with ``--no-checkout``; the files are then written
    out by a hard reset to HEAD followed by ``git checkout -- .``. An error
    raised by either of those two steps is printed and not re-raised, while
    an error raised by the clone itself propagates to the caller.

    Args:
    gitUrl (str): URL of the git repository
    repoDir (str): Directory under which the clone is created
    sample (str): Name of the sample, used as the sub-path inside repoDir

    Returns:
        None'''
    target_dir = os.path.join(repoDir, sample)
    os.makedirs(target_dir, exist_ok=True)

    cloned = Repo.clone_from(gitUrl, target_dir, multi_options=["--no-checkout"])

    try:
        git_cmd = cloned.git
        # Reset the index and working tree to HEAD
        git_cmd.reset('--hard', 'HEAD')
        # Restore every path under the repository root from the index
        git_cmd.checkout('--', '.')
    except Exception as e:
        print(f"Error checking out files for {sample}: {e}")

In [4]:
def download(sample: str) -> None:
    '''Clone a GitHub repository unless a local directory for it already exists.

    Args:
    sample (str): Name of the sample in ``owner/name`` form; it is used both
        to build the GitHub URL and as the sub-path under ``repositories/``

    Returns:
        None'''
    repoDir = "repositories/"

    # Guard clause: a directory at the target path means there is nothing to do.
    if os.path.isdir(repoDir + sample):
        return

    clone(f"https://github.com/{sample}.git", repoDir, sample)
In [5]:
@dataclass
class RawData:
    """Record describing a single commit of a repository.

    ``str()`` and ``repr()`` both render the timestamp, sha, message and diff
    (in that order, newline-separated); ``full_path`` is not part of the text.
    """
    # Path of the repository the commit was read from.
    full_path: str
    # Commit time as an ISO-8601 string: get_raw_data stores the result of
    # datetime.isoformat() here, not a datetime object.
    timestamp: str
    # Commit hash.
    sha: str
    # Commit message.
    message: str
    # Patch text of the commit.
    diff: str

    def __str__(self):
        return f"{self.timestamp}\n {self.sha}\n {self.message}\n {self.diff}"

    def __repr__(self):
        # Explicitly defined, so the dataclass-generated repr is not used.
        return self.__str__()

In [6]:
import subprocess

def get_raw_data(repo_path: str, cutoff_date: datetime) -> List[RawData]:
    """Generates a list of RawData objects for each commit in a Git repository.

    The history reachable from HEAD is listed with one line per commit
    (hash, committer timestamp, parent hashes, subject). Commits newer than
    ``cutoff_date`` are skipped; for every remaining commit the patch against
    its first parent is collected. A root commit, which has no parent, is
    diffed as a creation of all its files.

    Args:
    repo_path (str): The path to the repository.
    cutoff_date (datetime): The cutoff date for commits. It is compared with
        timezone-aware UTC datetimes, so it must be timezone-aware as well.

    Returns:
        List[RawData]: A list of RawData objects for each commit. The list is
        empty when ``repo_path`` is not a Git repository or has no commits.
    """
    raw_data_list = []

    if not os.path.exists(os.path.join(repo_path, '.git')):
        print(f"Skipping non-Git directory: {repo_path}")
        return raw_data_list

    try:
        # Check if the repository has any commits
        subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path)

        # One line per commit and no --patch: a custom pretty format emits no
        # "commit <sha>" header to split on, and the patch is fetched per
        # commit below. %s is last because a subject may contain '|'.
        log_output = subprocess.check_output(
            ["git", "log", "--pretty=format:%H|%ct|%P|%s"],
            cwd=repo_path
        ).decode("utf-8", errors="ignore")
    except subprocess.CalledProcessError as e:
        print(f"Error processing {repo_path}: {e}")
        return raw_data_list

    for line in log_output.split('\n'):
        fields = line.split('|', 3)
        if len(fields) != 4:
            # Blank or malformed line; nothing to parse.
            continue
        sha, timestamp, parents, message = fields

        commit_datetime = datetime.fromtimestamp(int(timestamp), tz=pytz.utc)
        if cutoff_date < commit_datetime:
            continue

        if parents:
            # Same comparison as `sha^`: diff against the first parent.
            diff_cmd = ["git", "diff", parents.split()[0], sha]
        else:
            # Root commit: there is no `sha^`, so show it as a creation event.
            diff_cmd = ["git", "diff-tree", "-p", "--root", "--no-commit-id", sha]

        try:
            diff = subprocess.check_output(
                diff_cmd,
                cwd=repo_path
            ).decode("utf-8", errors="ignore")
        except subprocess.CalledProcessError as e:
            # Skip only this commit instead of abandoning the whole repository.
            print(f"Error diffing {sha} in {repo_path}: {e}")
            continue

        raw_data_list.append(
            RawData(
                full_path=repo_path,
                timestamp=commit_datetime.isoformat(),
                sha=sha,
                message=message,
                diff=diff
            )
        )

    return raw_data_list


In [7]:
def get_all_repos_raw_data(
    parent_folder: str,
    cutoff_date: datetime = datetime(2024, 9, 19, tzinfo=pytz.UTC),
    max_workers: int = 12,
) -> List[RawData]:
    """
    Processes all repositories in a parent folder and gathers RawData for each commit.

    Repositories are expected two levels below ``parent_folder``
    (``parent_folder/<sub_dir>/<repo_dir>``); only directories that contain a
    ``.git`` entry are processed. Each repository is handled by
    ``get_raw_data`` on a thread pool.

    Args:
    parent_folder (str): The path to the folder containing all repositories.
    cutoff_date (datetime): Passed to ``get_raw_data`` for every repository.
        Defaults to 2024-09-19 00:00 UTC.
    max_workers (int): Size of the thread pool. Defaults to 12.

    Returns:
    List[RawData]: A combined list of RawData objects from all repositories,
    sorted by commit sha.
    """
    all_raw_data = []

    repo_paths = []
    for sub_dir in os.listdir(parent_folder):
        sub_dir_path = os.path.join(parent_folder, sub_dir)
        if not os.path.isdir(sub_dir_path):
            continue
        for repo_dir in os.listdir(sub_dir_path):
            repo_dir_path = os.path.join(sub_dir_path, repo_dir)
            if os.path.isdir(repo_dir_path) and os.path.exists(os.path.join(repo_dir_path, '.git')):
                repo_paths.append(repo_dir_path)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_repo = {
            executor.submit(get_raw_data, repo, cutoff_date): repo
            for repo in repo_paths
        }
        for future in tqdm(as_completed(future_to_repo), total=len(future_to_repo), desc="Processing repositories"):
            # result() re-raises anything get_raw_data did not handle itself.
            all_raw_data.extend(future.result())

    # Futures complete in arbitrary order; sorting makes the result stable.
    all_raw_data.sort(key=lambda x: x.sha)

    return all_raw_data

In [8]:
# Load the sample list, skipping the first line of the CSV, and keep only the
# rows that have a repository URL.
repos = (
    pd.read_csv('../code_samples.csv', skiprows=1)
    .dropna(subset=['html_url'])
)

In [9]:
# Clone every listed repository. The sample name is "<owner>/<name>": the owner
# is the second-to-last segment of html_url, the name comes from the CSV column.
for i in tqdm(range(repos.shape[0]), desc="Downloading Repositories"):
    repo = repos.iloc[i]
    repo_ecosystem, repo_name = repo['html_url'].split('/')[-2], repo['name']
    sample_name = "{}/{}".format(repo_ecosystem, repo_name)
    download(sample_name)


Downloading Repositories: 100%|██████████| 343/343 [00:00<00:00, 6883.84it/s]


In [10]:
# Bind raw_data first so the name exists even if the scan below raises.
raw_data = dict()
all_raw_data = get_all_repos_raw_data('repositories')
# Index the commits by sha; a sha that occurs more than once keeps its last entry.
raw_data = {entry.sha: entry for entry in all_raw_data}

Processing repositories:  16%|█▋        | 56/343 [00:04<00:17, 16.37it/s]

Error processing repositories\aws-samples\aws-lambda-java-workshop: Command '['git', 'rev-parse', 'HEAD']' returned non-zero exit status 128.


Processing repositories: 100%|██████████| 343/343 [01:12<00:00,  4.75it/s]


In [11]:
# Persist the sha -> RawData mapping. The context manager closes the file even
# if pickling fails part-way through.
with open('raw_data.pkl', 'wb') as p:
    pickle.dump(raw_data, p)

In [12]:
# Read the pickle back so this cell checks what was actually written to disk
# rather than the in-memory dict. The file was produced by this notebook;
# pickle.load must not be used on files from untrusted sources.
with open('raw_data.pkl', 'rb') as f:
    raw_data_loaded = pickle.load(f)

specific_commit = raw_data_loaded.get('6367324cfa1e7154a507e5dbdbb313399b5272b5')
if specific_commit is None:
    print("Commit 6367324cfa1e7154a507e5dbdbb313399b5272b5 not found in raw_data.pkl")
else:
    print(specific_commit.diff)

diff --git a/Asset/ArmTemplate.json b/Asset/ArmTemplate.json
deleted file mode 100644
index 57d234c..0000000
--- a/Asset/ArmTemplate.json
+++ /dev/null
@@ -1,72 +0,0 @@
-﻿{
-  "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
-  "contentVersion": "1.0.0.0",
-  "parameters": {
-    "hostingPlanName": {
-      "type": "string",
-      "defaultValue": ""
-    },
-    "skuName": {
-      "type": "string",
-      "defaultValue": ""
-    },
-    "skuCapacity": {
-      "type": "int",
-      "defaultValue": 0
-    },
-    "webSiteName": {
-      "type": "string",
-      "defaultValue": ""
-    }
-  },
-  "resources": [
-    {
-      "apiVersion": "2015-08-01",
-      "name": "[parameters('hostingPlanName')]",
-      "type": "Microsoft.Web/serverfarms",
-      "location": "[resourceGroup().location]",
-      "tags": {
-        "displayName": "HostingPlan"
-      },
-      "sku": {
-        "name": "[parameters('skuName')]",
-        "capacity": "[para