In [2]:
from git.repo import Repo
from git.diff import Diff
from os.path import exists

def get_repo(repo_url: str) -> Repo:
  repo_path = f"repos/{repo_url.split('/')[-1]}"
  return Repo(repo_path) if exists(repo_path) else Repo.clone_from(repo_url, repo_path)

def sync(repo: Repo) -> Repo:
  repo.remote().pull()
  return repo

def list_files(repo_url):
  # Clone the repository to a local directory
  repo = get_repo(repo_url)
  sync(repo)
  # for tree in repo.iter_trees():
  #   print(tree)
  for commit in repo.iter_commits():
    diffs: list[Diff]  = commit.diff(repo.tree())
    if diffs:
      print(diffs[0].diff)
      break
  # List all the files in the repository
  tree = repo.git.ls_tree("--full-tree", "-r", "HEAD")
  print(tree)
  files = []
  for line in tree.split("\n"):
    # Extract the file name from the output of `git ls-tree`
    parts = line.split()
    if len(parts) > 1:
      files.append(parts[-1])

  return files

# Example usage:
list_files("https://github.com/Foxicution/trello-weekday-list-automation")


100644 blob 757f2edf33314760a29ee426396b59dc48919e73	.idea/.gitignore
100644 blob 105ce2da2d6447d11dfe32bfb846c3d5b199fc99	.idea/inspectionProfiles/profiles_settings.xml
100644 blob 5c9b14c28fb8bae86ce1a3034078b360d02372e1	.idea/misc.xml
100644 blob ed6c156a7ad837cf2c5e67f15a7ff3327265ea0b	.idea/modules.xml
100644 blob 74d515a027de98657e9d3d5f0f1831882fd81374	.idea/trello_list_namer.iml
100644 blob 94a25f7f4cb416c083d265558da75d457237d671	.idea/vcs.xml
100644 blob 6d34db6d8622f44b2569514d21342cb06e19b7ff	LICENSE
100644 blob dd78833c1c066f15c1e3b526bf740e27fc1e9233	README.md
100644 blob f4719b62bb03b4cb7b08351bf831eabe39c703e3	building/build.py
100644 blob 29a2646a764394dc4d267fce3e0cb7b370753c5d	main.py
100644 blob 008aa62795e6a7a43ebe3448d82a5e8d51238121	requirements.txt
100644 blob 57ae357cfe2f2d1e82a83af1c02b4fed5ce73e3d	structs.py
100644 blob a1d674a012674d2b2d37c3735d250e51f521da04	testing/run_tests.bat
100644 blob 92f40cf24bbdc624da3f97f7e556640e84b7fd3b	testing/run_tests.sh
100

['.idea/.gitignore',
 '.idea/inspectionProfiles/profiles_settings.xml',
 '.idea/misc.xml',
 '.idea/modules.xml',
 '.idea/trello_list_namer.iml',
 '.idea/vcs.xml',
 'LICENSE',
 'README.md',
 'building/build.py',
 'main.py',
 'requirements.txt',
 'structs.py',
 'testing/run_tests.bat',
 'testing/run_tests.sh',
 'testing/test.py',
 'testing/test_cases.py']

# Repository processing module

## Repository link processing and encoding

### Encoding
- As seen in the results this takes almost no time ~1k ns = 1e-6 s
- Using % encoding is a very common approach
- The algorithm might become faster in the future as it is very widely used

In [25]:
class timing:
    """Object decorator that times the average execution time of a function"""

    def __init__(self, func):
        self.func = func
        self.times = []
        self.avg_time = 0

    def __call__(self, *args, **kwargs):
        import time

        start = time.perf_counter_ns()
        result = self.func(*args, **kwargs)
        end = time.perf_counter_ns()
        self.times.append(end - start)
        self.avg_time = sum(self.times) / len(self.times)
        return result


def random_string(string_length: int = 10) -> str:
    """Function that generates a random string of fixed length"""
    from random import choice
    from string import ascii_lowercase

    letters = ascii_lowercase
    return "".join(choice(letters) for i in range(string_length))


from urllib.parse import quote


@timing
def to_url(string: str) -> str:
    """Function that converts a string to a url using % encoding"""
    return quote(string, safe="")


from toolz.functoolz import pipe

for i in range(1000):
    pipe(20, random_string, to_url)

print(f"{to_url.avg_time:0.0f} ns")


1056 ns


### Formatting, storing and decoding
- The full encoding and decoding pipeline

In [16]:
from urllib.parse import quote, unquote


# Functions for encoding/decoding strings to urls
def to_url(string: str) -> str:
    """Function that converts a string to a url using % encoding"""
    return quote(string, safe="")


def from_url(url: str) -> str:
    """Function that converts a url to a string using % decoding"""
    return unquote(url)


# Function for storing files
def store(file_name: str) -> str:
    """Temporary represantation of storage function by printing the file name"""
    print(f"File is stored as: {file_name}")
    return file_name


original_name = "Foxicution/repo-review"
print(f"Original name: {original_name}")
decoded_name = pipe(original_name, to_url, store, from_url)
print(f"Decoded name: {decoded_name}")


Original name: Foxicution/repo-review
File is stored as: Foxicution%2Frepo-review
Decoded name: Foxicution/repo-review


### Storage, cloning and pull pipeline
- Using GitPython for speed instead of PyGithub
- Benchmarking (pulling is more important)
    - pulling ~ 0.5 s, cloning ~ 7 s
    - reading a repository from memory is quite quick and the rest depend on the internet speed mostly
    - Not sure how much overhead python introduces here
- Overall happy with the results here

In [29]:
from git.repo import Repo
from os.path import exists


def format_repo_url(repo_url: str) -> str:
    """Function that removes the storage system domain from the repo url"""
    return repo_url.split("github.com/")[-1]


# Check if the formatting works as expected
repo_url = "https://github.com/Foxicution/repo-review"
repo_path = f"repos/{pipe(repo_url, format_repo_url, to_url)}"
print(repo_path)
print(from_url(repo_path))


def pull_repo(repo_path) -> Repo:
    """Function that pulls the latest changes from a repository in a local directory"""
    repo = Repo(repo_path)
    repo.remote().pull()
    return repo


@timing
def get_repo(repo_url: str) -> Repo:
    """Function that clones a repository to a local directory"""
    repo_path = f"repos/{pipe(repo_url, format_repo_url, to_url)}"
    return pull_repo(repo_path) if exists(repo_path) else Repo.clone_from(repo_url, repo_path)


get_repo(repo_url)
print(
    f"Cloning the repo took {get_repo.times[-1]:0.0f} ns = {get_repo.times[-1] / 1e9:0.2f} s"
)
get_repo(repo_url)
print(
    f"Pulling the repo took {get_repo.times[-1]:0.0f} ns = {get_repo.times[-1] / 1e9:0.2f} s"
)
large_repo_url = "https://github.com/python/mypy"
get_repo(large_repo_url)
print(
    f"Cloning the large repo took {get_repo.times[-1]:0.0f} ns = {get_repo.times[-1] / 1e9:0.2f} s"
)
get_repo(repo_url)
print(
    f"Pulling the large repo took {get_repo.times[-1]:0.0f} ns = {get_repo.times[-1] / 1e9:0.2f} s"
)


repos/Foxicution%2Frepo-review
repos/Foxicution/repo-review
Cloning the repo took 7288906527 ns = 7.29 s
Pulling the repo took 408921482 ns = 0.41 s
Cloning the large repo took 7054021202 ns = 7.05 s
Pulling the large repo took 528728527 ns = 0.53 s
