In [1]:
%pip install nest_asyncio
%pip install prettytable

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
# Example usage
# HTTPS URL of the GitHub repository that the later cells inspect and clone.
repository_url = "https://github.com/Eemayas/Daraz_Scraper"
# NOTE(review): `repo_clone` is not read by any of the visible cells —
# presumably a toggle for the cloning step; confirm its consumer or remove it.
repo_clone=True

In [3]:
import nest_asyncio
# Patch asyncio so the event loop can be re-entered; presumably needed because
# the notebook kernel already runs a loop when these cells execute.
nest_asyncio.apply()

from dataclasses import dataclass
from typing import Any, Optional, List
import aiohttp
import asyncio
from prettytable import PrettyTable

@dataclass
class Contributor:
    """One repository contributor, built from an entry of the contributors payload."""
    name: str  # value of the "login" key
    profile_url: str  # value of the "html_url" key
    avatar_url: str  # value of the "avatar_url" key

@dataclass
class RepositoryMetadata:
    """Flattened view of a repository payload plus its list of contributors.

    Instances are produced by ``_parse_repository_metadata``; the comments
    below name the payload key each field is read from.
    """
    name: str  # "name"
    full_name: str  # "full_name"
    owner: str  # "login" of the nested "owner" object
    owner_url: Optional[str]  # "html_url" of the nested "owner" object
    description: Optional[str]  # "description"
    stars_count: int  # "stargazers_count"
    forks_count: int  # "forks_count"
    watchers_count: int  # "watchers_count"
    open_issues_count: int  # "open_issues_count"
    default_branch: str  # "default_branch"
    created_at: str  # "created_at", kept as the raw string from the payload
    updated_at: str  # "updated_at", kept as the raw string from the payload
    pushed_at: str  # "pushed_at", kept as the raw string from the payload
    size_kb: int  # "size" (displayed under the label "Size (KB)")
    clone_url_http: str  # "clone_url"
    clone_url_ssh: str  # "ssh_url"
    contributors_url: Optional[str]  # "contributors_url"
    languages_url: str  # "languages_url"
    issues_url: Optional[str]  # "issues_url"
    language: Optional[str]  # "language"
    languages: List[str]  # keys of the "languages" mapping, if the payload has one
    topics: List[str]  # "topics"
    has_wiki: bool  # "has_wiki"
    has_issues: bool  # "has_issues"
    has_projects: bool  # "has_projects"
    is_private: bool  # "private"
    homepage_url: Optional[str]  # "homepage"
    license_name: Optional[str]  # "name" of the nested "license" object
    license_url: Optional[str]  # "url" of the nested "license" object
    contributors: List[Contributor]  # supplied separately by the caller

def _parse_repository_metadata(repo_data: dict, contributors: List[Contributor]) -> RepositoryMetadata:
    """Convert a raw repository payload into a ``RepositoryMetadata`` record.

    Args:
        repo_data: Decoded JSON object describing the repository.
        contributors: Contributors to attach to the record unchanged.

    Returns:
        A ``RepositoryMetadata`` whose fields are read from ``repo_data``,
        using empty/zero/False defaults for keys that are absent.
    """
    # ``dict.get(key, default)`` only applies the default when the key is
    # missing; a JSON ``null`` would still come through as ``None``.  The
    # container-valued keys therefore use ``or`` so that later code can
    # iterate / join them safely.
    languages = repo_data.get("languages") or {}
    license_info = repo_data.get("license") or {}
    owner_info = repo_data.get("owner") or {}

    return RepositoryMetadata(
        name=repo_data.get("name", ""),
        full_name=repo_data.get("full_name", ""),
        owner=owner_info.get("login", ""),
        owner_url=owner_info.get("html_url", ""),
        description=repo_data.get("description", ""),
        stars_count=repo_data.get("stargazers_count", 0),
        forks_count=repo_data.get("forks_count", 0),
        watchers_count=repo_data.get("watchers_count", 0),
        open_issues_count=repo_data.get("open_issues_count", 0),
        default_branch=repo_data.get("default_branch", ""),
        created_at=repo_data.get("created_at", ""),
        updated_at=repo_data.get("updated_at", ""),
        pushed_at=repo_data.get("pushed_at", ""),
        size_kb=repo_data.get("size", 0),
        clone_url_http=repo_data.get("clone_url", ""),
        clone_url_ssh=repo_data.get("ssh_url", ""),
        contributors_url=repo_data.get("contributors_url"),
        languages_url=repo_data.get("languages_url", ""),
        issues_url=repo_data.get("issues_url"),
        language=repo_data.get("language", ""),
        languages=list(languages),
        # A null "topics" must become an empty list: the field is joined with
        # ", " when the metadata is printed.
        topics=repo_data.get("topics") or [],
        has_wiki=repo_data.get("has_wiki", False),
        has_issues=repo_data.get("has_issues", False),
        has_projects=repo_data.get("has_projects", False),
        is_private=repo_data.get("private", False),
        homepage_url=repo_data.get("homepage", ""),
        license_name=license_info.get("name", ""),
        license_url=license_info.get("url", ""),
        contributors=contributors
    )

async def _fetch_repository_metadata(session: aiohttp.ClientSession, url: str) -> dict[str, Any]:
    """GET ``url`` with ``session`` and return the decoded JSON body.

    Raises whatever ``raise_for_status`` raises for an error status.
    """
    async with session.get(url) as resp:
        resp.raise_for_status()
        payload = await resp.json()
        return payload

async def _fetch_contributors(session: aiohttp.ClientSession, url: str) -> List[Contributor]:
    """GET ``url`` and turn each entry of the JSON body into a ``Contributor``.

    Missing "login", "html_url" or "avatar_url" keys become empty strings.
    Raises whatever ``raise_for_status`` raises for an error status.
    """
    async with session.get(url) as resp:
        resp.raise_for_status()
        entries = await resp.json()
        people: List[Contributor] = []
        for entry in entries:
            login = entry.get("login", "")
            profile = entry.get("html_url", "")
            avatar = entry.get("avatar_url", "")
            people.append(Contributor(name=login, profile_url=profile, avatar_url=avatar))
        return people

async def fetch_git_repository_metadata(session: aiohttp.ClientSession, repository_url: str) -> Optional[RepositoryMetadata]:
    """Fetch a GitHub repository's metadata and contributors.

    Args:
        session: Open aiohttp session used for both requests.
        repository_url: Browser URL of the repository, e.g.
            ``https://github.com/<owner>/<repo>``.  A trailing ``/`` or
            ``.git`` suffix is tolerated.

    Returns:
        The parsed ``RepositoryMetadata``, or ``None`` when the payload is
        empty or an ``aiohttp.ClientError`` occurs (the error is printed).
    """
    # Clone-style URLs ("…/repo.git", "…/repo/") would otherwise be mapped to
    # an API path that does not exist.
    normalized_url = repository_url.strip().rstrip("/").removesuffix(".git")
    api_url = normalized_url.replace("https://github.com/", "https://api.github.com/repos/", 1)

    try:
        metadata = await _fetch_repository_metadata(session, api_url)
        # Check for an empty payload *before* reading keys from it.
        if not metadata:
            return None
        contributors_url = metadata.get("contributors_url", "")
        contributors = await _fetch_contributors(session, contributors_url) if contributors_url else []
        return _parse_repository_metadata(metadata, contributors)
    except aiohttp.ClientError as exc:
        print(f"Client error while fetching repository metadata: {exc}")
        return None

def print_metadata(metadata: RepositoryMetadata):
    """Print the repository attributes as a two-column table.

    A second table listing the contributors follows when there are any.
    """
    attribute_rows = [
        ("Name", metadata.name),
        ("Full Name", metadata.full_name),
        ("Owner", metadata.owner),
        ("Owner URL", metadata.owner_url),
        ("Description", metadata.description),
        ("Stars Count", metadata.stars_count),
        ("Forks Count", metadata.forks_count),
        ("Watchers Count", metadata.watchers_count),
        ("Open Issues Count", metadata.open_issues_count),
        ("Default Branch", metadata.default_branch),
        ("Created At", metadata.created_at),
        ("Updated At", metadata.updated_at),
        ("Pushed At", metadata.pushed_at),
        ("Size (KB)", metadata.size_kb),
        ("Clone URL (HTTP)", metadata.clone_url_http),
        ("Clone URL (SSH)", metadata.clone_url_ssh),
        ("Contributors URL", metadata.contributors_url),
        ("Languages URL", metadata.languages_url),
        ("Issues URL", metadata.issues_url),
        ("Language", metadata.language),
        ("Languages", ", ".join(metadata.languages)),
        ("Topics", ", ".join(metadata.topics)),
        ("Has Wiki", metadata.has_wiki),
        ("Has Issues", metadata.has_issues),
        ("Has Projects", metadata.has_projects),
        ("Is Private", metadata.is_private),
        ("Homepage URL", metadata.homepage_url),
        ("License Name", metadata.license_name),
        ("License URL", metadata.license_url),
    ]

    overview = PrettyTable()
    overview.field_names = ["Attribute", "Value"]
    for label, value in attribute_rows:
        overview.add_row([label, value])
    print(overview)

    # Nothing more to show for repositories without contributor data.
    if not metadata.contributors:
        return

    people = PrettyTable()
    people.field_names = ["Contributor Name", "Profile URL", "Avatar URL"]
    for person in metadata.contributors:
        people.add_row([person.name, person.profile_url, person.avatar_url])
    print(people)

async def main(repository_url: str):
    """Fetch the metadata for ``repository_url`` and print it when available."""
    async with aiohttp.ClientSession() as http_session:
        repo_info = await fetch_git_repository_metadata(http_session, repository_url)
        if not repo_info:
            return
        print_metadata(repo_info)

# Top-level await: presumably relies on the notebook kernel running the cell
# inside an event loop — in a plain script this would need asyncio.run(...).
await main(repository_url)

+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     Attribute     |                                                                              Value                                                                              |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|        Name       |                                                                          Daraz_Scraper                                                                          |
|     Full Name     |                                                                      Eemayas/Daraz_Scraper                                                                      |
|       Owner       |                                                           

In [5]:
# Names and glob-style patterns of files/directories that should be left out
# when the repository tree is listed or summarised.
# NOTE: entries containing `*` are patterns; a consumer that compares names by
# plain equality will never match them.  Some names appear under more than one
# language heading on purpose — duplicates are harmless for membership tests.
ignore_list = [
    # General
    '.git',            # Git repository metadata
    'node_modules',    # Node.js modules
    '.idea',           # JetBrains IDE project files
    '.vscode',         # Visual Studio Code settings
    '__pycache__',     # Python bytecode cache
    '.DS_Store',       # macOS directory metadata
    '.env',            # Environment variable files
    'venv',            # Python virtual environment
    'build',           # Build output directories
    'dist',            # Distribution directories
    'target',          # Output from Java and Rust builds
    '.pytest_cache',   # Pytest cache files
    '*.log',           # Log files
    '*.tmp',           # Temporary files

    # Python
    '*.pyc',           # Compiled Python files
    '.mypy_cache',     # Mypy type checker cache
    '.tox',            # Tox environment

    # JavaScript/Node.js
    'npm-debug.log',   # NPM debug logs
    'yarn-error.log',  # Yarn error logs
    '.parcel-cache',   # Parcel bundler cache
    'coverage',        # Code coverage reports
    '.next',           # Next.js build directory
    'out',             # Output directory for Next.js

    # Java
    '*.class',         # Compiled Java classes
    '*.jar',           # JAR files
    '*.war',           # WAR files
    '.settings',       # Eclipse settings
    '.classpath',      # Eclipse classpath
    '.project',        # Eclipse project file

    # C/C++
    '*.o',             # Object files
    '*.a',             # Static libraries
    '*.so',            # Shared libraries
    '*.out',           # Executable files
    '*.exe',           # Windows executables
    'CMakeFiles',      # CMake build files
    'CMakeCache.txt',  # CMake cache
    '*.dSYM',          # macOS debug symbols
    '*.pdb',           # Windows debug symbols

    # Rust
    '*.rlib',          # Rust libraries
    'Cargo.lock',      # Cargo lock file

    # Go
    'bin',             # Binary output directory
    'pkg',             # Package output directory
    '*.test',          # Go test binaries
    'vendor',          # Vendor directory (if not used)

    # Ruby
    '.bundle',         # Bundler directory
    'vendor/bundle',   # Bundled gems
    'log',             # Log files
    'tmp',             # Temporary files
    '.gem',            # RubyGems metadata

    # PHP
    'vendor',          # Composer dependencies
    '.phpunit.result.cache',  # PHPUnit result cache

    # Android
    '.gradle',         # Gradle files
    '*.apk',           # Android package
    '*.ap_',           # Android resources package (a trailing space here would match nothing)
    'local.properties', # Android SDK settings

    # .NET/C#
    'bin',             # Binary output directory
    'obj',             # Object files directory
    '*.dll',           # DLL files
    '*.user',          # User settings
    'packages',        # NuGet packages

    # LaTeX
    '*.aux',           # Auxiliary files
    '*.toc',           # Table of contents
    '*.out',           # Auxiliary output files
    '*.synctex.gz',    # SyncTeX file
    '*.fls',           # LaTeX build files
    '*.fdb_latexmk',   # LaTeX build files
]


In [6]:
import subprocess
import os
from pathlib import Path
from typing import List, Optional
import asyncio

def print_folder_structure(dir_path: Path, level: int = -1, limit_to_directories: bool = False, length_limit: int = 1000, ignore_list: Optional[List[str]] = None) -> List[str]:
    """Generate a visual tree structure of the directory contents.

    Despite the name, nothing is printed: the lines are returned.

    Args:
        dir_path (Path): The root directory to start the tree from.
        level (int, optional): The depth of recursion. Defaults to -1 (no limit).
        limit_to_directories (bool, optional): If True, only directories are listed. Defaults to False.
        length_limit (int, optional): Limits the number of lines output. Defaults to 1000.
        ignore_list (List[str], optional): Names to skip. An entry matches a
            file or directory either by exact name or as a glob-style
            pattern (e.g. ``*.log``). Defaults to None (ignore nothing).

    Returns:
        List[str]: The root name, one line per listed entry (sorted by name
        within each directory), an optional truncation notice, and a final
        summary line with the directory/file counts.
    """
    # Local import keeps this block self-contained.
    from fnmatch import fnmatchcase

    space = '    '
    branch = '│   '
    tee = '├── '
    last = '└── '
    dir_path = Path(dir_path)  # Ensure dir_path is a Path object
    files = 0
    directories = 0
    output = []
    patterns = list(ignore_list) if ignore_list else []

    def is_ignored(name: str) -> bool:
        # Exact comparison first so literal names containing glob
        # metacharacters (e.g. "[id]") keep matching themselves.
        return name in patterns or any(fnmatchcase(name, pattern) for pattern in patterns)

    def inner(current: Path, prefix: str = '', level: int = -1):
        nonlocal files, directories
        if level == 0:
            return  # Stop recursion if level is 0
        # Sort so the listing does not depend on the OS directory order.
        contents = sorted(
            (
                entry for entry in current.iterdir()
                if not is_ignored(entry.name)
                and (not limit_to_directories or entry.is_dir())
            ),
            key=lambda entry: (entry.name.lower(), entry.name),
        )
        last_index = len(contents) - 1
        for index, path in enumerate(contents):
            is_last = index == last_index
            pointer = last if is_last else tee
            if path.is_dir():
                output.append(prefix + pointer + path.name + "/")
                directories += 1
                # Children of the last entry get blank padding; others keep
                # the vertical guide line.
                extension = space if is_last else branch
                inner(path, prefix=prefix + extension, level=level - 1)
            elif not limit_to_directories:
                output.append(prefix + pointer + path.name)
                files += 1

    # Add the root directory name
    output.append(dir_path.name + "/")
    inner(dir_path, level=level)
    # Limit the output by length_limit (counts still cover the full walk)
    if len(output) > length_limit:
        output = output[:length_limit]
        output.append(f'... length_limit, {length_limit}, reached, counted:')
    # Add the summary of directories and files
    output.append(f'\n{directories} directories' + (f', {files} files' if files else ''))

    return output

async def clone_github_repo(repository_url: str) -> Optional[str]:
    """Clone ``repository_url`` into the current directory unless already present.

    The target folder name is the last path segment of the URL, with any
    trailing ``/`` and ``.git`` suffix removed.

    Args:
        repository_url: URL passed to ``git clone``.

    Returns:
        The folder name on success or when the folder already exists;
        ``None`` when no name can be derived or ``git clone`` fails.
    """
    # "…/repo/" would otherwise yield an empty name and "…/repo.git" a name
    # that never matches the folder git actually creates.
    repo_name = repository_url.strip().rstrip('/').split('/')[-1].removesuffix('.git')
    if not repo_name:
        print(f"Error cloning repository: cannot derive a folder name from {repository_url!r}")
        return None

    if os.path.exists(repo_name):
        print(f"Repository folder '{repo_name}' already exists. Skipping clone.")
        return repo_name

    print(f"Cloning repository from {repository_url}...")
    try:
        # NOTE: subprocess.run blocks the event loop until git finishes.
        # The explicit target keeps the created folder and the returned name in sync.
        subprocess.run(['git', 'clone', repository_url, repo_name], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error cloning repository: {e}")
        return None
    print(f"Repository cloned into {repo_name}/")
    return repo_name

# Clone the repository
repo_name = await clone_github_repo(repository_url=repository_url)

if repo_name:
    # Print the folder structure
    folder_structure = print_folder_structure(
        dir_path=Path(repo_name),
        ignore_list=ignore_list
    )
    folder_structure_str = "\n".join(folder_structure)
    print(folder_structure_str)
else:
    print("Repository cloning failed or was skipped.")

Repository folder 'Daraz_Scraper' already exists. Skipping clone.
Daraz_Scraper/
├── .env.local.copy
├── .gitignore
├── app/
│   ├── api/
│   │   └── cron/
│   │       └── route.ts
│   ├── favicon.ico
│   ├── globals.css
│   ├── layout.tsx
│   ├── page.tsx
│   └── products/
│       └── [id]/
│           └── page.tsx
├── components/
│   ├── HeroCarousel.tsx
│   ├── Modal.tsx
│   ├── Navbar.tsx
│   ├── PriceInfoCard.tsx
│   ├── ProductCard.tsx
│   └── Searchbar.tsx
├── lib/
│   ├── action/
│   │   └── index.ts
│   ├── models/
│   │   └── product.model.ts
│   ├── mongoose.ts
│   ├── nodemailer/
│   │   └── index.ts
│   ├── scrapper/
│   │   └── index.ts
│   └── utils.ts
├── next.config.js
├── package-lock.json
├── package.json
├── postcss.config.js
├── public/
│   ├── assets/
│   │   ├── icons/
│   │   │   ├── arrow-down.svg
│   │   │   ├── arrow-right.svg
│   │   │   ├── arrow-up.svg
│   │   │   ├── bag.svg
│   │   │   ├── black-heart.svg
│   │   │   ├── bookmark.svg
│   │   │   ├── char

In [7]:
import os
from pathlib import Path
from prettytable import PrettyTable

def generate_summary(path: Path, ignore_list: List[str]) -> list[dict[str, str]]:
    """Generate a summary of files in the given path.

    Args:
        path: Root directory to walk.
        ignore_list: File/directory names to skip; an entry matches by exact
            name or as a glob-style pattern (e.g. ``*.log``).  Ignored
            directories are not descended into.

    Returns:
        One ``{"file", "description"}`` dict per visited directory (the root
        is reported as ``"Modules"``) followed by one per kept file, with
        directories and files in name order.
    """
    # Local import keeps this block self-contained.
    from fnmatch import fnmatchcase

    def is_ignored(name: str) -> bool:
        # Exact comparison first so literal names containing glob
        # metacharacters keep matching themselves.
        return name in ignore_list or any(fnmatchcase(name, pattern) for pattern in ignore_list)

    summary = []
    for root, dirs, files in os.walk(path):
        # Prune in place so os.walk never enters ignored directories
        # (e.g. ".git"); sorting makes the traversal order deterministic.
        dirs[:] = sorted(d for d in dirs if not is_ignored(d))

        # Get relative path for the current directory
        relative_root = os.path.relpath(root, path)

        if relative_root == '.':
            summary.append({"file": "Modules", "description": "."})
        else:
            summary.append({"file": relative_root, "description": ""})

        # List files in the current directory
        for file in sorted(files):
            if is_ignored(file):
                continue
            file_path = Path(root) / file
            # NOTE(review): the description text looks like a placeholder left
            # from a failed request — confirm what should be generated here.
            summary.append({"file": file, "description": f"HTTP error 401 for prompt {file_path}"})

    return summary



def generate_summaryv2(path: Path, ignore_list: List[str], qa_component: "SummaryQA") -> list[dict[str, str]]:
    """Generate a summary of files in the given path using the model.

    Args:
        path: Root directory to walk.
        ignore_list: File/directory names to skip; an entry matches by exact
            name or as a glob-style pattern.  Matching is applied only to
            names *below* ``path``, never to the components of ``path``
            itself.
        qa_component: Accepted for the summarising model; it is not used by
            the current body.  The annotation is quoted so the definition
            does not depend on ``SummaryQA`` being defined at this point.

    Returns:
        One ``{"file", "description"}`` dict per visited directory (the root
        is reported as ``"Modules"``) followed by one per kept file, with
        directories and files in name order.
    """
    # Local import keeps this block self-contained.
    from fnmatch import fnmatchcase

    def is_ignored(name: str) -> bool:
        return name in ignore_list or any(fnmatchcase(name, pattern) for pattern in ignore_list)

    summary = []

    for root, dirs, files in os.walk(path):
        # Prune ignored directories in place: os.walk then skips their whole
        # subtree, so every directory reached here is already allowed.
        dirs[:] = sorted(d for d in dirs if not is_ignored(d))

        # Get relative path for the current directory
        relative_root = os.path.relpath(root, path)

        # Add the directory to the summary
        if relative_root == '.':
            summary.append({"file": "Modules", "description": "."})
        else:
            summary.append({"file": relative_root, "description": ""})

        # Process files in the current directory
        for file in sorted(files):
            # Only the file's own name is tested; testing every part of the
            # full path would drop all files when the root folder itself is
            # called e.g. "out" or "build".
            if is_ignored(file):
                continue

            file_path = Path(root) / file
            # NOTE(review): the description text looks like a placeholder left
            # from a failed request — confirm what should be generated here.
            summary.append({"file": file, "description": f"HTTP error 401 for prompt {file_path}"})

    return summary


# Summarise the checked-out repository as a File/Description table.
if repo_name:
    path = Path(repo_name)
    if not path.is_dir():
        # Stop here: walking a non-directory would only print an empty table.
        print(f"The path {path} is not a directory.")
    else:
        summary = generate_summary(path, ignore_list=ignore_list)
        table = PrettyTable()
        table.field_names = ["File", "Description"]

        for item in summary:
            table.add_row([item["file"], item["description"]])

        print(table)
else:
    print("Repository cloning failed or was skipped.")


+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------+
|                        File                        |                                                 Description                                                  |
+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------+
|                      Modules                       |                                                      .                                                       |
|                  .env.local.copy                   |                           HTTP error 401 for prompt Daraz_Scraper\.env.local.copy                            |
|                     .gitignore                     |                              HTTP error 401 for prompt Daraz_Scraper\.gitignore                              |
|   