In [1]:
collab=0
if collab:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/ReadMe

# Ollama Step-Up

In [2]:
from IPython.display import clear_output
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
!ollama pull llama3.1:8b
!pip install -U lightrag[ollama]

clear_output()

In [3]:
# Create a Python script to start the Ollama API server in a separate thread

import os
import threading
import subprocess
import requests
import json

def ollama():
    if collab:
        os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
        os.environ['OLLAMA_ORIGINS'] = '*'
        subprocess.Popen(["ollama", "serve"])
    else:
        os.environ['OLLAMA_HOST'] = '127.0.0.1:11434'
        os.environ['OLLAMA_ORIGINS'] = '*'
        subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()
clear_output()

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()
clear_output()

In [4]:
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.core.model_client import ModelClient
from lightrag.components.model_client import OllamaClient, GroqAPIClient

import time


qa_template = r"""<SYS>
You are a helpful assistant.
</SYS>
User: {{input_str}}
You:"""

class SimpleQA(Component):
    def __init__(self, model_client: ModelClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=qa_template,
        )

    def call(self, input: dict) -> str:
        return self.generator.call({"input_str": str(input)})

    async def acall(self, input: dict) -> str:
        return await self.generator.acall({"input_str": str(input)})



In [5]:
from lightrag.components.model_client import OllamaClient
from IPython.display import Markdown, display
model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}
qa = SimpleQA(**model)
output=qa("what is 2+2")
display(Markdown(f"**Answer:** {output.data}"))

**Answer:** The answer to 2 + 2 is 4. Is there anything else I can help you with?

# D


In [6]:
from IPython.display import clear_output
# %pip install nest_asyncio
# %pip install prettytable
# %pip install tqdm
# %pip install -U lightrag[ollama]
# %pip install aiohttp
# %pip install pandas
# %pip install openpyxl
clear_output()

In [7]:
# Example usage
repository_url = "https://github.com/Eemayas/Daraz_Scraper"

### GitHub MetaData Extraction

In [8]:
import nest_asyncio
nest_asyncio.apply()

from dataclasses import dataclass
from typing import Any, Optional, List
import aiohttp
import asyncio
from prettytable import PrettyTable

@dataclass
class Contributor:
    name: str
    profile_url: str
    avatar_url: str

@dataclass
class RepositoryMetadata:
    name: str
    full_name: str
    owner: str
    owner_url: Optional[str]
    description: Optional[str]
    stars_count: int
    forks_count: int
    watchers_count: int
    open_issues_count: int
    default_branch: str
    created_at: str
    updated_at: str
    pushed_at: str
    size_kb: int
    clone_url_http: str
    clone_url_ssh: str
    contributors_url: Optional[str]
    languages_url: str
    issues_url: Optional[str]
    language: Optional[str]
    languages: List[str]
    topics: List[str]
    has_wiki: bool
    has_issues: bool
    has_projects: bool
    is_private: bool
    homepage_url: Optional[str]
    license_name: Optional[str]
    license_url: Optional[str]
    contributors: List[Contributor]

def _parse_repository_metadata(repo_data: dict, contributors: List[Contributor]) -> RepositoryMetadata:
    languages = repo_data.get("languages", {})
    license_info = repo_data.get("license", {}) or {}
    owner_info = repo_data.get("owner", {}) or {}

    return RepositoryMetadata(
        name=repo_data.get("name", ""),
        full_name=repo_data.get("full_name", ""),
        owner=owner_info.get("login", ""),
        owner_url=owner_info.get("html_url", ""),
        description=repo_data.get("description", ""),
        stars_count=repo_data.get("stargazers_count", 0),
        forks_count=repo_data.get("forks_count", 0),
        watchers_count=repo_data.get("watchers_count", 0),
        open_issues_count=repo_data.get("open_issues_count", 0),
        default_branch=repo_data.get("default_branch", ""),
        created_at=repo_data.get("created_at", ""),
        updated_at=repo_data.get("updated_at", ""),
        pushed_at=repo_data.get("pushed_at", ""),
        size_kb=repo_data.get("size", 0),
        clone_url_http=repo_data.get("clone_url", ""),
        clone_url_ssh=repo_data.get("ssh_url", ""),
        contributors_url=repo_data.get("contributors_url"),
        languages_url=repo_data.get("languages_url", ""),
        issues_url=repo_data.get("issues_url"),
        language=repo_data.get("language", ""),
        languages=list(languages.keys()) if languages else [],
        topics=repo_data.get("topics", []),
        has_wiki=repo_data.get("has_wiki", False),
        has_issues=repo_data.get("has_issues", False),
        has_projects=repo_data.get("has_projects", False),
        is_private=repo_data.get("private", False),
        homepage_url=repo_data.get("homepage", ""),
        license_name=license_info.get("name", ""),
        license_url=license_info.get("url", ""),
        contributors=contributors
    )

async def _fetch_repository_metadata(session: aiohttp.ClientSession, url: str) -> dict[str, Any]:
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.json()

async def _fetch_contributors(session: aiohttp.ClientSession, url: str) -> List[Contributor]:
    async with session.get(url) as response:
        response.raise_for_status()
        contributors_data = await response.json()
        return [
            Contributor(
                name=contributor.get("login", ""),
                profile_url=contributor.get("html_url", ""),
                avatar_url=contributor.get("avatar_url", "")
            )
            for contributor in contributors_data
        ]

async def fetch_git_repository_metadata(session: aiohttp.ClientSession, repository_url: str) -> Optional[RepositoryMetadata]:
    api_url = repository_url.replace("https://github.com/", "https://api.github.com/repos/")

    try:
        metadata = await _fetch_repository_metadata(session, api_url)
        contributors_url = metadata.get("contributors_url", "")
        contributors = await _fetch_contributors(session, contributors_url) if contributors_url else []
        return _parse_repository_metadata(metadata, contributors) if metadata else None
    except aiohttp.ClientError as exc:
        print(f"Client error while fetching repository metadata: {exc}")
        return None

def print_metadata(metadata: RepositoryMetadata):
    table = PrettyTable()
    table.field_names = ["Attribute", "Value"]

    table.add_row(["Name", metadata.name])
    table.add_row(["Full Name", metadata.full_name])
    table.add_row(["Owner", metadata.owner])
    table.add_row(["Owner URL", metadata.owner_url])
    table.add_row(["Description", metadata.description])
    table.add_row(["Stars Count", metadata.stars_count])
    table.add_row(["Forks Count", metadata.forks_count])
    table.add_row(["Watchers Count", metadata.watchers_count])
    table.add_row(["Open Issues Count", metadata.open_issues_count])
    table.add_row(["Default Branch", metadata.default_branch])
    table.add_row(["Created At", metadata.created_at])
    table.add_row(["Updated At", metadata.updated_at])
    table.add_row(["Pushed At", metadata.pushed_at])
    table.add_row(["Size (KB)", metadata.size_kb])
    table.add_row(["Clone URL (HTTP)", metadata.clone_url_http])
    table.add_row(["Clone URL (SSH)", metadata.clone_url_ssh])
    table.add_row(["Contributors URL", metadata.contributors_url])
    table.add_row(["Languages URL", metadata.languages_url])
    table.add_row(["Issues URL", metadata.issues_url])
    table.add_row(["Language", metadata.language])
    table.add_row(["Languages", ", ".join(metadata.languages)])
    table.add_row(["Topics", ", ".join(metadata.topics)])
    table.add_row(["Has Wiki", metadata.has_wiki])
    table.add_row(["Has Issues", metadata.has_issues])
    table.add_row(["Has Projects", metadata.has_projects])
    table.add_row(["Is Private", metadata.is_private])
    table.add_row(["Homepage URL", metadata.homepage_url])
    table.add_row(["License Name", metadata.license_name])
    table.add_row(["License URL", metadata.license_url])

    print(table)

    if metadata.contributors:
        contributors_table = PrettyTable()
        contributors_table.field_names = ["Contributor Name", "Profile URL", "Avatar URL"]
        for contributor in metadata.contributors:
            contributors_table.add_row([contributor.name, contributor.profile_url, contributor.avatar_url])
        print(contributors_table)

async def main(repository_url: str):
    async with aiohttp.ClientSession() as session:
        metadata = await fetch_git_repository_metadata(session, repository_url)
        if metadata:
            print_metadata(metadata)

await main(repository_url)

+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     Attribute     |                                                                              Value                                                                              |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|        Name       |                                                                          Daraz_Scraper                                                                          |
|     Full Name     |                                                                      Eemayas/Daraz_Scraper                                                                      |
|       Owner       |                                                           

### Ignore List

In [9]:
ignore_list = [
    # General
    '.git',            # Git repository metadata
    'node_modules',    # Node.js modules
    '.idea',           # JetBrains IDE project files
    '.vscode',         # Visual Studio Code settings
    '__pycache__',     # Python bytecode cache
    '.DS_Store',       # macOS directory metadata
    '.env',            # Environment variable files
    'venv',            # Python virtual environment
    'build',           # Build output directories
    'dist',            # Distribution directories
    'target',          # Output from Java and Rust builds
    '.pytest_cache',   # Pytest cache files
    '*.log',           # Log files
    '*.tmp',           # Temporary files

    # Python
    '*.pyc',           # Compiled Python files
    '.mypy_cache',     # Mypy type checker cache
    '.tox',            # Tox environment

    # JavaScript/Node.js
    'npm-debug.log',   # NPM debug logs
    'yarn-error.log',  # Yarn error logs
    '.parcel-cache',   # Parcel bundler cache
    'coverage',        # Code coverage reports
    '.next',           # Next.js build directory
    'out',             # Output directory for Next.js

    # Java
    '*.class',         # Compiled Java classes
    '*.jar',           # JAR files
    '*.war',           # WAR files
    '.settings',       # Eclipse settings
    '.classpath',      # Eclipse classpath
    '.project',        # Eclipse project file

    # C/C++
    '*.o',             # Object files
    '*.a',             # Static libraries
    '*.so',            # Shared libraries
    '*.out',           # Executable files
    '*.exe',           # Windows executables
    'CMakeFiles',      # CMake build files
    'CMakeCache.txt',  # CMake cache
    '*.dSYM',          # macOS debug symbols
    '*.pdb',           # Windows debug symbols

    # Rust
    '*.rlib',          # Rust libraries
    'Cargo.lock',      # Cargo lock file

    # Go
    'bin',             # Binary output directory
    'pkg',             # Package output directory
    '*.test',          # Go test binaries
    'vendor',          # Vendor directory (if not used)

    # Ruby
    '.bundle',         # Bundler directory
    'vendor/bundle',   # Bundled gems
    'log',             # Log files
    'tmp',             # Temporary files
    '.gem',            # RubyGems metadata

    # PHP
    'vendor',          # Composer dependencies
    '.phpunit.result.cache',  # PHPUnit result cache

    # Android
    '.gradle',         # Gradle files
    '*.apk',           # Android package
    '*.ap_ ',          # Android resources package
    'local.properties', # Android SDK settings

    # .NET/C#
    'bin',             # Binary output directory
    'obj',             # Object files directory
    '*.dll',           # DLL files
    '*.user',          # User settings
    'packages',        # NuGet packages

    # LaTeX
    '*.aux',           # Auxiliary files
    '*.toc',           # Table of contents
    '*.out',           # Auxiliary output files
    '*.synctex.gz',    # SyncTeX file
    '*.fls',           # LaTeX build files
    '*.fdb_latexmk',   # LaTeX build files
]

# Specify the file extensions to ignore
# Specify the file extensions to ignore
ignore_extensions = [
    # Image formats
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.tiff', '.webp', '.heif', '.heic', '.ico', '.raw', '.psd',

    # Audio formats
    '.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.aiff', '.alac', '.pcm',

    # Video formats
    '.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.m4v', '.mpg', '.mpeg', '.3gp', '.ogv', '.rm', '.swf'
]



### Folder Structure

In [10]:
import subprocess
import os
from pathlib import Path
from typing import List, Optional
import asyncio

def print_folder_structure(dir_path: Path, level: int = -1, limit_to_directories: bool = False, length_limit: int = 1000, ignore_list: List[str] = None) -> List[str]:
    """Generate a visual tree structure of the directory contents.

    Args:
        dir_path (Path): The root directory to start the tree from.
        level (int, optional): The depth of recursion. Defaults to -1 (no limit).
        limit_to_directories (bool, optional): If True, only directories are listed. Defaults to False.
        length_limit (int, optional): Limits the number of lines output. Defaults to 1000.
        ignore_list (List[str], optional): A list of directory or file names to ignore. Defaults to None.

    Returns:
        List[str]: A list of strings representing the directory tree structure.
    """
    space = '    '
    branch = '│   '
    tee = '├── '
    last = '└── '
    dir_path = Path(dir_path)  # Ensure dir_path is a Path object
    files = 0
    directories = 0
    output = []

    if ignore_list is None:
        ignore_list = []

    def inner(dir_path: Path, prefix: str = '', level: int = -1):
        nonlocal files, directories
        if level == 0:
            return  # Stop recursion if level is 0
        if limit_to_directories:
            contents = [d for d in dir_path.iterdir() if d.is_dir() and d.name not in ignore_list]
        else:
            contents = [d for d in dir_path.iterdir() if d.name not in ignore_list]
        pointers = [tee] * (len(contents) - 1) + [last]
        for pointer, path in zip(pointers, contents):
            if path.is_dir():
                output.append(prefix + pointer + path.name + "/")
                directories += 1
                extension = branch if pointer == tee else space
                inner(path, prefix=prefix + extension, level=level - 1)
            elif not limit_to_directories:
                output.append(prefix + pointer + path.name)
                files += 1

    # Add the root directory name
    output.append(dir_path.name + "/")
    # Create an iterator from the inner function
    inner(dir_path, level=level)
    # Limit the output by length_limit
    if len(output) > length_limit:
        output = output[:length_limit]
        output.append(f'... length_limit, {length_limit}, reached, counted:')
    # Add the summary of directories and files
    output.append(f'\n{directories} directories' + (f', {files} files' if files else ''))

    return output

async def clone_github_repo(repository_url: str) -> Optional[str]:
    repo_name = repository_url.split('/')[-1]

    if not os.path.exists(repo_name):
        print(f"Cloning repository from {repository_url}...")
        try:
            subprocess.run(['git', 'clone', repository_url], check=True)
            print(f"Repository cloned into {repo_name}/")
            return repo_name
        except subprocess.CalledProcessError as e:
            print(f"Error cloning repository: {e}")
            return None
    else:
        print(f"Repository folder '{repo_name}' already exists. Skipping clone.")
        return repo_name

# Clone the repository
repo_name = await clone_github_repo(repository_url=repository_url)

if repo_name:
    # Print the folder structure
    folder_structure = print_folder_structure(
        dir_path=Path(repo_name),
        ignore_list=ignore_list
    )
    folder_structure_str = "\n".join(folder_structure)
    print(folder_structure_str)
else:
    print("Repository cloning failed or was skipped.")

Repository folder 'Daraz_Scraper' already exists. Skipping clone.
Daraz_Scraper/
├── .env.local.copy
├── .gitignore
├── app/
│   ├── api/
│   │   └── cron/
│   │       └── route.ts
│   ├── favicon.ico
│   ├── globals.css
│   ├── layout.tsx
│   ├── page.tsx
│   └── products/
│       └── [id]/
│           └── page.tsx
├── components/
│   ├── HeroCarousel.tsx
│   ├── Modal.tsx
│   ├── Navbar.tsx
│   ├── PriceInfoCard.tsx
│   ├── ProductCard.tsx
│   └── Searchbar.tsx
├── lib/
│   ├── action/
│   │   └── index.ts
│   ├── models/
│   │   └── product.model.ts
│   ├── mongoose.ts
│   ├── nodemailer/
│   │   └── index.ts
│   ├── scrapper/
│   │   └── index.ts
│   └── utils.ts
├── next.config.js
├── package-lock.json
├── package.json
├── postcss.config.js
├── public/
│   ├── assets/
│   │   ├── icons/
│   │   │   ├── arrow-down.svg
│   │   │   ├── arrow-right.svg
│   │   │   ├── arrow-up.svg
│   │   │   ├── bag.svg
│   │   │   ├── black-heart.svg
│   │   │   ├── bookmark.svg
│   │   │   ├── char

### Summary Generation

In [13]:
import os
from pathlib import Path
from typing import List, Dict
from prettytable import PrettyTable
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.components.model_client import OllamaClient
from tqdm import tqdm

summary_template = r"""<SYS>
You are a summarization assistant specialized in coding files.
</SYS>
Please summarize the following code:
{{input_str}}
Summary:"""

class SummaryQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=summary_template,
        )

    def call(self, input: str) -> str:
        return self.generator.call({"input_str": input})

    async def acall(self, input: str) -> str:
        return await self.generator.acall({"input_str": input})

def generate_summary(path: Path, ignore_list: List[str], qa_component: SummaryQA, ignore_extensions: List[str]) -> List[Dict[str, str]]:
    """Generate a summary of files in the given path using the model."""
    summary = []
    files_to_process = []

    for root, dirs, files in os.walk(path):
        # Get relative path for the current directory
        relative_root = os.path.relpath(root, path)
        
        # Check if the directory should be ignored
        if any(ignored in relative_root.split(os.sep) for ignored in ignore_list):
            continue

        if relative_root == '.':
            summary.append({"file": "Modules", "description": "."})
        else:
            summary.append({"file": relative_root, "description": "Not a File"})
            
        # List files in the current directory
        for file in files:
            file_path = Path(root) / file

            # Check if the file should be ignored
            if any(ignored in file_path.parts for ignored in ignore_list):
                continue

            # Check if the file has an extension that should be skipped
            if file_path.suffix.lower() in ignore_extensions:
                continue
            
            files_to_process.append(file_path)

    # Use tqdm to display progress
    for file_path in tqdm(files_to_process, desc=f"Processing files-{file_path}", unit="file"):
        try:
            # Read file content
            with open(file_path, 'r') as f:
                file_content = f.read()

            # Generate summary using the model
            summary_text = qa_component.call(file_content)
            summary.append({"file": file_path, "description": summary_text})
        except Exception as e:
            summary.append({"file": file_path, "description": f"Error processing file: {str(e)}"})

    return summary

# Create the QA component
model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}
qa = SummaryQA(**model)



if repo_name:
    path = Path(repo_name)
    if not path.is_dir():
        print(f"The path {path} is not a directory.")
    summary = generate_summary(path, ignore_list=ignore_list, qa_component=qa, ignore_extensions=ignore_extensions)
    table = PrettyTable()
    table.field_names = ["File", "Description"]

    for item in summary:
        table.add_row([item["file"], item["description"]])

    print(table)
else:
    print("Repository cloning failed or was skipped.")


Processing files-Daraz_Scraper\types\index.ts: 100%|██████████| 28/28 [02:16<00:00,  4.88s/file]

+--------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------




In [14]:
import pandas as pd
from pathlib import Path
from prettytable import PrettyTable

# Sample function to check if an object has the `data` attribute
def get_description_data(description):
    if hasattr(description, 'data'):
        return description.data
    return description

if summary:
    # Initialize PrettyTable
    table = PrettyTable()
    table.field_names = ["File", "Description"]

    # Create a list to hold data for the DataFrame
    data_for_excel = []

    for item in summary:
        description_data = get_description_data(item["description"])
        table.add_row([item["file"], description_data])
        data_for_excel.append({"File": item["file"], "Description": description_data})

    # Print the PrettyTable
    print(table)

    # Convert the list to a DataFrame
    df = pd.DataFrame(data_for_excel)

    # Define the path and name for the Excel file
    excel_path = 'summary.xlsx'

    # Save DataFrame to an Excel file
    df.to_excel(excel_path, index=False, engine='openpyxl')
    print(f"Summary saved to {excel_path}")

else:
    print("Repository cloning failed or was skipped.")

+--------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                    File                    |                                                                                                                                                          Description                                                                                                                                                           |
+--------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
from prettytable import PrettyTable

# Sample function to check if an object has the `data` attribute
def get_description_data(description):
    if hasattr(description, 'data'):
        return description.data
    return description

def is_empty_or_error(description):
    description_str = get_description_data(description).strip() if isinstance(description, str) else ""
    return not description_str or "HTTP error 401" in description_str

if summary:
    # Initialize PrettyTable
    table = PrettyTable()
    table.field_names = ["File", "Description"]

    for item in summary:
        description_data = get_description_data(item["description"])
        if is_empty_or_error(description_data):
            table.add_row([item["file"], description_data])

    # Print the PrettyTable
    print(table)

else:
    print("Repository cloning failed or was skipped.")


+------+-------------+
| File | Description |
+------+-------------+
+------+-------------+


In [17]:
def generate_summary_for_file(file_path: Path, qa_component: SummaryQA, existing_summaries: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Generate or update a summary for a single file using the model."""
    file_name = file_path.name
    updated = False

    # Check if the file summary already exists
    for summary in existing_summaries:
        if summary["file"] == file_name:
            updated = True
            break

    if not updated:
        existing_summaries.append({"file": file_name, "description": ""})

    try:
        # Read file content
        with open(file_path, 'r') as f:
            file_content = f.read()

        # Generate summary using the model
        summary_text = qa_component.call(file_content)

        # Update the summary in the list
        for summary in existing_summaries:
            if summary["file"] == file_name:
                summary["description"] = summary_text
                break
    except Exception as e:
        for summary in existing_summaries:
            if summary["file"] == file_name:
                summary["description"] = f"Error processing file {file_path}: {str(e)}"
                break

    return existing_summaries

# Initialize an empty list to store summaries
summaries = []

# Prompt the user for a file path
user_input = input("Please enter the path to the file you want to summarize: ")
file_path = Path(user_input)

if file_path.is_file():
    # Generate or update the summary for the specified file
    summaries = generate_summary_for_file(file_path, qa, summaries)

    # Print the summary using PrettyTable
    table = PrettyTable()
    table.field_names = ["File", "Description"]

    for summary in summaries:
        table.add_row([summary["file"], summary["description"]])

    print(table)
else:
    print(f"The path {file_path} is not a valid file.")

The path . is not a valid file.


In [20]:

# Combine summaries, ignoring "Not a File" or error messages
combined_summary = " ".join([
    get_description_data(item['description'])
    for item in summary
    if get_description_data(item['description']) and get_description_data(item['description']) != "Not a File" and get_description_data(item['description']) != "." and not get_description_data(item['description']).startswith("HTTP error 401")
])

combined_summary

'**Environment Variables Summary**\n\nThe provided code snippet defines four environment variables:\n\n1. **BRIGHT_DATA_USERNAME**: Set to `brd-customer-hl_e6252875-zone-unblocker`, likely used for authentication with Bright Data.\n2. **BRIGHT_DATA_PASSWORD**: Set to `lkbgf5g6o63c`, probably the corresponding password for Bright Data authentication.\n3. **MONGODB_URI**: Configured as a MongoDB connection string, pointing to a cluster at `cluster0.vejstch.mongodb.net`. It includes authentication details with username `prashantmanandhar2002` and password `Eemayas123`.\n4. **EMAIL_USER** and **EMAIL_PASSWORD**: Set to `prashantmanandhar2002@gmail.com` and `earthisclosedtoday@123`, respectively, likely used for email account credentials.\n\nThese environment variables can be used in a variety of contexts, such as web development projects or scripts that interact with external services. Here is a summary of the code:\n\n**Ignored Files**\n\nThis file contains a list of files and directories

In [21]:
print(combined_summary)

**Environment Variables Summary**

The provided code snippet defines four environment variables:

1. **BRIGHT_DATA_USERNAME**: Set to `brd-customer-hl_e6252875-zone-unblocker`, likely used for authentication with Bright Data.
2. **BRIGHT_DATA_PASSWORD**: Set to `lkbgf5g6o63c`, probably the corresponding password for Bright Data authentication.
3. **MONGODB_URI**: Configured as a MongoDB connection string, pointing to a cluster at `cluster0.vejstch.mongodb.net`. It includes authentication details with username `prashantmanandhar2002` and password `Eemayas123`.
4. **EMAIL_USER** and **EMAIL_PASSWORD**: Set to `prashantmanandhar2002@gmail.com` and `earthisclosedtoday@123`, respectively, likely used for email account credentials.

These environment variables can be used in a variety of contexts, such as web development projects or scripts that interact with external services. Here is a summary of the code:

**Ignored Files**

This file contains a list of files and directories to be ignored

In [16]:
import os
from pathlib import Path
from typing import List, Dict
from prettytable import PrettyTable
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.components.model_client import OllamaClient
from tqdm import tqdm

summary_template = r"""<SYS>
You are a summarization assistant specialized in coding files.
</SYS>
Please summarize the following code:
{{input_str}}
Summary:"""

project_overview_template = r"""<SYS>
You are a summarization assistant specialized in project documentation.
</SYS>
Based on the provided file summaries:
{{input_str}}, 

Generate a concise and descriptive one-paragraph overview of the project, including:
1. What the project is about.
2. What the project does.
3. The technologies used.
4. The key features of the project.
5. Detailed instructions on how to run the code, including necessary commands and setup.
Summary:"""

class SummaryQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=summary_template,
        )

    def call(self, input: str) -> str:
        return self.generator.call({"input_str": input})

    async def acall(self, input: str) -> str:
        return await self.generator.acall({"input_str": input})
    
class OverviewQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=project_overview_template,
        )

    def call(self, summaries: str) -> str:
        return self.generator.call({"input_str": summaries})

    async def acall(self, summaries: str) -> str:
        return await self.generator.acall({"input_str": summaries})

def generate_summary(path: Path, ignore_list: List[str], qa_component: SummaryQA, ignore_extensions: List[str]) -> List[Dict[str, str]]:
    """Generate a summary of files in the given path using the model."""
    summary = []
    files_to_process = []

    for root, dirs, files in os.walk(path):
        # Get relative path for the current directory
        relative_root = os.path.relpath(root, path)
        
        # Check if the directory should be ignored
        if any(ignored in relative_root.split(os.sep) for ignored in ignore_list):
            continue

        if relative_root == '.':
            summary.append({"file": "Modules", "description": "."})
        else:
            summary.append({"file": relative_root, "description": "Not a File"})
            
        # List files in the current directory
        for file in files:
            file_path = Path(root) / file

            # Check if the file should be ignored
            if any(ignored in file_path.parts for ignored in ignore_list):
                continue

            # Check if the file has an extension that should be skipped
            if file_path.suffix.lower() in ignore_extensions:
                continue
            
            files_to_process.append(file_path)

    # Use tqdm to display progress
    for file_path in tqdm(files_to_process, desc=f"Processing files", unit="file"):
        try:
            # Read file content
            with open(file_path, 'r') as f:
                file_content = f.read()

            # Generate summary using the model
            summary_text = qa_component.call(file_content)
            summary.append({"file": file_path, "description": summary_text})
        except Exception as e:
            summary.append({"file": file_path, "description": f"Error processing file: {str(e)}"})

    return summary

def generate_project_overview(summaries: List[Dict[str, str]], fa_component: OverviewQA) -> str:
    """Generate a concise and descriptive overview of the project based on file summaries."""
    # Combine all file summaries into a single input string for the overview
    combined_summary = " ".join([
        get_description_data(item['description'])
        for item in summary
        if get_description_data(item['description']) and get_description_data(item['description']) != "Not a File" and get_description_data(item['description']) != "." and not get_description_data(item['description']).startswith("HTTP error 401")
    ])
    
    # Generate the project overview using the model
    overview = fa_component.generator.call({"input_str": combined_summary})
    
    return overview

# Create the QA component
model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}
qa = SummaryQA(**model)
fa = OverviewQA(**model)

if repo_name:
    path = Path(repo_name)
    if not path.is_dir():
        print(f"The path {path} is not a directory.")
    # summary = generate_summary(path, ignore_list=ignore_list, qa_component=qa, ignore_extensions=ignore_extensions)

    # Generate the project overview
    project_overview = generate_project_overview(summary, fa_component=fa)

    # Display the summaries in a table
    table = PrettyTable()
    table.field_names = ["File", "Description"]

    for item in summary:
        table.add_row([item["file"], item["description"]])

    print(table)

    # Print the project overview
    print("\nProject Overview:")
    print(get_description_data(project_overview) )

else:
    print("Repository cloning failed or was skipped.")


+--------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------