In [None]:
collab=1
if collab:
  from google.colab import drive
  drive.mount('/content/drive')
  # %cd /content/drive/MyDrive/ReadMe

<!-- @format -->

# Ollama Step-Up


In [122]:
from IPython.display import clear_output
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
!ollama pull llama3.1:8b
!pip install -U lightrag[ollama]

clear_output()

In [123]:
# Create a Python script to start the Ollama API server in a separate thread

import os
import threading
import subprocess
import requests
import json


def ollama():
    if collab:
        os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
        os.environ['OLLAMA_ORIGINS'] = '*'
        subprocess.Popen(["ollama", "serve"])
    else:
        os.environ['OLLAMA_HOST'] = '127.0.0.1:11434'
        os.environ['OLLAMA_ORIGINS'] = '*'
        subprocess.Popen(["ollama", "serve"])


ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()
clear_output()

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()
clear_output()

In [124]:
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.core.model_client import ModelClient
from lightrag.components.model_client import OllamaClient, GroqAPIClient

import time


qa_template = r"""<SYS>
You are a helpful assistant.
</SYS>
User: {{input_str}}
You:"""


class SimpleQA(Component):
    def __init__(self, model_client: ModelClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=qa_template,
        )

    def call(self, input: dict) -> str:
        return self.generator.call({"input_str": str(input)})

    async def acall(self, input: dict) -> str:
        return await self.generator.acall({"input_str": str(input)})

In [125]:
from lightrag.components.model_client import OllamaClient
from IPython.display import Markdown, display
model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}
qa = SimpleQA(**model)
output = qa("what is 2+2")
display(Markdown(f"**Answer:** {output.data}"))

**Answer:** The answer to 2+2 is 4. Is there anything else I can help you with?

<!-- @format -->

# D


In [None]:
from IPython.display import clear_output
# %pip install nest_asyncio
# %pip install prettytable
# %pip install tqdm
# %pip install -U lightrag[ollama]
# %pip install aiohttp
# %pip install pandas
# %pip install openpyxl
clear_output()

In [None]:
# Example usage
repository_url = "https://github.com/Eemayas/Daraz_Scraper"

<!-- @format -->

### GitHub MetaData Extraction


In [None]:
from prettytable import PrettyTable
import asyncio
import aiohttp
from typing import Any, Optional, List
from dataclasses import dataclass
import nest_asyncio
nest_asyncio.apply()


@dataclass
class Contributor:
    name: str
    profile_url: str
    avatar_url: str


@dataclass
class RepositoryMetadata:
    name: str
    full_name: str
    owner: str
    owner_url: Optional[str]
    description: Optional[str]
    stars_count: int
    forks_count: int
    watchers_count: int
    open_issues_count: int
    default_branch: str
    created_at: str
    updated_at: str
    pushed_at: str
    size_kb: int
    clone_url_http: str
    clone_url_ssh: str
    contributors_url: Optional[str]
    languages_url: str
    issues_url: Optional[str]
    language: Optional[str]
    languages: List[str]
    topics: List[str]
    has_wiki: bool
    has_issues: bool
    has_projects: bool
    is_private: bool
    homepage_url: Optional[str]
    license_name: Optional[str]
    license_url: Optional[str]
    contributors: List[Contributor]


def _parse_repository_metadata(repo_data: dict, contributors: List[Contributor]) -> RepositoryMetadata:
    languages = repo_data.get("languages", {})
    license_info = repo_data.get("license", {}) or {}
    owner_info = repo_data.get("owner", {}) or {}

    return RepositoryMetadata(
        name=repo_data.get("name", ""),
        full_name=repo_data.get("full_name", ""),
        owner=owner_info.get("login", ""),
        owner_url=owner_info.get("html_url", ""),
        description=repo_data.get("description", ""),
        stars_count=repo_data.get("stargazers_count", 0),
        forks_count=repo_data.get("forks_count", 0),
        watchers_count=repo_data.get("watchers_count", 0),
        open_issues_count=repo_data.get("open_issues_count", 0),
        default_branch=repo_data.get("default_branch", ""),
        created_at=repo_data.get("created_at", ""),
        updated_at=repo_data.get("updated_at", ""),
        pushed_at=repo_data.get("pushed_at", ""),
        size_kb=repo_data.get("size", 0),
        clone_url_http=repo_data.get("clone_url", ""),
        clone_url_ssh=repo_data.get("ssh_url", ""),
        contributors_url=repo_data.get("contributors_url"),
        languages_url=repo_data.get("languages_url", ""),
        issues_url=repo_data.get("issues_url"),
        language=repo_data.get("language", ""),
        languages=list(languages.keys()) if languages else [],
        topics=repo_data.get("topics", []),
        has_wiki=repo_data.get("has_wiki", False),
        has_issues=repo_data.get("has_issues", False),
        has_projects=repo_data.get("has_projects", False),
        is_private=repo_data.get("private", False),
        homepage_url=repo_data.get("homepage", ""),
        license_name=license_info.get("name", ""),
        license_url=license_info.get("url", ""),
        contributors=contributors
    )


async def _fetch_repository_metadata(session: aiohttp.ClientSession, url: str) -> dict[str, Any]:
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.json()


async def _fetch_contributors(session: aiohttp.ClientSession, url: str) -> List[Contributor]:
    async with session.get(url) as response:
        response.raise_for_status()
        contributors_data = await response.json()
        return [
            Contributor(
                name=contributor.get("login", ""),
                profile_url=contributor.get("html_url", ""),
                avatar_url=contributor.get("avatar_url", "")
            )
            for contributor in contributors_data
        ]


async def fetch_git_repository_metadata(session: aiohttp.ClientSession, repository_url: str) -> Optional[RepositoryMetadata]:
    api_url = repository_url.replace(
        "https://github.com/", "https://api.github.com/repos/")

    try:
        metadata = await _fetch_repository_metadata(session, api_url)
        contributors_url = metadata.get("contributors_url", "")
        contributors = await _fetch_contributors(session, contributors_url) if contributors_url else []
        return _parse_repository_metadata(metadata, contributors) if metadata else None
    except aiohttp.ClientError as exc:
        print(f"Client error while fetching repository metadata: {exc}")
        return None


def print_metadata(metadata: RepositoryMetadata):
    table_metadata = PrettyTable()
    table_metadata.field_names = ["Attribute", "Value"]

    table_metadata.add_row(["Name", metadata.name])
    table_metadata.add_row(["Full Name", metadata.full_name])
    table_metadata.add_row(["Owner", metadata.owner])
    table_metadata.add_row(["Owner URL", metadata.owner_url])
    table_metadata.add_row(["Description", metadata.description])
    table_metadata.add_row(["Stars Count", metadata.stars_count])
    table_metadata.add_row(["Forks Count", metadata.forks_count])
    table_metadata.add_row(["Watchers Count", metadata.watchers_count])
    table_metadata.add_row(["Open Issues Count", metadata.open_issues_count])
    table_metadata.add_row(["Default Branch", metadata.default_branch])
    table_metadata.add_row(["Created At", metadata.created_at])
    table_metadata.add_row(["Updated At", metadata.updated_at])
    table_metadata.add_row(["Pushed At", metadata.pushed_at])
    table_metadata.add_row(["Size (KB)", metadata.size_kb])
    table_metadata.add_row(["Clone URL (HTTP)", metadata.clone_url_http])
    table_metadata.add_row(["Clone URL (SSH)", metadata.clone_url_ssh])
    table_metadata.add_row(["Contributors URL", metadata.contributors_url])
    table_metadata.add_row(["Languages URL", metadata.languages_url])
    table_metadata.add_row(["Issues URL", metadata.issues_url])
    table_metadata.add_row(["Language", metadata.language])
    table_metadata.add_row(["Languages", ", ".join(metadata.languages)])
    table_metadata.add_row(["Topics", ", ".join(metadata.topics)])
    table_metadata.add_row(["Has Wiki", metadata.has_wiki])
    table_metadata.add_row(["Has Issues", metadata.has_issues])
    table_metadata.add_row(["Has Projects", metadata.has_projects])
    table_metadata.add_row(["Is Private", metadata.is_private])
    table_metadata.add_row(["Homepage URL", metadata.homepage_url])
    table_metadata.add_row(["License Name", metadata.license_name])
    table_metadata.add_row(["License URL", metadata.license_url])

    print(table_metadata)

    if metadata.contributors:
        contributors_table = PrettyTable()
        contributors_table.field_names = [
            "Contributor Name", "Profile URL", "Avatar URL"]
        for contributor in metadata.contributors:
            contributors_table.add_row(
                [contributor.name, contributor.profile_url, contributor.avatar_url])
        print(contributors_table)


async def main(repository_url: str):
    async with aiohttp.ClientSession() as session:
        metadata = await fetch_git_repository_metadata(session, repository_url)
        if metadata:
            print_metadata(metadata)

await main(repository_url)

<!-- @format -->

### Ignore List


In [126]:
ignore_list = [
    # General
    '.git',            # Git repository metadata
    'node_modules',    # Node.js modules
    '.idea',           # JetBrains IDE project files
    '.vscode',         # Visual Studio Code settings
    '__pycache__',     # Python bytecode cache
    '.DS_Store',       # macOS directory metadata
    '.env',            # Environment variable files
    'venv',            # Python virtual environment
    'build',           # Build output directories
    'dist',            # Distribution directories
    'target',          # Output from Java and Rust builds
    '.pytest_cache',   # Pytest cache files
    '*.log',           # Log files
    '*.tmp',           # Temporary files

    # Python
    '*.pyc',           # Compiled Python files
    '.mypy_cache',     # Mypy type checker cache
    '.tox',            # Tox environment

    # JavaScript/Node.js
    'npm-debug.log',   # NPM debug logs
    'yarn-error.log',  # Yarn error logs
    '.parcel-cache',   # Parcel bundler cache
    'coverage',        # Code coverage reports
    '.next',           # Next.js build directory
    'out',             # Output directory for Next.js

    # Java
    '*.class',         # Compiled Java classes
    '*.jar',           # JAR files
    '*.war',           # WAR files
    '.settings',       # Eclipse settings
    '.classpath',      # Eclipse classpath
    '.project',        # Eclipse project file

    # C/C++
    '*.o',             # Object files
    '*.a',             # Static libraries
    '*.so',            # Shared libraries
    '*.out',           # Executable files
    '*.exe',           # Windows executables
    'CMakeFiles',      # CMake build files
    'CMakeCache.txt',  # CMake cache
    '*.dSYM',          # macOS debug symbols
    '*.pdb',           # Windows debug symbols

    # Rust
    '*.rlib',          # Rust libraries
    'Cargo.lock',      # Cargo lock file

    # Go
    'bin',             # Binary output directory
    'pkg',             # Package output directory
    '*.test',          # Go test binaries
    'vendor',          # Vendor directory (if not used)

    # Ruby
    '.bundle',         # Bundler directory
    'vendor/bundle',   # Bundled gems
    'log',             # Log files
    'tmp',             # Temporary files
    '.gem',            # RubyGems metadata

    # PHP
    'vendor',          # Composer dependencies
    '.phpunit.result.cache',  # PHPUnit result cache

    # Android
    '.gradle',         # Gradle files
    '*.apk',           # Android package
    '*.ap_ ',          # Android resources package
    'local.properties',  # Android SDK settings

    # .NET/C#
    'bin',             # Binary output directory
    'obj',             # Object files directory
    '*.dll',           # DLL files
    '*.user',          # User settings
    'packages',        # NuGet packages

    # LaTeX
    '*.aux',           # Auxiliary files
    '*.toc',           # Table of contents
    '*.out',           # Auxiliary output files
    '*.synctex.gz',    # SyncTeX file
    '*.fls',           # LaTeX build files
    '*.fdb_latexmk',   # LaTeX build files
]

# Specify the file extensions to ignore
# Specify the file extensions to ignore
ignore_extensions = [
    # Image formats
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.tiff', '.webp', '.heif', '.heic', '.ico', '.raw', '.psd',

    # Audio formats
    '.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.aiff', '.alac', '.pcm',

    # Video formats
    '.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.m4v', '.mpg', '.mpeg', '.3gp', '.ogv', '.rm', '.swf'
]
api_additional_extensions =  [
    # Text and document formats
    '.copy', '.local', '.json', '.config', '.md', '.txt', '.log', '.yml', '.yaml', '.xml', '.ini', '.pdf', '.csv', '.tsv',

    # Font formats
    '.woff', '.woff2', '.ttf', '.eot', '.otf',

    # Configuration and map files
    '.config.ts', '.map', '.lock',

    # Styling files
    '.css', '.scss', '.sass', '.less', '.styl', '.pcss', '.postcss'
]
api_ignore_extensions = ignore_extensions + api_additional_extensions
specific_ignores_api = ['.gitignore', '.config.js','.config.ts']

<!-- @format -->

### Folder Structure


In [None]:
import subprocess
import os
from pathlib import Path
from typing import List, Optional
import asyncio


def print_folder_structure(dir_path: Path, level: int = -1, limit_to_directories: bool = False, length_limit: int = 1000, ignore_list: List[str] = None) -> List[str]:
    """Generate a visual tree structure of the directory contents.

    Args:
        dir_path (Path): The root directory to start the tree from.
        level (int, optional): The depth of recursion. Defaults to -1 (no limit).
        limit_to_directories (bool, optional): If True, only directories are listed. Defaults to False.
        length_limit (int, optional): Limits the number of lines output. Defaults to 1000.
        ignore_list (List[str], optional): A list of directory or file names to ignore. Defaults to None.

    Returns:
        List[str]: A list of strings representing the directory tree structure.
    """
    space = '    '
    branch = '│   '
    tee = '├── '
    last = '└── '
    dir_path = Path(dir_path)  # Ensure dir_path is a Path object
    files = 0
    directories = 0
    output = []

    if ignore_list is None:
        ignore_list = []

    def inner(dir_path: Path, prefix: str = '', level: int = -1):
        nonlocal files, directories
        if level == 0:
            return  # Stop recursion if level is 0
        if limit_to_directories:
            contents = [d for d in dir_path.iterdir() if d.is_dir()
                        and d.name not in ignore_list]
        else:
            contents = [d for d in dir_path.iterdir()
                        if d.name not in ignore_list]
        pointers = [tee] * (len(contents) - 1) + [last]
        for pointer, path in zip(pointers, contents):
            if path.is_dir():
                output.append(prefix + pointer + path.name + "/")
                directories += 1
                extension = branch if pointer == tee else space
                inner(path, prefix=prefix + extension, level=level - 1)
            elif not limit_to_directories:
                output.append(prefix + pointer + path.name)
                files += 1

    # Add the root directory name
    output.append(dir_path.name + "/")
    # Create an iterator from the inner function
    inner(dir_path, level=level)
    # Limit the output by length_limit
    if len(output) > length_limit:
        output = output[:length_limit]
        output.append(f'... length_limit, {length_limit}, reached, counted:')
    # Add the summary of directories and files
    output.append(f'\n{directories} directories' +
                  (f', {files} files' if files else ''))

    return output


async def clone_github_repo(repository_url: str) -> Optional[str]:
    repo_name = repository_url.split('/')[-1]

    if not os.path.exists(repo_name):
        print(f"Cloning repository from {repository_url}...")
        try:
            subprocess.run(['git', 'clone', repository_url], check=True)
            print(f"Repository cloned into {repo_name}/")
            return repo_name
        except subprocess.CalledProcessError as e:
            print(f"Error cloning repository: {e}")
            return None
    else:
        print(
            f"Repository folder '{repo_name}' already exists. Skipping clone.")
        return repo_name

# Clone the repository
repo_name = await clone_github_repo(repository_url=repository_url)

if repo_name:
    # Print the folder structure
    folder_structure = print_folder_structure(
        dir_path=Path(repo_name),
        ignore_list=ignore_list
    )
    folder_structure_str = "\n".join(folder_structure)
    folder_structure_markdown = (
        "# Folder Structure\n" +
        "```sh\n" +
        folder_structure_str + "\n" +
        "```"
        )
    print(folder_structure_markdown)
else:
    print("Repository cloning failed or was skipped.")

<!-- @format -->

### Summary Generation


In [None]:
import os
from pathlib import Path
from typing import List, Dict
from prettytable import PrettyTable
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.components.model_client import OllamaClient
from tqdm import tqdm

summary_template = r"""<SYS>
You are a summarization assistant specialized in coding files.
</SYS>
Please summarize the following code:
{{input_str}}
Summary:"""


class SummaryQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=summary_template,
        )

    def call(self, input: str) -> str:
        return self.generator.call({"input_str": input})

    async def acall(self, input: str) -> str:
        return await self.generator.acall({"input_str": input})


def generate_summary(path: Path, ignore_list: List[str], qa_component: SummaryQA, ignore_extensions: List[str]) -> List[Dict[str, str]]:
    """Generate a summary of files in the given path using the model."""
    summary = []
    files_to_process = []

    for root, dirs, files in os.walk(path):
        # Get relative path for the current directory
        relative_root = os.path.relpath(root, path)

        # Check if the directory should be ignored
        if any(ignored in relative_root.split(os.sep) for ignored in ignore_list):
            continue

        if relative_root == '.':
            summary.append({"file": "Modules", "description": "."})
        else:
            summary.append(
                {"file": relative_root, "description": "Not a File"})

        # List files in the current directory
        for file in files:
            file_path = Path(root) / file

            # Check if the file should be ignored
            if any(ignored in file_path.parts for ignored in ignore_list):
                continue

            # Check if the file has an extension that should be skipped
            if file_path.suffix.lower() in ignore_extensions:
                continue

            files_to_process.append(file_path)

    # Use tqdm to display progress
    for file_path in tqdm(files_to_process, desc=f"Processing files-{file_path}", unit="file"):
        try:
            # Read file content
            with open(file_path, 'r') as f:
                file_content = f.read()

            # Generate summary using the model
            summary_text = qa_component.call(file_content)
            summary.append({"file": file_path, "description": summary_text})
        except Exception as e:
            summary.append(
                {"file": file_path, "description": f"Error processing file: {str(e)}"})

    return summary


# Create the QA component
model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}
qa = SummaryQA(**model)


if repo_name:
    path = Path(repo_name)
    if not path.is_dir():
        print(f"The path {path} is not a directory.")
    summary = generate_summary(path, ignore_list=ignore_list,
                               qa_component=qa, ignore_extensions=ignore_extensions)
    table_summary = PrettyTable()
    table_summary.field_names = ["File", "Description"]

    for item in summary:
        table_summary.add_row([item["file"], item["description"]])

    print(table_summary)
else:
    print("Repository cloning failed or was skipped.")

In [None]:
import pandas as pd
from pathlib import Path
from prettytable import PrettyTable

# Sample function to check if an object has the `data` attribute


def get_description_data(description):
    if hasattr(description, 'data'):
        return description.data
    return description


if summary:
    # Initialize PrettyTable
    table_summary = PrettyTable()
    table_summary.field_names = ["File", "Description"]

    # Create a list to hold data for the DataFrame
    data_for_excel = []

    for item in summary:
        description_data = get_description_data(item["description"])
        table_summary.add_row([item["file"], description_data])
        data_for_excel.append(
            {"File": item["file"], "Description": description_data})

    # Print the PrettyTable
    print(table_summary)

    # Convert the list to a DataFrame
    df_summary = pd.DataFrame(data_for_excel)

    # Define the path and name for the Excel file
    excel_path = 'summary.xlsx'

    # Save DataFrame to an Excel file
    df_summary.to_excel(excel_path, index=False, engine='openpyxl')
    print(f"Summary saved to {excel_path}")

In [None]:
from prettytable import PrettyTable

# Sample function to check if an object has the `data` attribute


def get_description_data(description):
    if hasattr(description, 'data'):
        return description.data
    return description


def is_empty_or_error(description):
    description_str = get_description_data(
        description).strip() if isinstance(description, str) else ""
    return not description_str or "HTTP error 401" in description_str


if summary:
    # Variable to store rows before adding to the table
    blank_error_summary = []

    for item in summary:
        description_data = get_description_data(item["description"])
        if is_empty_or_error(description_data):
            # Save the row data into a variable
            row = [item["file"], description_data]
            blank_error_summary.append(row)

    # Initialize PrettyTable
    retry_table = PrettyTable()
    retry_table.field_names = ["File", "Description"]

    # Add the saved rows to the table
    for row in blank_error_summary:
        retry_table.add_row(row)

    # Print the PrettyTable
    print(retry_table)

else:
    print("NO SUMMARY DATA AVAILABLE")

In [None]:
def generate_summary_for_file(file_path: Path, qa_component: SummaryQA, existing_summaries: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Generate or update a summary for a single file using the model."""
    file_name = file_path.name
    updated = False

    # Check if the file summary already exists
    for summary in existing_summaries:
        if summary["file"] == file_name:
            updated = True
            break

    if not updated:
        existing_summaries.append({"file": file_name, "description": ""})

    try:
        # Read file content
        with open(file_path, 'r') as f:
            file_content = f.read()

        # Generate summary using the model
        summary_text = qa_component.call(file_content)

        # Update the summary in the list
        for summary in existing_summaries:
            if summary["file"] == file_name:
                summary["description"] = summary_text
                break
    except Exception as e:
        for summary in existing_summaries:
            if summary["file"] == file_name:
                summary["description"] = f"Error processing file {file_path}: {str(e)}"
                break

    return existing_summaries


if blank_error_summary:
    # Initialize an empty list to store summaries
    summaries = []

    # Prompt the user for a file path
    user_input = input(

        "Please enter the path to the file you want to summarize: ")
    file_path = Path(user_input)

    if file_path.is_file():
        # Generate or update the summary for the specified file
        summaries = generate_summary_for_file(file_path, qa, summaries)

        # Print the summary using PrettyTable
        table_summary = PrettyTable()
        table_summary.field_names = ["File", "Description"]

        for summary in summaries:
            table_summary.add_row([summary["file"], summary["description"]])

        print(table_summary)
    else:
        print(f"The path {file_path} is not a valid file.")

In [None]:
if summary:
    # Combine summaries, ignoring "Not a File" or error messages
    combined_summary = " ".join([
        get_description_data(item['description'])
        for item in summary
        if get_description_data(item['description']) and get_description_data(item['description']) != "Not a File" and get_description_data(item['description']) != "." and not get_description_data(item['description']).startswith("HTTP error 401")
    ])
    print(combined_summary)

In [None]:
import os
from pathlib import Path
from typing import List, Dict
from prettytable import PrettyTable
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.components.model_client import OllamaClient
from tqdm import tqdm

project_overview_template = r"""<SYS>
You are a summarization assistant specialized in project documentation.
</SYS>
Based on the provided file summaries:
{{input_str}},

Generate a concise and descriptive one-paragraph overview of the project, including:
1. What the project is about (2 sentences).
2. What the project does(more than 3 sentences).
3. The technologies used.
4. The key features of the project.

Dont add predescription and post decription in the ans
Summary:"""


class OverviewQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=project_overview_template,
        )

    def call(self, summaries: str) -> str:
        return self.generator.call({"input_str": summaries})

    async def acall(self, summaries: str) -> str:
        return await self.generator.acall({"input_str": summaries})


def generate_project_overview(combined_summary: str, oa_component: OverviewQA) -> str:
    """Generate a concise and descriptive overview of the project based on file summaries."""

    # Generate the project overview using the model
    overview = oa_component.generator.call({"input_str": combined_summary})

    return overview


oa = OverviewQA(**model)

if combined_summary:
    # Generate the project overview
    project_overview = generate_project_overview(summary, oa_component=oa)
    project_overview = get_description_data(project_overview)
    project_overview_markdown = (
        "# Project Overview\n\n" +
        project_overview.strip().replace("\n\n", "\n\n").replace("  ", " ")
        )
    # Print the project overview
    print(project_overview_markdown)
    display(Markdown(project_overview_markdown))

In [None]:
import os
from pathlib import Path
from typing import List, Dict
from prettytable import PrettyTable
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.components.model_client import OllamaClient
from tqdm import tqdm

key_feature_template = r"""<SYS>
You are an expert computer engineer specializing in project documentation and coding, with advanced knowledge of various programming technologies.
</SYS>
Based on the provided file summaries:
{{input_str}},

Extract and list the key features (minimun 5 features) in a concise format. Each feature should include:
- Feature Name: A brief description of the feature and its significance.

Use the following format for listing the features:
- **Feature Name**: Description of the feature and its significance.

Ensure that the features are listed clearly and concisely, highlighting the most important aspects and functionalities that define the project’s value and just give me bulletin. No explaination need before and after bulletin likes "Here are the key features extracted from the provided code snippets:", "Let me know if you'd like me to help with anything else!"
Summary:
"""


class FeatureQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=key_feature_template,
        )

    def call(self, summaries: str) -> str:
        return self.generator.call({"input_str": summaries})

    async def acall(self, summaries: str) -> str:
        return await self.generator.acall({"input_str": summaries})


def generate_key_feature(combined_summary: str, fa_component: FeatureQA) -> str:

    overview = fa_component.generator.call({"input_str": combined_summary})

    return overview


fa = FeatureQA(**model)

if combined_summary:
    # Generate the project overview
    key_feature = generate_key_feature(summary, fa_component=fa)
    key_feature = get_description_data(key_feature)
    key_feature_markdown = (
        "# Key Features\n" + key_feature)
    print(key_feature_markdown)
    display(Markdown(key_feature_markdown))

In [None]:
installation_template_v1 = r"""<SYS>
You are a highly skilled software engineer with expertise in documentation and project setup. You are adept at analyzing project summaries and folder structures to create clear installation instructions.
</SYS>
Based on the following project summary and folder structure:
Summary:
{{project_summary}}

Folder Structure:
{{folder_structure}}

GitHub repo Link:
{{repo_link}}

Create a detailed installation guide that includes:
1. **Prerequisites**: List any software, tools, or environment setups required before installation (e.g., Node.js, Docker) and provide link to download or install them.
2. **Setup Instructions**: Step-by-step instructions to set up the project, including installing dependencies, configuring environment variables, and any other necessary setup.
3. **Running the Project**: Detailed commands and steps to run the project locally, including any necessary build steps or configuration commands.
4. **Troubleshooting**: Common issues that may arise during installation and how to resolve them.
Ensure that the installation guide is comprehensive and easy to follow for someone new to the project and properly format them with heading like # Getting Started, ## Prerequisites, ## Installation, ## Running the Project, ## Tests, ## Troubleshooting
Summary:"""

class InstallationQA(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=installation_template_v1,
        )

    def call(self, project_summary: str, folder_structure: str,repo_link:str) -> str:
        return self.generator.call({
            "project_summary": project_summary,
            "folder_structure": folder_structure,
            "repo_link": repo_link
        })

    async def acall(self, project_summary: str, folder_structure: str,repo_link:str) -> str:
        return await self.generator.acall({
            "project_summary": project_summary,
            "folder_structure": folder_structure,
            "repo_link": repo_link
        })


def generate_installation_guide(project_summary: str, folder_structure: str,repo_link:str, ia_component: InstallationQA) -> str:
    # Use the InstallationQA component to generate the installation guide
    installation_guide = ia_component.call(project_summary, folder_structure,repo_link)

    return installation_guide


ia = InstallationQA(**model)

if combined_summary:
    # Generate the installation guide
    installation_guide = generate_installation_guide(
        combined_summary, folder_structure_str,repository_url, ia_component=ia)
    installation_guide = get_description_data(installation_guide)

    installation_guide_markdown = ( installation_guide)
    print(installation_guide_markdown)
    display(Markdown(installation_guide_markdown))

In [149]:
import os
from pathlib import Path
from typing import List, Dict
from prettytable import PrettyTable
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.components.model_client import OllamaClient
from tqdm import tqdm

api_template = r"""<SYS>
You are an API reference extraction assistant specialized in coding files.
</SYS>
Please extract the API reference from the following code and provide the following information:
1. API endpoint
2. Purpose of the API
3. Parameters
4. Parameter types
5. Parameter descriptions
6. HTTP method

Format:

#### {Purpose of the API}

```http
  {HTTP method} {API endpoint}
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
{parameter_rows}

Example:

#### Get all items

```http
  GET /api/items
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `api_key` | `string` | **Required**. Your API key |
| `limit`   | `integer`| **Optional**. Limit the number of items |


If there is no API Reference, please return "No API Reference". No description needed in that case. Avoid asking for response like this "Let me know if you'd like me to clarify anything!" and write the Notes in third person narrative

Code:
{{input_str}}
"""


class APIReferenceExtractor(Component):
    def __init__(self, model_client: OllamaClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=api_template,
        )

    def call(self, input: str) -> str:
        return self.generator.call({"input_str": input})

    async def acall(self, input: str) -> str:
        return await self.generator.acall({"input_str": input})


def generate_api_reference(path: Path, ignore_list: List[str], api_component: APIReferenceExtractor, ignore_extensions: List[str]) -> List[Dict[str, str]]:
    """Generate an API reference of files in the given path using the model."""
    api_reference = []
    files_to_process = []

    for root, dirs, files in os.walk(path):
      relative_root = os.path.relpath(root, path)

      if any(ignored in relative_root.split(os.sep) for ignored in ignore_list):
          continue

      for file in files:
          file_path = Path(root) / file

          if any(ignored in file_path.parts for ignored in ignore_list):
              continue

          if any(ignore in file_path.name for ignore in specific_ignores_api):
              continue

          if file_path.suffix.lower() in api_ignore_extensions:
              continue

          files_to_process.append(file_path)

    for file_path in tqdm(files_to_process, desc=f"Processing files-{file_path}", unit="file"):
        try:
            with open(file_path, 'r') as f:
                file_content = f.read()

            api_text = api_component.call(file_content)
            api_reference.append(
                {"file": file_path, "api_reference": api_text})
        except Exception as e:
            api_reference.append(
                {"file": file_path, "api_reference": f"Error processing file: {str(e)}"})

    return api_reference


api_a = APIReferenceExtractor(**model)

if repo_name:
    path = Path(repo_name)
    if not path.is_dir():
        print(f"The path {path} is not a directory.")
    api_reference = generate_api_reference(path, ignore_list=ignore_list,
                                           api_component=api_a, ignore_extensions=ignore_extensions)

    # Initialize PrettyTable
    api_table = PrettyTable()
    api_table.field_names = ["File", "api_reference"]

    # Create a list to hold data for the DataFrame
    api_data_for_excel = []

    for item in api_reference:
        description_data = get_description_data(item["api_reference"])
        if "No API Reference" in description_data:
            continue
        api_table.add_row([item["file"], description_data])
        api_data_for_excel.append(
            {"File": item["file"], "api_reference": description_data})

    # Print the PrettyTable
    print(api_table)
else:
    print("Repository cloning failed or was skipped.")

Processing files-Daraz_Scraper/public/assets/images/hero-2.svg: 100%|██████████| 17/17 [00:51<00:00,  3.05s/file]

+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                   File                   |                                                                                                                  api_reference                                                                                                                  |
+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Daraz_Scraper/app/products/[id]/page.tsx |                                                                                               




In [150]:
for item in api_reference:
  if isinstance(item, dict) and "api_reference" in item:
      description_data = get_description_data(item["api_reference"])
      if "No API Reference" in description_data:
          continue
      api_table.add_row([item["file"], description_data])
      api_data_for_excel.append(
          {"File": item["file"], "api_reference": description_data})
else:
    print(f"Unexpected item format: {item}\n")

Unexpected item format: {'file': PosixPath('Daraz_Scraper/lib/action/index.ts'), 'api_reference': GeneratorOutput(data='#### Product Operations\n\n```http\nPOST /api/products/scrapeAndStoreProduct\n```\n\n| Parameter | Type     | Description                |\n| :-------- | :------- | :------------------------- |\n| `productUrl` | `string` | **Required**. The URL of the product to scrape and store |\n\n---\n\n#### Get Product by ID\n\n```http\nGET /api/products/getProductById/:productId\n```\n\n| Parameter | Type     | Description                |\n| :-------- | :------- | :------------------------- |\n| `productId` | `string` | **Required**. The ID of the product to retrieve |\n\n---\n\n#### Get All Products\n\n```http\nGET /api/products/getAllProducts\n```\n\n| Parameter | Type     | Description                |\n| :-------- | :------- | :------------------------- |\n| `limit`   | `integer`| **Optional**. Limit the number of products (default: 10) |\n\n---\n\n#### Get Similar Products

In [151]:
if api_data_for_excel:
    # Convert the list to a DataFrame
    df_api_data = pd.DataFrame(api_data_for_excel)

    # Define the path and name for the Excel file
    excel_path = 'api_reference.xlsx'

    # Save DataFrame to an Excel file
    df_api_data.to_excel(excel_path, index=False, engine='openpyxl')
    print(f"api_reference saved to {excel_path}")

api_reference saved to api_reference.xlsx


In [164]:
# Convert to string in the desired format
api_reference_markdown = "# API Reference\n"
for i, entry in enumerate(api_data_for_excel, start=1):
    file = entry["File"]
    api_reference = entry["api_reference"]
    # api_reference_markdown += f"{i}. **{file}**\n{api_reference}\n\n"
    api_reference_markdown += f"{api_reference}\n"

print(api_reference_markdown)
display(Markdown(api_reference_markdown))

# API Reference
#### Get product by ID and similar products

```http
  GET /api/product/{id}
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `{id}`    | `string` | **Required**. The product ID |

Note: There is no API reference for getting all similar products, only a specific product by ID and its similar products.
#### Update products and send email notifications

```http
POST /api/products/update-and-send-email-notifications
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The URL of the product to update            |
| `users`    | `array`  | **Optional**. An array of user emails to send email notifications to |

Note: There is only one API endpoint in this code, and it's a POST request to `/api/products/update-and-send-email-notifications`.
#### Add user email to product

```http
  POST /api/product/{productId}/a

# API Reference
#### Get product by ID and similar products

```http
  GET /api/product/{id}
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `{id}`    | `string` | **Required**. The product ID |

Note: There is no API reference for getting all similar products, only a specific product by ID and its similar products.
#### Update products and send email notifications

```http
POST /api/products/update-and-send-email-notifications
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The URL of the product to update            |
| `users`    | `array`  | **Optional**. An array of user emails to send email notifications to |

Note: There is only one API endpoint in this code, and it's a POST request to `/api/products/update-and-send-email-notifications`.
#### Add user email to product

```http
  POST /api/product/{productId}/add-email
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `email`   | `string` | **Required**. Email address of the user |
| `productId`| `string`| **Required**. ID of the product |

Note: The `/api/product/{productId}/add-email` endpoint is not a standard API endpoint, but rather a custom endpoint created by the `addUserEmailToProduct` function in the code.
#### Scrape and store product information from Daraz URL

```http
POST /api/scrapeProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The Daraz product URL to scrape |
#### Scrape Daraz Product Information

```http
GET /api/scrape-daraz-product
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. URL of the product to scrape |
| `username` | `string` | **Required**. Bright Data username |
| `password` | `string` | **Required**. Bright Data password |
| `port`     | `integer`| **Required**. Proxy port number |
#### Product Operations

```http
POST /api/products/scrapeAndStoreProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productUrl` | `string` | **Required**. The URL of the product to scrape and store |

---

#### Get Product by ID

```http
GET /api/products/getProductById/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to retrieve |

---

#### Get All Products

```http
GET /api/products/getAllProducts
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `limit`   | `integer`| **Optional**. Limit the number of products (default: 10) |

---

#### Get Similar Products

```http
GET /api/products/getSimilarProducts/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to find similar products for |

---

#### Add User Email to Product

```http
POST /api/products/addUserEmailToProduct/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to add a user email to |
| `userEmail`  | `string` | **Required**. The email address of the user to add |

---

#### Note:
The code provided contains API endpoints for performing various operations related to products, such as scraping and storing products, retrieving products by ID or all products, finding similar products, and adding user emails to products.
#### Get product by ID and similar products

```http
  GET /api/product/{id}
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `{id}`    | `string` | **Required**. The product ID |

Note: There is no API reference for getting all similar products, only a specific product by ID and its similar products.
#### Update products and send email notifications

```http
POST /api/products/update-and-send-email-notifications
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The URL of the product to update            |
| `users`    | `array`  | **Optional**. An array of user emails to send email notifications to |

Note: There is only one API endpoint in this code, and it's a POST request to `/api/products/update-and-send-email-notifications`.
#### Add user email to product

```http
  POST /api/product/{productId}/add-email
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `email`   | `string` | **Required**. Email address of the user |
| `productId`| `string`| **Required**. ID of the product |

Note: The `/api/product/{productId}/add-email` endpoint is not a standard API endpoint, but rather a custom endpoint created by the `addUserEmailToProduct` function in the code.
#### Scrape and store product information from Daraz URL

```http
POST /api/scrapeProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The Daraz product URL to scrape |
#### Scrape Daraz Product Information

```http
GET /api/scrape-daraz-product
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. URL of the product to scrape |
| `username` | `string` | **Required**. Bright Data username |
| `password` | `string` | **Required**. Bright Data password |
| `port`     | `integer`| **Required**. Proxy port number |
#### Product Operations

```http
POST /api/products/scrapeAndStoreProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productUrl` | `string` | **Required**. The URL of the product to scrape and store |

---

#### Get Product by ID

```http
GET /api/products/getProductById/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to retrieve |

---

#### Get All Products

```http
GET /api/products/getAllProducts
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `limit`   | `integer`| **Optional**. Limit the number of products (default: 10) |

---

#### Get Similar Products

```http
GET /api/products/getSimilarProducts/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to find similar products for |

---

#### Add User Email to Product

```http
POST /api/products/addUserEmailToProduct/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to add a user email to |
| `userEmail`  | `string` | **Required**. The email address of the user to add |

---

#### Note:
The code provided contains API endpoints for performing various operations related to products, such as scraping and storing products, retrieving products by ID or all products, finding similar products, and adding user emails to products.


In [180]:

def generate_contributing_guide(repo_link):
    # Extract the username and repository name from the link
    import re
    match = re.match(r'https://github.com/([^/]+)/([^/]+)', repo_link)
    if not match:
        raise ValueError("Invalid GitHub repository link")

    username, repo_name = match.groups()

    # Define the guide with placeholders for URLs
    guide_template = f"""
# Contributing

Contributions are welcome! Here are several ways you can contribute:

- **[Submit Pull Requests](https://github.com/{username}/{repo_name}/pulls)**: Review open PRs, and submit your own PRs.
- **[Join the Discussions](https://github.com/{username}/{repo_name}/discussions)**: Share your insights, provide feedback, or ask questions.
- **[Report Issues](https://github.com/{username}/{repo_name}/issues)**: Submit bugs found or log feature requests for {repo_name}.

### Contributing Guidelines

1. **Fork the Repository**:
  - Start by forking the project repository to your GitHub account.
2. **Clone the Repository**:
  - Clone your forked repository to your local machine using the command:
  ```sh
  git clone https://github.com/your-username/{repo_name}.git
  ```
  - Replace your-username and repository-name with your GitHub username.
4. **Create a New Branch**:
  - Create a new branch for your changes using the command:
  ```sh
  git checkout -b your-branch-name
  ```
5. **Make Your Changes**:
  - Edit, add, or delete files as needed. Ensure your changes align with the project's contribution guidelines.
6. **Commit Your Changes**: Commit with a clear message describing your updates.
  - Stage your changes and commit them with a descriptive message:
    ```bash
    git add .
    git commit -m "Your descriptive message"
    ```
7. **Push Your Changes:**
  - Push your branch to your forked repository:
    ```bash
    git push origin your-branch-name
    ```
8. **Create a Pull Request (PR):**
  - Go to the original repository on GitHub and click “Compare & pull request.” Provide a clear description of the changes and submit the PR.

Once your PR is reviewed and approved, it will be merged into the main branch.
        """

    return guide_template
contribution_markdown=generate_contributing_guide(repository_url)
print(contribution_markdown)
display(Markdown(contribution_markdown))



# Contributing

Contributions are welcome! Here are several ways you can contribute:

- **[Submit Pull Requests](https://github.com/Eemayas/Daraz_Scraper/pulls)**: Review open PRs, and submit your own PRs.
- **[Join the Discussions](https://github.com/Eemayas/Daraz_Scraper/discussions)**: Share your insights, provide feedback, or ask questions.
- **[Report Issues](https://github.com/Eemayas/Daraz_Scraper/issues)**: Submit bugs found or log feature requests for Daraz_Scraper.

### Contributing Guidelines

1. **Fork the Repository**: 
  - Start by forking the project repository to your GitHub account.
2. **Clone the Repository**:
  - Clone your forked repository to your local machine using the command:
  ```sh
  git clone https://github.com/your-username/Daraz_Scraper.git
  ```
  - Replace your-username and repository-name with your GitHub username.
4. **Create a New Branch**:
  - Create a new branch for your changes using the command:
  ```sh
  git checkout -b your-branch-name
  ```
5.


# Contributing

Contributions are welcome! Here are several ways you can contribute:

- **[Submit Pull Requests](https://github.com/Eemayas/Daraz_Scraper/pulls)**: Review open PRs, and submit your own PRs.
- **[Join the Discussions](https://github.com/Eemayas/Daraz_Scraper/discussions)**: Share your insights, provide feedback, or ask questions.
- **[Report Issues](https://github.com/Eemayas/Daraz_Scraper/issues)**: Submit bugs found or log feature requests for Daraz_Scraper.

### Contributing Guidelines

1. **Fork the Repository**: 
  - Start by forking the project repository to your GitHub account.
2. **Clone the Repository**:
  - Clone your forked repository to your local machine using the command:
  ```sh
  git clone https://github.com/your-username/Daraz_Scraper.git
  ```
  - Replace your-username and repository-name with your GitHub username.
4. **Create a New Branch**:
  - Create a new branch for your changes using the command:
  ```sh
  git checkout -b your-branch-name
  ```
5. **Make Your Changes**: 
  - Edit, add, or delete files as needed. Ensure your changes align with the project's contribution guidelines.
6. **Commit Your Changes**: Commit with a clear message describing your updates.
  - Stage your changes and commit them with a descriptive message:
    ```bash
    git add .
    git commit -m "Your descriptive message"
    ```
7. **Push Your Changes:**
  - Push your branch to your forked repository:
    ```bash
    git push origin your-branch-name
    ```
8. **Create a Pull Request (PR):**
  - Go to the original repository on GitHub and click “Compare & pull request.” Provide a clear description of the changes and submit the PR.

Once your PR is reviewed and approved, it will be merged into the main branch.
        

In [181]:
license_markdown="""
# License

This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details.

"""

<!-- @format -->

# Markdown


In [182]:
# markdown_template = r"""<SYS>
# You are an assistant that specializes in converting text into Markdown format.
# </SYS>
# Please convert the following into a Markdown formatted document:
# {{input_str}}
# Dont have predescription like "Here is the converted Markdown formatted document:\n\n" and
# Markdown:"""


# class MarkdownConverter(Component):
#     def __init__(self, model_client: OllamaClient, model_kwargs: dict):
#         super().__init__()
#         self.generator = Generator(
#             model_client=model_client,
#             model_kwargs=model_kwargs,
#             template=markdown_template,
#         )

#     def call(self, input: str) -> str:
#         return self.generator.call({"input_str": input})

#     async def acall(self, input: str) -> str:
#         return await self.generator.acall({"input_str": input})


# markdown_converter = MarkdownConverter(**model)

# # Convert the project overview to Markdown using the LLM
# key_feature_markdown_output = markdown_converter.call(key_feature)

# # Print the Markdown output
# print(get_description_data(key_feature_markdown_output))

In [183]:
# # Convert the project overview to Markdown using the LLM
# api_reference_markdown_output = markdown_converter.call(
#     api_reference)

# # Print the Markdown output
# print(get_description_data(api_reference_markdown_output))

In [188]:
combined_markdown = project_overview_markdown+"\n\n---\n" +\
    key_feature_markdown+"\n\n---\n" +\
    folder_structure_markdown+"\n\n---\n" +\
    installation_guide_markdown+"\n\n---\n" +\
    api_reference_markdown+"\n\n---\n"+\
    contribution_markdown+"\n\n---\n"+\
    license_markdown+"\n\n---\n"+\
    ""
print(combined_markdown)
display(Markdown(combined_markdown))

# Project Overview

The project is a product data scraper and database management system that utilizes Mongoose for MongoDB interaction, Next.js caching library, Daraz scraper, and Email sending using Nodemailer to collect and store product information. The system is designed to scrape products from Daraz, update price history, retrieve individual products, return all stored products, find similar products, and add user emails with welcome email notifications. This project showcases an efficient integration of multiple technologies, including Mongoose for MongoDB interaction, Next.js caching library, Daraz scraper, and Email sending using Nodemailer, to create a comprehensive product data management system featuring key features such as product scraping, price history updates, and personalized user engagement through email notifications.

---
# Key Features
* **Product Model Creation**: A Mongoose model is created to store product data with various fields related to pricing, ratings, r

# Project Overview

The project is a product data scraper and database management system that utilizes Mongoose for MongoDB interaction, Next.js caching library, Daraz scraper, and Email sending using Nodemailer to collect and store product information. The system is designed to scrape products from Daraz, update price history, retrieve individual products, return all stored products, find similar products, and add user emails with welcome email notifications. This project showcases an efficient integration of multiple technologies, including Mongoose for MongoDB interaction, Next.js caching library, Daraz scraper, and Email sending using Nodemailer, to create a comprehensive product data management system featuring key features such as product scraping, price history updates, and personalized user engagement through email notifications.

---
# Key Features
* **Product Model Creation**: A Mongoose model is created to store product data with various fields related to pricing, ratings, reviews, and product information.
* **Pricing and Rating Features**: The product model includes fields for current and original prices, price history, discount rate, description, ratings and reviews counts, product quantity, stars, and out-of-stock status.
* **Scraping Functionality**: Five functions are exported to interact with a MongoDB database using Mongoose and a Next.js application, enabling scraping of product data from Daraz.
* **Email Notification Feature**: The code includes functionality to send email notifications to users for products they have shown interest in.
* **Product Retrieval and Filtering**: Functions are provided to retrieve a product by its ID, return all products stored in the database, and find similar products based on a given product's ID.
* **User Management**: A feature is included to add a user's email to a product's list of users and send a welcome email.

---
# Folder Structure
```sh
Daraz_Scraper/
├── app/
│   ├── favicon.ico
│   ├── products/
│   │   └── [id]/
│   │       └── page.tsx
│   ├── page.tsx
│   ├── api/
│   │   └── cron/
│   │       └── route.ts
│   ├── layout.tsx
│   └── globals.css
├── types/
│   └── index.ts
├── package.json
├── vercel.json
├── next.config.js
├── README.md
├── components/
│   ├── Modal.tsx
│   ├── HeroCarousel.tsx
│   ├── Navbar.tsx
│   ├── PriceInfoCard.tsx
│   ├── Searchbar.tsx
│   └── ProductCard.tsx
├── tsconfig.json
├── .gitignore
├── lib/
│   ├── scrapper/
│   │   └── index.ts
│   ├── nodemailer/
│   │   └── index.ts
│   ├── models/
│   │   └── product.model.ts
│   ├── action/
│   │   └── index.ts
│   ├── utils.ts
│   └── mongoose.ts
├── tailwind.config.ts
├── package-lock.json
├── public/
│   ├── demo/
│   │   ├── track-product.png
│   │   ├── home-page.png
│   │   └── product-page.png
│   ├── vercel.svg
│   ├── assets/
│   │   ├── icons/
│   │   │   ├── check.svg
│   │   │   ├── arrow-right.svg
│   │   │   ├── user.svg
│   │   │   ├── comment.svg
│   │   │   ├── star.svg
│   │   │   ├── x-close.svg
│   │   │   ├── bookmark.svg
│   │   │   ├── chart.svg
│   │   │   ├── share.svg
│   │   │   ├── black-heart.svg
│   │   │   ├── price-tag.svg
│   │   │   ├── hand-drawn-arrow.svg
│   │   │   ├── mail.svg
│   │   │   ├── red-heart.svg
│   │   │   ├── search.svg
│   │   │   ├── logo.svg
│   │   │   ├── bag.svg
│   │   │   ├── chevron-down.svg
│   │   │   ├── arrow-down.svg
│   │   │   ├── square.svg
│   │   │   ├── arrow-up.svg
│   │   │   └── frame.svg
│   │   └── images/
│   │       ├── hero-5.svg
│   │       ├── hero-3.svg
│   │       ├── trending.svg
│   │       ├── hero-4.svg
│   │       ├── details.svg
│   │       ├── hero-1.svg
│   │       └── hero-2.svg
│   └── next.svg
├── .env.local.copy
└── postcss.config.js

17 directories, 63 files
```

---
# Getting Started
Welcome to Daraz Scraper! This guide will walk you through the process of setting up the project on your local machine.

## Prerequisites
Before installing the project, make sure you have:

* Node.js (14.x or higher) installed on your system. You can download it from [here](https://nodejs.org/en/download/).
* A code editor or IDE (Integrated Development Environment) of your choice.
* Git installed on your system to clone the repository.

## Installation

### Step 1: Clone the Repository
Clone the Daraz Scraper repository using the following command:
```bash
git clone https://github.com/Eemayas/Daraz_Scraper.git
```
Change into the project directory:
```bash
cd Daraz_Scraper
```

### Step 2: Install Dependencies

In your terminal, run the following command to install all dependencies required by the project:
```bash
npm install
```
or if you're using yarn:
```bash
yarn install
```

### Step 3: Configure Environment Variables

Create a new file named `.env` in the root directory of your project and add the following variables:

* `MONGO_URI`: Your MongoDB connection string.
* `NEXT_PUBLIC_API_KEY`: Your API key for Daraz.
* `EMAIL_HOST`: Your email host (e.g., Gmail).
* `EMAIL_PORT`: Your email port.
* `EMAIL_USER`: Your email username.
* `EMAIL_PASSWORD`: Your email password.

Example:
```makefile
MONGO_URI=mongodb://localhost:27017/
NEXT_PUBLIC_API_KEY=your-api-key-here
EMAIL_HOST=gmail.com
EMAIL_PORT=587
EMAIL_USER=your-email-username
EMAIL_PASSWORD=your-email-password
```
### Step 4: Run the Project

In your terminal, run the following command to start the Next.js development server:
```bash
npm run dev
```

or if you're using yarn:
```bash
yarn dev
```
Your project is now running on [http://localhost:3000](http://localhost:3000).

## Running the Project

To run the project, follow these steps:

1. Run `npm start` or `yarn start` to start the development server.
2. Open your web browser and navigate to [http://localhost:3000](http://localhost:3000).
3. You can now interact with the project's API endpoints using tools like Postman.

## Tests
To run tests, follow these steps:

1. Run `npm test` or `yarn test` to execute all unit tests.
2. If you want to run specific tests, use the following command: `jest <test-file-name>` (e.g., `jest api/cron/route.test.ts`).

## Troubleshooting

### Common Issues and Solutions

* **npm install** hangs:
	+ Solution: Run `npx npm-force-cache && npm install`.
* **Can't find variable: process**:
	+ Solution: Make sure you're running the command in a Node.js environment (e.g., run `node -v` to check).
* **TypeError: Cannot read property 'length' of undefined**:
	+ Solution: Check your MongoDB connection string and ensure it's correct.

If you encounter any other issues, feel free to ask on [Stack Overflow](https://stackoverflow.com/questions/tagged/daraz-scraper).

---
# API Reference
#### Get product by ID and similar products

```http
  GET /api/product/{id}
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `{id}`    | `string` | **Required**. The product ID |

Note: There is no API reference for getting all similar products, only a specific product by ID and its similar products.
#### Update products and send email notifications

```http
POST /api/products/update-and-send-email-notifications
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The URL of the product to update            |
| `users`    | `array`  | **Optional**. An array of user emails to send email notifications to |

Note: There is only one API endpoint in this code, and it's a POST request to `/api/products/update-and-send-email-notifications`.
#### Add user email to product

```http
  POST /api/product/{productId}/add-email
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `email`   | `string` | **Required**. Email address of the user |
| `productId`| `string`| **Required**. ID of the product |

Note: The `/api/product/{productId}/add-email` endpoint is not a standard API endpoint, but rather a custom endpoint created by the `addUserEmailToProduct` function in the code.
#### Scrape and store product information from Daraz URL

```http
POST /api/scrapeProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The Daraz product URL to scrape |
#### Scrape Daraz Product Information

```http
GET /api/scrape-daraz-product
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. URL of the product to scrape |
| `username` | `string` | **Required**. Bright Data username |
| `password` | `string` | **Required**. Bright Data password |
| `port`     | `integer`| **Required**. Proxy port number |
#### Product Operations

```http
POST /api/products/scrapeAndStoreProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productUrl` | `string` | **Required**. The URL of the product to scrape and store |

---

#### Get Product by ID

```http
GET /api/products/getProductById/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to retrieve |

---

#### Get All Products

```http
GET /api/products/getAllProducts
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `limit`   | `integer`| **Optional**. Limit the number of products (default: 10) |

---

#### Get Similar Products

```http
GET /api/products/getSimilarProducts/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to find similar products for |

---

#### Add User Email to Product

```http
POST /api/products/addUserEmailToProduct/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to add a user email to |
| `userEmail`  | `string` | **Required**. The email address of the user to add |

---

#### Note:
The code provided contains API endpoints for performing various operations related to products, such as scraping and storing products, retrieving products by ID or all products, finding similar products, and adding user emails to products.
#### Get product by ID and similar products

```http
  GET /api/product/{id}
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `{id}`    | `string` | **Required**. The product ID |

Note: There is no API reference for getting all similar products, only a specific product by ID and its similar products.
#### Update products and send email notifications

```http
POST /api/products/update-and-send-email-notifications
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The URL of the product to update            |
| `users`    | `array`  | **Optional**. An array of user emails to send email notifications to |

Note: There is only one API endpoint in this code, and it's a POST request to `/api/products/update-and-send-email-notifications`.
#### Add user email to product

```http
  POST /api/product/{productId}/add-email
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `email`   | `string` | **Required**. Email address of the user |
| `productId`| `string`| **Required**. ID of the product |

Note: The `/api/product/{productId}/add-email` endpoint is not a standard API endpoint, but rather a custom endpoint created by the `addUserEmailToProduct` function in the code.
#### Scrape and store product information from Daraz URL

```http
POST /api/scrapeProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. The Daraz product URL to scrape |
#### Scrape Daraz Product Information

```http
GET /api/scrape-daraz-product
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `url`      | `string` | **Required**. URL of the product to scrape |
| `username` | `string` | **Required**. Bright Data username |
| `password` | `string` | **Required**. Bright Data password |
| `port`     | `integer`| **Required**. Proxy port number |
#### Product Operations

```http
POST /api/products/scrapeAndStoreProduct
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productUrl` | `string` | **Required**. The URL of the product to scrape and store |

---

#### Get Product by ID

```http
GET /api/products/getProductById/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to retrieve |

---

#### Get All Products

```http
GET /api/products/getAllProducts
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `limit`   | `integer`| **Optional**. Limit the number of products (default: 10) |

---

#### Get Similar Products

```http
GET /api/products/getSimilarProducts/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to find similar products for |

---

#### Add User Email to Product

```http
POST /api/products/addUserEmailToProduct/:productId
```

| Parameter | Type     | Description                |
| :-------- | :------- | :------------------------- |
| `productId` | `string` | **Required**. The ID of the product to add a user email to |
| `userEmail`  | `string` | **Required**. The email address of the user to add |

---

#### Note:
The code provided contains API endpoints for performing various operations related to products, such as scraping and storing products, retrieving products by ID or all products, finding similar products, and adding user emails to products.


---

# Contributing

Contributions are welcome! Here are several ways you can contribute:

- **[Submit Pull Requests](https://github.com/Eemayas/Daraz_Scraper/pulls)**: Review open PRs, and submit your own PRs.
- **[Join the Discussions](https://github.com/Eemayas/Daraz_Scraper/discussions)**: Share your insights, provide feedback, or ask questions.
- **[Report Issues](https://github.com/Eemayas/Daraz_Scraper/issues)**: Submit bugs found or log feature requests for Daraz_Scraper.

### Contributing Guidelines

1. **Fork the Repository**: 
  - Start by forking the project repository to your GitHub account.
2. **Clone the Repository**:
  - Clone your forked repository to your local machine using the command:
  ```sh
  git clone https://github.com/your-username/Daraz_Scraper.git
  ```
  - Replace your-username and repository-name with your GitHub username.
4. **Create a New Branch**:
  - Create a new branch for your changes using the command:
  ```sh
  git checkout -b your-branch-name
  ```
5. **Make Your Changes**: 
  - Edit, add, or delete files as needed. Ensure your changes align with the project's contribution guidelines.
6. **Commit Your Changes**: Commit with a clear message describing your updates.
  - Stage your changes and commit them with a descriptive message:
    ```bash
    git add .
    git commit -m "Your descriptive message"
    ```
7. **Push Your Changes:**
  - Push your branch to your forked repository:
    ```bash
    git push origin your-branch-name
    ```
8. **Create a Pull Request (PR):**
  - Go to the original repository on GitHub and click “Compare & pull request.” Provide a clear description of the changes and submit the PR.

Once your PR is reviewed and approved, it will be merged into the main branch.
        

---

# License

This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details.



---


In [189]:
# Specify the file name
file_name = "Daraz_Scraper_Installation_Guide.md"

# Open the file in write mode with utf-8 encoding and save the content
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(str(combined_markdown))

print(f"{file_name} has been created and saved.")

Daraz_Scraper_Installation_Guide.md has been created and saved.
