<a href="https://colab.research.google.com/github/Fatimahme/Fatimahme/blob/main/SpaceRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
import os
import pickle
import shutil
from git import Repo
from datetime import datetime
from pathlib import Path
import PyPDF2
import pandas as pd

# 📌 مسیرها
# Git URL of the repository that gets cloned.
REPO_URL = "https://github.com/nasa/cFS"
# Local directory the repository is cloned into.
LOCAL_REPO_PATH = "./cfs_repo"
# Pickle file used to cache the extracted records between runs.
DATA_FILE = "./cfs_data.pkl"

# File extensions (lower-case, including the dot) whose content is extracted.
SUPPORTED_EXTENSIONS = ['.pdf', '.csv', '.xlsx', '.xls', '.py', '.md', '.txt']

def extract_content(file_path: str, ext: str):
    """Return the textual content of *file_path*, chosen by its extension.

    Args:
        file_path: Path of the file to read.
        ext: Extension of the file, e.g. ``'.pdf'``.

    Returns:
        * ``.pdf``: the extracted text of every page, joined with newlines.
        * ``.csv`` / ``.xls`` / ``.xlsx``: the table re-serialised as CSV
          text without the index column.
        * ``.py`` / ``.txt`` / ``.md``: the file decoded as UTF-8, with
          undecodable bytes dropped.
        * any other extension: an empty string.

        If reading fails for any reason, no exception propagates; a
        ``"[Error reading <path>: <exception>]"`` string is returned instead.
    """
    spreadsheet_exts = ['.csv', '.xls', '.xlsx']
    plain_text_exts = ['.py', '.txt', '.md']

    try:
        if ext == '.pdf':
            # Pages must be read while the underlying file is still open.
            with open(file_path, 'rb') as pdf_file:
                pdf_pages = PyPDF2.PdfReader(pdf_file).pages
                page_texts = [page.extract_text() or '' for page in pdf_pages]
            return "\n".join(page_texts)

        if ext in spreadsheet_exts:
            if ext.startswith('.xls'):
                table = pd.read_excel(file_path)
            else:
                table = pd.read_csv(file_path)
            return table.to_csv(index=False)

        if ext in plain_text_exts:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as text_file:
                return text_file.read()

    except Exception as e:
        # Best effort: report the failure as the content instead of raising.
        return f"[Error reading {file_path}: {e}]"
    return ""

# 🧠 استخراج داده‌ها از ریپازیتوری
def extract_repo_data(repo_path):
    """Collect one record per supported file found under *repo_path*.

    The directory tree is walked with ``os.walk``. Files whose lower-cased
    suffix is not in ``SUPPORTED_EXTENSIONS`` are skipped.

    Returns:
        A list of dicts with the keys ``unique_id`` (1-based running number),
        ``file_path`` (relative to *repo_path*), ``extension``, ``content``
        (at most the first 20000 characters) and ``datetime`` (the time the
        record was created, formatted ``YYYY-MM-DD HH:MM:SS``).
    """
    records = []

    for folder, _, filenames in os.walk(repo_path):
        for filename in filenames:
            suffix = Path(filename).suffix.lower()
            if suffix not in SUPPORTED_EXTENSIONS:
                continue

            full_path = os.path.join(folder, filename)
            text = extract_content(full_path, suffix)
            record = {
                # Ids are consecutive, so the next one is the current count + 1.
                "unique_id": len(records) + 1,
                "file_path": os.path.relpath(full_path, repo_path),
                "extension": suffix,
                "content": text[:20000],  # size cap to limit memory use
                "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
            records.append(record)

    return records

# 🔽 کلون ریپو (فقط اگر قبلاً کلون نشده باشه)
# Clone the repository only when there is no local copy yet.
if not Path(LOCAL_REPO_PATH).exists():
    print("🔄 کلون ریپازیتوری NASA cFS...")
    Repo.clone_from(REPO_URL, LOCAL_REPO_PATH)
    print("✅ کلون انجام شد.")

# Extract the repository content once and cache it; later runs reuse the cache.
if not Path(DATA_FILE).exists():
    print("🔍 در حال استخراج فایل‌ها از ریپو...")
    repo_data = extract_repo_data(LOCAL_REPO_PATH)
    with open(DATA_FILE, "wb") as cache_out:
        pickle.dump(repo_data, cache_out)
    print("✅ استخراج انجام شد و ذخیره شد.")
else:
    with open(DATA_FILE, "rb") as cache_in:
        repo_data = pickle.load(cache_in)
    print("📁 داده‌های قبلی لود شدند.")

# Show a short preview of the first five records only.
for record in repo_data[:5]:
    print(
        f"\n📄 ID: {record['unique_id']} | File: {record['file_path']} | Type: {record['extension']}",
        f"🕒 Time: {record['datetime']}",
        f"🧾 Content (excerpt):\n{record['content'][:300]}...\n",
        sep="\n",
    )


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
🔄 کلون ریپازیتوری NASA cFS...
✅ کلون انجام شد.
🔍 در حال استخراج فایل‌ها از ریپو...
✅ استخراج انجام شد و ذخیره شد.

📄 ID: 1 | File: CONTRIBUTING.md | Type: .md
🕒 Time: 2025-06-29 08:38:35
🧾 Content (excerpt):
# Core Flight System Contributing Guide

So you'd like to contribute to cFS? Below are some guidelines for contributors to follow. Contributions come in all shapes and sizes. We appreciate your help with documentation, unit tests, framework code, continuous-integration, or simply reporting bugs and ...


📄 ID: 2 | File: README.md | Type: .md
🕒 Time: 2025-06-29 08:38:35
🧾 Content (excerpt):
[![Build Linux](https://github.com/nasa/cfs/actions/workflows/build-

In [None]:
# !pip install PyPDF2
import os
import pickle
import shutil
from git import Repo
from datetime import datetime
from pathlib import Path
import PyPDF2
import pandas as pd

# 📌 مسیرها
# Git URL of the repository that gets cloned.
REPO_URL = "https://github.com/nasa/cFE"
# Local directory the repository is cloned into.
LOCAL_REPO_PATH = "./cfe_repo"
# Pickle file used to cache the extracted records between runs.
DATA_FILE = "./cfe_data.pkl"

# File extensions (lower-case, including the dot) whose content is extracted.
SUPPORTED_EXTENSIONS = ['.pdf', '.csv', '.xlsx', '.xls', '.py', '.md', '.txt']

def extract_content(file_path: str, ext: str):
    """Read *file_path* and return its content as text, based on *ext*.

    Args:
        file_path: Path of the file to read.
        ext: Extension of the file, e.g. ``'.md'``.

    Returns:
        The page texts joined by newlines for ``.pdf``; CSV text (no index
        column) for ``.csv`` / ``.xls`` / ``.xlsx``; the UTF-8 decoded text
        (undecodable bytes ignored) for ``.py`` / ``.txt`` / ``.md``; and an
        empty string for anything else. Any exception raised while reading
        is caught and returned as ``"[Error reading <path>: <exception>]"``.
    """
    spreadsheet_exts = ['.csv', '.xls', '.xlsx']
    plain_text_exts = ['.py', '.txt', '.md']

    try:
        if ext == '.pdf':
            # Extract every page before the file handle is closed.
            with open(file_path, 'rb') as pdf_file:
                pdf_pages = PyPDF2.PdfReader(pdf_file).pages
                page_texts = [page.extract_text() or '' for page in pdf_pages]
            return "\n".join(page_texts)

        if ext in spreadsheet_exts:
            if ext.startswith('.xls'):
                table = pd.read_excel(file_path)
            else:
                table = pd.read_csv(file_path)
            return table.to_csv(index=False)

        if ext in plain_text_exts:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as text_file:
                return text_file.read()

    except Exception as e:
        # Best effort: the error text becomes the content instead of raising.
        return f"[Error reading {file_path}: {e}]"
    return ""

# 🧠 استخراج داده‌ها از ریپازیتوری
def extract_repo_data(repo_path):
    """Build the list of extracted-file records for the tree at *repo_path*.

    Every file found by ``os.walk`` whose lower-cased suffix is listed in
    ``SUPPORTED_EXTENSIONS`` yields one dict with the keys ``unique_id``
    (1-based running number), ``file_path`` (relative to *repo_path*),
    ``extension``, ``content`` (first 20000 characters at most) and
    ``datetime`` (record creation time, ``YYYY-MM-DD HH:MM:SS``).
    """
    records = []

    for folder, _, filenames in os.walk(repo_path):
        for filename in filenames:
            suffix = Path(filename).suffix.lower()
            if suffix not in SUPPORTED_EXTENSIONS:
                continue

            full_path = os.path.join(folder, filename)
            text = extract_content(full_path, suffix)
            record = {
                # Ids are consecutive, so the next one is the current count + 1.
                "unique_id": len(records) + 1,
                "file_path": os.path.relpath(full_path, repo_path),
                "extension": suffix,
                "content": text[:20000],  # size cap to limit memory use
                "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
            records.append(record)

    return records

# 🔽 کلون ریپو (فقط اگر قبلاً کلون نشده باشه)
# Clone the repository only when there is no local copy yet.
if not Path(LOCAL_REPO_PATH).exists():
    print("🔄 کلون ریپازیتوری NASA cFE...")
    Repo.clone_from(REPO_URL, LOCAL_REPO_PATH)
    print("✅ کلون انجام شد.")

# Extract the repository content once and cache it; later runs reuse the cache.
if not Path(DATA_FILE).exists():
    print("🔍 در حال استخراج فایل‌ها از ریپو...")
    repo_data = extract_repo_data(LOCAL_REPO_PATH)
    with open(DATA_FILE, "wb") as cache_out:
        pickle.dump(repo_data, cache_out)
    print("✅ استخراج انجام شد و ذخیره شد.")
else:
    with open(DATA_FILE, "rb") as cache_in:
        repo_data = pickle.load(cache_in)
    print("📁 داده‌های قبلی لود شدند.")

# Show a short preview of the first five records only.
for record in repo_data[:5]:
    print(
        f"\n📄 ID: {record['unique_id']} | File: {record['file_path']} | Type: {record['extension']}",
        f"🕒 Time: {record['datetime']}",
        f"🧾 Content (excerpt):\n{record['content'][:300]}...\n",
        sep="\n",
    )


🔄 کلون ریپازیتوری NASA cFE...
✅ کلون انجام شد.
🔍 در حال استخراج فایل‌ها از ریپو...
✅ استخراج انجام شد و ذخیره شد.

📄 ID: 1 | File: CHANGELOG.md | Type: .md
🕒 Time: 2025-06-29 08:41:01
🧾 Content (excerpt):
# Changelog

## Development Build: equuleus-rc1+dev251
- 'Fix #2651, initialize pipename buffer'
- See: <https://github.com/nasa/cFE/pull/2652>


## Development Build: equuleus-rc1+dev247
- 'Fix nasa/cFS#839, Update Workflows to Ubuntu 22.04'
- See: <https://github.com/nasa/cfe/pull/2648>


## Devel...


📄 ID: 2 | File: CONTRIBUTING.md | Type: .md
🕒 Time: 2025-06-29 08:41:01
🧾 Content (excerpt):
# Contributing Guide

Please see our [top-level contributing guide](https://github.com/nasa/cFS/blob/main/CONTRIBUTING.md) for more information on how to contribute. ...


📄 ID: 3 | File: README.md | Type: .md
🕒 Time: 2025-06-29 08:41:01
🧾 Content (excerpt):
![Static Analysis](https://github.com/nasa/cfe/workflows/Static%20Analysis/badge.svg)

# Core Flight System : Framework : Core Flight Exe

In [None]:
!pip install sentence-transformers langchain-community faiss-cpu
import pickle
import numpy as np
import faiss
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# 📂 مسیر فایل‌ها
# Input pickles written by the extraction cells, and the output locations.
CFS_DATA_FILE = "./cfs_data.pkl"
CFE_DATA_FILE = "./cfe_data.pkl"
FAISS_INDEX_PATH = "./cfs_faiss/index.faiss"
# A raw FAISS index stores vectors only. The chunk texts and their metadata
# are saved here, in index row order, so a search hit can be mapped to a file.
CHUNK_STORE_PATH = os.path.join(os.path.dirname(FAISS_INDEX_PATH), "chunks.pkl")
LOCAL_EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# 1. Load the extracted records of both repositories.
# NOTE: pickle.load can execute arbitrary code; only load files created locally.
with open(CFS_DATA_FILE, "rb") as f1, open(CFE_DATA_FILE, "rb") as f2:
    cfs_data = pickle.load(f1)
    cfe_data = pickle.load(f2)

# Combined view of both data sets.
repo_data = cfs_data + cfe_data
print(f"📄 تعداد کل فایل‌ها: {len(repo_data)}")

# 2. Split every file's content into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = []
metadata = []

# unique_id restarts at 1 in each data set, so the repository name is recorded
# as well to keep chunks from the two repositories distinguishable.
for repo_name, records in (("cFS", cfs_data), ("cFE", cfe_data)):
    for doc in records:
        for chunk in text_splitter.split_text(doc["content"]):
            documents.append(chunk)
            metadata.append({
                "repo": repo_name,
                "file_path": doc["file_path"],
                "extension": doc["extension"],
                "datetime": doc["datetime"],
                "unique_id": doc["unique_id"]
            })

print(f"✅ تعداد کل چانک‌ها: {len(documents)}")

# Without chunks there is no embedding to take the dimension from.
if not documents:
    raise ValueError("No text chunks were produced; nothing to index.")

# 3. Embed the chunks with SentenceTransformers.
embedder = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
embeddings = embedder.encode(documents, show_progress_bar=True)

# 4. Build the FAISS index and save it together with the chunk store.
vectors = np.asarray(embeddings, dtype="float32")
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

os.makedirs(os.path.dirname(FAISS_INDEX_PATH), exist_ok=True)
faiss.write_index(index, FAISS_INDEX_PATH)
with open(CHUNK_STORE_PATH, "wb") as chunk_store:
    pickle.dump({"documents": documents, "metadata": metadata}, chunk_store)
print("🎉 FAISS Index برای cFS + cFE ساخته و ذخیره شد.")


Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-se

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

🎉 FAISS Index برای cFS + cFE ساخته و ذخیره شد.


In [None]:
!pip install sentence-transformers langchain-community faiss-cpu

import pickle
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from typing import List, Dict

# ⚙️ کلاس Embedding محلی
class LocalEmbeddings(Embeddings):
    """Embeddings implementation backed by a local SentenceTransformer model."""

    def __init__(self):
        # Deferred import: sentence_transformers is only loaded when an
        # instance is actually created.
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode a batch of texts and return the vectors as nested lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return its vector as a list."""
        vector = self.model.encode(text)
        return vector.tolist()

# 📂 بارگذاری داده‌ها از cfs و cfe
# Load the extracted records of both repositories (cFS and cFE).
with open("./cfs_data.pkl", "rb") as cfs_file, open("./cfe_data.pkl", "rb") as cfe_file:
    cfs_data = pickle.load(cfs_file)
    cfe_data = pickle.load(cfe_file)

# Combined view of both data sets.
repo_data = cfs_data + cfe_data

# Split every file into chunks; each chunk carries its source file's metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = []
metadatas = []
metadata_keys = ("file_path", "extension", "datetime", "unique_id")

for item in repo_data:
    pieces = text_splitter.split_text(item["content"])
    documents.extend(pieces)
    # One fresh metadata dict per chunk, aligned with `documents`.
    metadatas.extend({key: item[key] for key in metadata_keys} for _ in pieces)

print(f"✅ تعداد چانک‌ها: {len(documents)}")

# Build the FAISS index on the first run, otherwise load the saved one.
embeddings = LocalEmbeddings()
index_path = "./faiss_index_cfs_cfe"

if not os.path.exists(index_path):
    print("⚙️ ساخت ایندکس جدید")
    vectorstore = FAISS.from_texts(documents, embeddings, metadatas=metadatas)
    vectorstore.save_local(index_path)
else:
    print("📦 لود ایندکس موجود")
    vectorstore = FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )

# 🧠 کلاس جستجوگر برای گرفتن parent documents
class ParentDocumentRetriever:
    """Map a chunk-level similarity search back to the distinct source files."""

    def __init__(self, vectorstore):
        """Store the vector store; it must provide ``similarity_search(query, k=...)``."""
        self.vectorstore = vectorstore

    def get_parent_documents(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return the metadata of up to *top_k* distinct files matching *query*.

        Twice as many chunks as requested are fetched, because several hits
        may belong to the same file. Hits are then de-duplicated in ranking
        order.

        Args:
            query: Free-text search query.
            top_k: Maximum number of distinct files to return.

        Returns:
            The metadata dicts of the first chunk seen for each distinct
            file, best match first. Fewer than *top_k* entries are returned
            when the fetched chunks cover fewer files.
        """
        hits = self.vectorstore.similarity_search(query, k=top_k * 2)

        # ``unique_id`` alone is ambiguous when records from several
        # extraction runs are indexed together (each run numbers from 1),
        # so a file is identified by its id together with its path.
        seen_files = set()
        parents: List[Dict] = []
        for hit in hits:
            meta = hit.metadata
            file_key = (meta["unique_id"], meta.get("file_path"))
            if file_key in seen_files:
                continue
            seen_files.add(file_key)
            parents.append(meta)

        return parents[:top_k]

# 🚀 تست با یک کوئری
# Smoke test: list the source files matching a sample query.
retriever = ParentDocumentRetriever(vectorstore)
query = "say every thin about AOCS according to documents"
results = retriever.get_parent_documents(query, 5)
for rank, parent in enumerate(results, start=1):
    print(f"{rank}. {parent['file_path']} (ID: {parent['unique_id']})")

✅ تعداد چانک‌ها: 636
⚙️ ساخت ایندکس جدید
1. docs/cFS_IdentifierNamingConvention.md (ID: 35)
2. docs/cFE_FunctionalRequirements.csv (ID: 34)
3. docs/cFE Application Developers Guide.md (ID: 33)
4. cFS_Framework_Corporate_CLA.pdf (ID: 6)


In [None]:
!pip install -q llama-cpp-python sentence-transformers langchain-community faiss-cpu

# ===== 1. کتابخانه‌ها =====
import os
import pickle
import textwrap
from typing import List
from llama_cpp import Llama
from IPython.display import Markdown, display
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# ===== 2. مدل LLaMA =====
# Download URL of the quantised Llama-2 chat model file.
MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
# Local path the model file is saved to and later loaded from.
MODEL_PATH = "/content/llama-2-7b-chat.Q4_K_M.gguf"

# Download the model only when it is not already present locally.
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading model...")
    # IPython shell escape; {MODEL_PATH} and {MODEL_URL} are filled in from
    # the Python variables above before wget runs.
    !wget -q --show-progress -O {MODEL_PATH} {MODEL_URL}

# ===== 3. Embedding سفارشی =====
class LocalEmbeddings(Embeddings):
    """Embeddings implementation backed by a local SentenceTransformer model."""

    def __init__(self):
        # Deferred import: sentence_transformers is only loaded when an
        # instance is actually created.
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode a batch of texts and return the vectors as nested lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return its vector as a list."""
        vector = self.model.encode(text)
        return vector.tolist()

# ===== 4. کلاس LLM سفارشی با LLaMA =====
class ChatLLM:
    """Wrapper around a llama.cpp model that sends Llama-2 chat style prompts."""

    # System instruction matching the indexed corpus (NASA cFS / cFE files).
    SYSTEM_PROMPT = (
        "You are a helpful assistant for NASA's core Flight System (cFS) and "
        "core Flight Executive (cFE) documentation. "
        "Answer concisely using ONLY the provided context."
    )

    def __init__(self):
        """Load the model file at ``MODEL_PATH``."""
        self.llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,
            n_threads=8,
            n_gpu_layers=33,
            verbose=False
        )

    def generate_response(self, prompt: str, max_tokens=256):
        """Generate a completion for *prompt* and return it stripped.

        Args:
            prompt: Retrieved context followed by the question; it is placed
                after a ``Context:`` line inside the instruction block.
            max_tokens: Upper bound on the number of generated tokens.

        Returns:
            The text of the first completion choice, without surrounding
            whitespace.
        """
        formatted_prompt = (
            "<s>[INST] <<SYS>>\n"
            f"{self.SYSTEM_PROMPT}\n"
            "<</SYS>>\n"
            "Context:\n"
            f"{prompt} [/INST]"
        )
        response = self.llm(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=0.5,
            top_p=0.9,
            # Stop on the full instruction tag only: a bare "INST" would also
            # cut off ordinary words such as "INSTALL".
            stop=["</s>", "[INST]"],
            echo=False
        )
        return response["choices"][0]["text"].strip()

# ===== 5. آماده‌سازی Embedding و Vectorstore =====
embeddings = LocalEmbeddings()
index_path = "./faiss_index_cfs_cfe"

# This cell only consumes an index built earlier; fail early when it is missing.
if not os.path.exists(index_path):
    raise FileNotFoundError("❌ FAISS index not found at the specified path.")

print("📁 Loading existing FAISS index...")
vectorstore = FAISS.load_local(
    index_path,
    embeddings,
    allow_dangerous_deserialization=True,
)

llm = ChatLLM()

# ===== 6. تابع RAG نهایی =====
def get_clean_answer(question: str, top_k=5):
    """Answer *question* from the indexed chunks and display the result.

    The *top_k* most similar chunks are fetched from ``vectorstore``, joined
    into a numbered context block and sent to ``llm`` together with the
    question. The question and the answer are rendered as Markdown.

    Returns:
        The answer wrapped to 80 columns, or a fixed "no documents" message
        when the search returns nothing.
    """
    hits = vectorstore.similarity_search(question, k=top_k)
    if not hits:
        return "❌ No relevant documents found."

    # Number the retrieved chunks from 1 in the context.
    sections = [
        f"--- Document {number} ---\n{hit.page_content}"
        for number, hit in enumerate(hits, start=1)
    ]
    context = "\n\n".join(sections)

    raw_reply = llm.generate_response(f"{context}\n\nQuestion: {question}\nAnswer:")

    # Keep only what follows the last "Answer:" marker, if the reply has one.
    answer_text = raw_reply.rpartition("Answer:")[2].strip()
    wrapped = textwrap.fill(answer_text, width=80)

    question_md = Markdown(f"**❓ Question:** {question}")
    answer_md = Markdown(f"**✅ Answer:** {wrapped}")
    display(question_md)
    display(answer_md)

    return wrapped

# ===== 7. تست نهایی =====
# Smoke test: run one sample question through the RAG pipeline.
print("✅ System Ready!\n")
# Kept as the final bare expression of the cell (a notebook normally echoes
# the value of a trailing expression).
get_clean_answer("Tell me what is the SB")

📁 Loading existing FAISS index...


In [None]:
!pip install -q llama-cpp-python sentence-transformers langchain-community faiss-cpu

import os
import textwrap
from typing import List
from llama_cpp import Llama
from IPython.display import Markdown, display
from langchain_core.embeddings import Embeddings
from langchain.vectorstores import FAISS

# ===== تنظیمات =====
# Local path of the model file that is loaded below.
MODEL_PATH = "/content/llama-2-7b-chat.Q4_K_M.gguf"
# Directory of the saved FAISS vector store that is loaded below.
INDEX_PATH = "./faiss_index_cfs_cfe"

# ===== Embedding و LLM =====
class LocalEmbeddings(Embeddings):
    """Embeddings implementation backed by a local SentenceTransformer model."""

    def __init__(self):
        # Deferred import: sentence_transformers is only loaded when an
        # instance is actually created.
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode a batch of texts and return the vectors as nested lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return its vector as a list."""
        vector = self.model.encode(text)
        return vector.tolist()

class ChatLLM:
    """Wrapper around a llama.cpp model that sends Llama-2 chat style prompts."""

    # System instruction matching the indexed corpus (NASA cFS / cFE files).
    SYSTEM_PROMPT = (
        "You are a helpful assistant for NASA's core Flight System (cFS) and "
        "core Flight Executive (cFE) documentation. "
        "Answer concisely using ONLY the provided context."
    )

    def __init__(self):
        """Load the model file at ``MODEL_PATH``."""
        self.llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,
            n_threads=8,
            n_gpu_layers=33,
            verbose=False
        )

    def generate_response(self, prompt: str, max_tokens=256):
        """Generate a completion for *prompt* and return it stripped.

        Args:
            prompt: Retrieved context followed by the question; it is placed
                after a ``Context:`` line inside the instruction block.
            max_tokens: Upper bound on the number of generated tokens.

        Returns:
            The text of the first completion choice, without surrounding
            whitespace.
        """
        formatted_prompt = (
            "<s>[INST] <<SYS>>\n"
            f"{self.SYSTEM_PROMPT}\n"
            "<</SYS>>\n"
            "Context:\n"
            f"{prompt} [/INST]"
        )
        response = self.llm(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=0.5,
            top_p=0.9,
            # Stop on the full instruction tag only: a bare "INST" would also
            # cut off ordinary words such as "INSTALL".
            stop=["</s>", "[INST]"],
            echo=False
        )
        return response["choices"][0]["text"].strip()

# ===== بارگذاری ایندکس =====
# Load the previously saved index and the model; nothing is rebuilt here.
embeddings = LocalEmbeddings()
# allow_dangerous_deserialization opts in to loading the pickled part of the
# saved store; only use it with an index created locally.
vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
llm = ChatLLM()

# ===== تابع پاسخ‌دهی =====
def ask_rag(question: str, top_k=3):
    """Answer *question* from the indexed chunks and show which files were used.

    The *top_k* most similar chunks are fetched from ``vectorstore``. Each is
    labelled with its source file in the context sent to ``llm``. The
    question, the list of source files and the answer are rendered as
    Markdown.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve.

    Returns:
        The answer wrapped to 80 columns, or a fixed "no documents" message
        when the search returns nothing.
    """
    docs = vectorstore.similarity_search(question, k=top_k)
    if not docs:
        return "❌ No relevant documents found."

    def source_of(doc):
        # Chunks are indexed with their source under "file_path"; "file_name"
        # is still honoured for stores that were built with that key.
        meta = doc.metadata
        return meta.get("file_path") or meta.get("file_name", "Unknown")

    context = "\n\n".join(
        f"--- File: {source_of(doc)} ---\n{doc.page_content}"
        for doc in docs
    )

    prompt = f"{context}\n\nQuestion: {question}\nAnswer:"
    response = llm.generate_response(prompt)
    # Keep only what follows the last "Answer:" marker, if the reply has one.
    answer = response.split("Answer:")[-1].strip()
    wrapped = textwrap.fill(answer, width=80)

    display(Markdown(f"**❓ Question:** {question}"))
    display(Markdown("**📄 Files Used:**"))
    # Several chunks may come from one file; list each file once, in rank order.
    for source in dict.fromkeys(source_of(doc) for doc in docs):
        display(Markdown(f"- {source}"))
    display(Markdown(f"**✅ Answer:** {wrapped}"))

    return wrapped

# ===== اجرا =====
# Smoke test: run one sample question through the RAG pipeline.
print("✅ System Ready (Quick Load)")
# Kept as the final bare expression of the cell, so the notebook also echoes
# the returned string after the rendered Markdown.
ask_rag("what is the most important thing about UPD?")


llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


✅ System Ready (Quick Load)


**❓ Question:** what is the most important thing about UPD?

**📄 Files Used:**

- Unknown

- Unknown

- Unknown

**✅ Answer:** As a helpful banking expert, the most important thing about UPD (User Parameters
Definition) is its ability to provide consistent and well-defined parameter
names, order, and lengths, which simplifies commanding and reduces errors during
I&T (Integration and Testing) processes.

'As a helpful banking expert, the most important thing about UPD (User Parameters\nDefinition) is its ability to provide consistent and well-defined parameter\nnames, order, and lengths, which simplifies commanding and reduces errors during\nI&T (Integration and Testing) processes.'