**1. Get article from Google Scholar**

1.1 Collect pdfs

In [7]:
# !pip install scholarly

In [8]:
from scholarly import scholarly
import time

# def automated_collection(topic, interval_hours=24, max_results=10):
#     """
#     Collecte automatique d'articles sur un sujet donné depuis Google Scholar.
#     Args:
#         topic (str): Le sujet de recherche.
#         interval_hours (int): L'intervalle de temps entre chaque collecte (en heures).
#         max_results (int): Le nombre maximum d'articles à collecter.
#     """
#     while True:
#         articles = collect_articles(topic, max_results)
#         print(f"Articles collectés pour le sujet '{topic}':")
#         for article in articles:
#             print(article)
#         print(f"Attente de {interval_hours} heures avant la prochaine collecte...")
#         time.sleep(interval_hours * 3600)
        

def collect_articles(topic, max_results=10):
    """
    Recherche et collecte des articles sur un sujet donné depuis Google Scholar.
    Args:
        topic (str): Le sujet de recherche.
        max_results (int): Le nombre maximum d'articles à collecter.
    Returns:
        List[dict]: Une liste de dictionnaires contenant les articles collectés.
    """
    search_query = scholarly.search_pubs(topic)
    articles = []
    for i in range(max_results):
        try:
            article = next(search_query)
            articles.append({
                "title": article.get("bib", {}).get("title", "").replace(' ', '_').replace('/', '_'),
                "authors": article.get("bib", {}).get("author", []),
                "pub_year": article.get("bib", {}).get("pub_year", ""),
                "abstract": article.get("bib", {}).get("abstract", ""),
                "num_citations": article.get("num_citations", 0),
                "pub_url": article.get("pub_url", ""),
                "eprint_url": article.get("eprint_url", ""),
                "related_articles_url": article.get("url_related_articles", ""),
                "cited_by_url": article.get("citedby_url", ""),
                "scholarbib_url": article.get("url_scholarbib", ""),
            })
        except StopIteration:
            print("Fin des résultats.")
            break
        except Exception as e:
            print(f"Erreur lors de la récupération d'un article : {e}")
            continue

    return articles


topic = "Automatic Speech Recognition"
articles = collect_articles(topic, max_results=50)
articles

[{'title': 'Automatic_speech_recognition',
  'authors': ['D Yu', 'L Deng'],
  'pub_year': '2016',
  'abstract': 'Equally important is the development of the deep learning techniques in large vocabulary  continuous speech recognition (LVCSR) powered by big data and significantly increased',
  'num_citations': 1588,
  'pub_url': 'https://link.springer.com/content/pdf/10.1007/978-1-4471-5779-3.pdf',
  'eprint_url': 'https://www.academia.edu/download/59834621/automatic_speech_recognition_a_deep_learning_approach20190622-75570-hvrme8.pdf',
  'related_articles_url': '/scholar?q=related:gDjtGYwodk8J:scholar.google.com/&scioq=Automatic+Speech+Recognition&hl=en&as_sdt=0,33',
  'cited_by_url': '/scholar?cites=5725808558443673728&as_sdt=5,33&sciodt=0,33&hl=en',
  'scholarbib_url': '/scholar?hl=en&q=info:gDjtGYwodk8J:scholar.google.com/&output=cite&scirp=0&hl=en'},
 {'title': 'Automatic_speech_recognition_and_speech_variability:_A_review',
  'authors': ['M Benzeghiba', 'R De Mori', 'O Deroo', 'S D

1.2 Process to download

In [9]:
import requests
import os

def download_pdf(url, output_dir, filename):
    """
    Downloads a PDF from the provided URL.
    Args:
        url (str): The URL to the PDF.
        output_dir (str): The directory to save the PDF.
        filename (str): The filename for the PDF.
    Returns:
        str: Path to the saved PDF.
    """
    try:
        response = requests.get(url, stream=True, timeout=10)
        if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
            os.makedirs(output_dir, exist_ok=True)
            pdf_path = os.path.join(output_dir, filename)
            with open(pdf_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded PDF: {pdf_path}")
            return pdf_path
        else:
            return None
    except Exception as e:
        print(f"Error downloading PDF from {url}: {e}")
        return None

for article in articles:
    if article["pub_url"]:
        filename = f"{article['title'].replace(' ', '_').replace('/', '_')}.pdf"
        download_pdf(article["pub_url"], output_dir="./pdfs", filename=filename)        


Downloaded PDF: ./pdfs/Literature_review_on_automatic_speech_recognition.pdf
Downloaded PDF: ./pdfs/Automatic_speech_recognition–a_brief_history_of_the_technology_development.pdf
Downloaded PDF: ./pdfs/Automatic_speech_recognition:_a_review.pdf
Downloaded PDF: ./pdfs/Recent_advances_in_end-to-end_automatic_speech_recognition.pdf
Downloaded PDF: ./pdfs/Automatic_Speech_Recognition_for_second_language_learning:_How_and_why_it_actually_works..pdf
Downloaded PDF: ./pdfs/Automatic_speech_recognition.pdf


**2. Extract content of pdfs**

In [10]:
# !pip install pdfplumber

In [11]:
import os
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF.
    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: Extracted text from the PDF.
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text


output_dir = "./pdfs"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

def update_articles_with_pdf_content(articles, pdf_dir="./pdfs"):
    """
    Updates the articles with content extracted from PDFs.
    Args:
        articles (list): List of article dictionaries.
        pdf_dir (str): Directory where the PDFs are stored.
    Returns:
        list: Articles updated with extracted content.
    """
    for article in articles:
        pdf_filename = f"{article['title']}.pdf"
        pdf_path = os.path.join(pdf_dir, pdf_filename)
        if os.path.exists(pdf_path):
            try:
                # Extract content from the PDF
                content = extract_text_from_pdf(pdf_path)
                article["content"] = content
            except Exception as e:
                print(f"Failed to extract content for {article['title']}: {e}")
                article["content"] = ""
        else:
            article["content"] = article["abstract"]

    return articles
articles = update_articles_with_pdf_content(articles, output_dir)


**3. Build embedding function with sentence-transformers/all-mpnet-base-v2**

In [12]:
# !pip install -qU "langchain-chroma>=0.1.2"

In [13]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from langchain_chroma import Chroma
from uuid import uuid4
from langchain_core.documents import Document

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
embedding_model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    clamp = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / clamp

class CustomEmbeddingFunction:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts):
        """
        Fonction utilisée pour générer des embeddings pour une liste de textes.
        """
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, max_length=200, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.tolist()

    def embed_query(self, text):
        """
        Fonction utilisée pour générer un embedding pour une seule requête.
        """
        return self.embed_documents([text])[0]

def sanitize_metadata(metadata):
    """
    Convertit les valeurs complexes des métadonnées en chaînes compatibles avec ChromaDB.
    Args:
        metadata (dict): Métadonnées originales.
    Returns:
        dict: Métadonnées avec des types compatibles (str, int, float, bool).
    """
    sanitized_metadata = {}
    for key, value in metadata.items():
        # Si la valeur est une liste ou un type complexe, la convertir en chaîne
        if isinstance(value, (list, dict)):
            sanitized_metadata[key] = str(value)
        else:
            sanitized_metadata[key] = value
    return sanitized_metadata
embedding_function = CustomEmbeddingFunction(model=embedding_model, tokenizer=tokenizer)



**4. Spliting the text**

In [14]:
def chunk_text(text, max_length=500):
    """
    Splits text into chunks of a specified maximum length.
    Args:
        text (str): Text to split.
        max_length (int): Maximum chunk length.
    Returns:
        list: List of chunks.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length):
        chunks.append(" ".join(words[i:i + max_length]))
    return chunks


**5. Initiate and populate ChromaDB**

In [15]:
vector_store = Chroma(
    collection_name='articles',
    embedding_function=embedding_function,
    persist_directory='./chroma_langchain_db'
)


def index_chunks(articles, vector_store, max_chunk_length=500):
    """
    Indexes chunks of articles in the vector database.
    Args:
        articles (list): List of article dictionaries.
        vector_store: ChromaDB instance.
        max_chunk_length (int): Maximum chunk length.
    """
    
    documents = []
    for article in articles:
        chunks = chunk_text(article['content'], max_length=max_chunk_length)
        for chunk in chunks:
            document = Document(
                id=str(uuid4()),
                page_content=chunk,
                metadata={
                    "title": article["title"],
                    "authors": article["authors"],
                    "year": article.get("pub_year", ""),
                    "source_url": article.get("pub_url", "")
                }
            )
            documents.append(document)

    vector_store.add_documents(documents=documents)
    print(f"Indexed {len(documents)} chunks in ChromaDB.")




**6. Retrieve the most relevant chunks**

In [16]:
def query_vector_store(query, vector_store, k=5):
    """
    Queries the vector store to retrieve the most relevant chunks.
    Args:
        query (str): The user query.
        vector_store: ChromaDB instance.
        k (int): Number of results to retrieve.
    Returns:
        list: List of relevant documents (chunks).
    """
    retriever = vector_store.as_retriever()
    relevant_docs = retriever.get_relevant_documents(query, k=k)
    return relevant_docs

query = "What are the latest techniques in Automatic Speech Recognition?"
relevant_chunks = query_vector_store(query, vector_store)

for doc in relevant_chunks:
    print(f"Titre: {doc.metadata['title']}")
    if 'authors' in doc.metadata:
        print(f"Auteur(s): {doc.metadata['authors']}")
    else:
        print(f"Auteur(s): {doc.metadata['author']}")
    print(f"Extrait: {doc.page_content[:200]}...")
    if 'eprint_url' in doc.metadata:
        print(f"Source: {doc.metadata['eprint_url']}")
    if 'pub_url' in doc.metadata:
        print(f"Source: {doc.metadata['pub_url']}")
    print("-" * 80)


  relevant_docs = retriever.get_relevant_documents(query, k=k)


**7. Make context and generate response**

**7.1 Build Context**

In [17]:
import subprocess
import json

def build_context(relevant_chunks):
    """
    Builds a single context string from the relevant chunks.
    Args:
        relevant_chunks (list): List of retrieved chunks.
    Returns:
        str: Combined context string.
    """
    print(f" Building context from {relevant_chunks} ...")
    context = "\n\n".join([doc.page_content for doc in relevant_chunks])
    return context


**7.2 Extract Source**

In [18]:
def extract_sources(relevant_chunks):
    """
    Extracts sources from the metadata of retrieved chunks.
    Args:
        relevant_chunks (list): List of retrieved chunks.
    Returns:
        list: List of source URLs or titles.
    """
    sources = []
    for chunk in relevant_chunks:
        if "source_url" in chunk.metadata:
            sources.append(chunk.metadata["source_url"])
        elif "title" in chunk.metadata:
            sources.append(chunk.metadata["title"])
    return list(set(sources))  # Remove duplicates

def remove_duplicate_chunks(chunks):
    """
    Removes duplicate chunks based on their content.
    Args:
        chunks (list): List of retrieved chunks.
    Returns:
        list: Filtered list of unique chunks.
    """
    seen = set()
    unique_chunks = []
    for chunk in chunks:
        if chunk.page_content not in seen:
            unique_chunks.append(chunk)
            seen.add(chunk.page_content)
    return unique_chunks


**7.3 Custom prompt**

In [19]:
import tensorflow as tf
print(tf.__version__)

# python version
import sys
print(sys.version)


2.12.0
3.11.5 (main, Sep 11 2023, 13:54:46) [GCC 11.2.0]


In [20]:

def build_custom_prompt_with_sources(query, context, sources):
    """
    Builds a custom prompt to guide the LLM to include sources in its response.
    Args:
        query (str): The user query.
        context (str): The retrieved context.
        sources (list): List of sources related to the context.
    Returns:
        str: The custom prompt.
    """
    sources_text = "\n".join([f"- {source}" for source in sources])

    prompt = f"""
    You are an expert assistant in Automatic Speech Recognition (ASR) with deep knowledge of the latest 
    machine learning techniques and their applications.

    Below, I will provide you with some **context** extracted from relevant documents, along with their **sources**. 
    Use this context and these sources to answer the **question** provided.

    ### Context:
    {context}

    ### Sources:
    {sources_text}

    ### Question:
    {query}

    ### Instructions:
    - Use the context and refer to the sources when providing your answer.
    - If you mention information derived from the context, attribute it to the corresponding source.
    - If the context is insufficient to answer the question, politely say, "I need more information to answer this question."
    - Format your response as follows:
      - Start with a short summary (1-2 sentences).
      - Provide detailed explanations in paragraphs or bullet points.
      - Include references to the sources (e.g., "According to [source]"). 

    ### Your Response:
    """
    return prompt



**7.4 Generate response**

In [21]:



import subprocess
import json


def generate_response_with_prompt(prompt):
    payload = {"model": "llama3.2", "prompt": prompt}
    json_payload = json.dumps(payload)
    curl_command = [
        "curl", "-X", "POST", "http://localhost:11434/api/generate",
        "-H", "Content-Type: application/json",
        "-d", json_payload
    ]
    result = subprocess.run(curl_command, capture_output=True, text=True)

    if result.returncode == 0:
        try:
            response_data = json.loads(result.stdout)
            return response_data.get("content", "No content returned")
        except json.JSONDecodeError:
            print("Error: Invalid JSON response")
            print("Raw Response:", result.stdout)
            return "The server returned an invalid response. Please try again later."
    else:
        print(f"API call failed: {result.stderr}")
        return f"Error: API call failed with exit code {result.returncode}."


# query = "What are the latest deep learning techniques in ASR?"
# relevant_chunks = query_vector_store(query, vector_store)
# context = build_context(relevant_chunks)

In [22]:
# Retrieve relevant chunks
query = "What are the latest deep learning techniques in ASR?"
relevant_chunks = query_vector_store(query, vector_store)
filtered_chunks = remove_duplicate_chunks(relevant_chunks)

# Build the context
context = build_context(filtered_chunks)

# Extract sources
sources = extract_sources(filtered_chunks)

# Build the custom prompt with sources
custom_prompt = build_custom_prompt_with_sources(query, context, sources)

# Print the prompt (for debugging or validation)
print("Custom Prompt with Sources:")
print(custom_prompt)


 Building context from [] ...
Custom Prompt with Sources:

    You are an expert assistant in Automatic Speech Recognition (ASR) with deep knowledge of the latest 
    machine learning techniques and their applications.

    Below, I will provide you with some **context** extracted from relevant documents, along with their **sources**. 
    Use this context and these sources to answer the **question** provided.

    ### Context:
    

    ### Sources:
    

    ### Question:
    What are the latest deep learning techniques in ASR?

    ### Instructions:
    - Use the context and refer to the sources when providing your answer.
    - If you mention information derived from the context, attribute it to the corresponding source.
    - If the context is insufficient to answer the question, politely say, "I need more information to answer this question."
    - Format your response as follows:
      - Start with a short summary (1-2 sentences).
      - Provide detailed explanations in paragr

In [23]:
response = generate_response_with_prompt(custom_prompt)

Error: Invalid JSON response
Raw Response: {"model":"llama3.2","created_at":"2024-11-23T11:03:31.646735973Z","response":"I","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:31.80510239Z","response":"'d","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:31.974969194Z","response":" be","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:32.131316247Z","response":" happy","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:32.294345208Z","response":" to","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:32.449567173Z","response":" help","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:32.605415332Z","response":" you","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:32.759710159Z","response":" with","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:32.916104307Z","response":" your","done":false}
{"model":"llama3.2","created_at":"2024-11-23T11:03:33.070902017Z","response":"

In [1]:
!streamlit run ../interface/main.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://10.41.144.189:8501[0m
[0m
2024-11-23 12:20:36.665934: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-23 12:20:36.676674: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732360836.689786  684975 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732360836.693697  684975 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for p