In [1]:
import os 
import requests
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

import re




In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
my_activeloop_org_id = "charanvardhan"
my_activeloop_dataset_name = "VoiceAssistant-embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [4]:
def get_documents_url(url):
    return [
    '/docs/huggingface_hub/guides/overview',
    '/docs/huggingface_hub/guides/download',
    '/docs/huggingface_hub/guides/upload',
    '/docs/huggingface_hub/guides/hf_file_system',
    '/docs/huggingface_hub/guides/repository',
    '/docs/huggingface_hub/guides/search',
    ]

def construct_url(base_url, relative_paath):
    return base_url + relative_paath

In [5]:
def scrape_page_content(url):

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the desired content from the page 
    text = soup.body.text.strip()

    # remove none ascii characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def scrape_all_content(base_url, relative_paths, 
                       filename):
    all_text = []
    for path in relative_paths:
        url = construct_url(base_url, path)
        text = scrape_page_content(url)
        all_text.append(text)
    

    with open(filename, 'w', encoding='utf-8') as f:
        for item in all_text:
            f.write(item + '\n')
        
    return all_text

In [6]:
# load and splitting the text
# def load_docs(root_dir, filename):
#     docs = []
#     try:
#         loader = TextLoader(os.path.join(root_dir, filename, encoding='utf-8'))
#         docs.extend(loader.load_and_split())
    
#     except Exception as e:
#         pass
    
#     return docs

def load_docs(root_dir, filename):
    docs = []
    try:
        file_path = os.path.join(root_dir, filename)
        loader = TextLoader(file_path, encoding='utf-8')  # ✅ fixed encoding
        docs = loader.load()  # ✅ use .load()
    except Exception as e:
        print(f"Error loading documents: {e}")  # ✅ helpful error message
    return docs

def split_docs(docs):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(docs)

In [7]:
def main():
    base_url = 'https://huggingface.co'

    filename = 'content.txt'
    root_dir = './'
    relative_paths = get_documents_url(base_url)
    content = scrape_all_content(base_url, relative_paths, filename)
    docs = load_docs(root_dir, filename)

    texts = split_docs(docs)

    # db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
    # db.add_documents(texts)
    os.remove(filename)
    

In [8]:
if __name__ == "__main__":
    main()

In [9]:
import openai
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from elevenlabs import generate
from streamlit_chat import message

In [10]:
TempAudioFile = "temp.wav"
audioformat = "audio/wav"



In [11]:
def load_embeddings(dataset_path):
    embeddings = OpenAIEmbeddings()
    db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)

    return db



In [12]:
def transcribe_audio_to_text(audio_file):
    try:
        with open(audio_file, "rb") as f:
            response = openai.Audio.transcribe(
                model="whisper-1",
                file=f,
                response_format="text"
            )

        return response["text"]
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None


In [18]:
def record_and_transcribe():
    audioBytes = audio_recorder()
    transcription = None
    if audioBytes:
        st.audio(audioBytes, format=audioformat)
        
        with open(TempAudioFile, "wb") as f:
            f.write(audioBytes)

        if st.button("Transcribe"):
            transcription = transcribe_audio_to_text(TempAudioFile)
            os.remove(TempAudioFile)
            display_transcription(transcription)

    return transcription

In [14]:
def display_transcription(transcription):
    if transcription:
        st.write(f"transaction: {transcription}")
        with open("audioTranscriopt.txt", "w+") as f:
            f.write(transcription)
    else:
        st.write("error transcribing audio")


def get_user_text(transcription):
    return st.text_input("", value=transcription if transcription else "", key=input)



In [15]:
def search_db(user_input, db):
    print(user_input)
    retriever = db.as_retriever()
    retriever.search_kwargs['distance_metric'] = 'cos'
    retriever.search_kwargs['fetch_k'] = 100
    retriever.search_kwargs['maximal_marginal_relevance'] = True
    retriever.search_kwargs['k'] = 10
    model = ChatOpenAI(model='gpt-3.5-turbo')
    qa = RetrievalQA.from_llm(model, retriever=retriever, return_source_documents=True)
    return qa({'query': user_input})

In [16]:
def display_conversation(history):

    for i in range(len(history['generated'])):
        message(history['past'][i], is_user=True, key=str(i) + "_user")
        message(history["generated"][i], key=str(i))

        voice = "Bella"
        text= history["generated"][i]
        audio = generate(text=text, voice=voice)
        st.audio(audio, format='audio/mp3')

In [19]:
# Main function to run the app
def main():
    # Initialize Streamlit app with a title
    st.write("# JarvisBase 🧙")
   
    # Load embeddings and the DeepLake database
    db = load_embeddings(dataset_path)

    # Record and transcribe audio
    transcription = record_and_transcribe()

    # Get user input from text input or audio transcription
    user_input = get_user_text(transcription)

    # Initialize session state for generated responses and past messages
    if "generated" not in st.session_state:
        st.session_state["generated"] = ["I am ready to help you"]
    if "past" not in st.session_state:
        st.session_state["past"] = ["Hey there!"]
        
    # Search the database for a response based on user input and update session state
    if user_input:
        output = search_db(user_input, db)
        print(output['source_documents'])
        st.session_state.past.append(user_input)
        response = str(output["result"])
        st.session_state.generated.append(response)

    # Display conversation history using Streamlit messages
    if st.session_state["generated"]:
        display_conversation(st.session_state)

# Run the main function when the script is executed
if __name__ == "__main__":
    main()

Deep Lake Dataset in hub://charanvardhan/VoiceAssistant-embeddings already exists, loading from the storage


2025-04-12 01:48:30.644 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
2025-04-12 01:48:30.645 Session state does not function when running a script without `streamlit run`


KeyError: 'st.session_state has no key "generated". Did you forget to initialize it? More info: https://docs.streamlit.io/library/advanced-features/session-state#initialization'