In [6]:
# Show the kernel's current working directory (IPython automagic form of %pwd).
# NOTE(review): relative paths used later in the notebook presumably resolve against this directory — confirm.
pwd

'c:\\Users\\PRATEEK G\\Desktop\\Music Chatbot'

# 1. DATA LOADER

In [None]:
import os
import tempfile

import streamlit as st
from dotenv import load_dotenv
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_pinecone import PineconeEmbeddings, PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# Load environment variables (Pinecone / Azure OpenAI credentials) from a local .env file
load_dotenv(override=True)

# Streamlit app
st.title("Hindustani Music Document Analysis")

# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    # Embedding options
    embedding_option = st.selectbox(
        "Choose an embedding method",
        ("Pinecone", "Hugging Face")
    )

    # Vector store options
    vector_store_option = st.selectbox(
        "Choose a vector store",
        ("Pinecone", "Chroma")
    )

    # Streamlit re-executes this script on every widget interaction. Keep the
    # vector store in session state so the PDF is only parsed, embedded and
    # uploaded again when the file or one of the options actually changes.
    store_key = (
        uploaded_file.name,
        uploaded_file.size,
        embedding_option,
        vector_store_option,
    )
    if st.session_state.get("vector_store_key") != store_key:
        # PyPDFLoader reads from a filesystem path, while Streamlit provides an
        # in-memory upload: spill it to a temporary file first. The file is
        # closed before loading (required on Windows) and removed afterwards.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getvalue())
        try:
            docs = PyPDFLoader(tmp_pdf.name).load()
        finally:
            os.remove(tmp_pdf.name)

        # Report the uploaded file name as the source, not the temp path.
        for doc in docs:
            doc.metadata["source"] = uploaded_file.name

        # Split the text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=20
        )
        text_chunks = text_splitter.split_documents(docs)
        if not text_chunks:
            st.error("No extractable text was found in the uploaded PDF.")
            st.stop()

        if embedding_option == "Pinecone":
            model_name = 'multilingual-e5-large'
            embeddings = PineconeEmbeddings(
                model=model_name,
                pinecone_api_key=os.environ.get('PINECONE_API_KEY')
            )
        else:
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # The vector stores embed the chunks themselves in from_documents(),
        # so no separate embed_documents() pass is needed here.
        if vector_store_option == "Pinecone":
            pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
            index_name = "musicbot"

            # The index dimension must match the selected embedding model
            # (the two options produce vectors of different sizes), so measure
            # it instead of hard-coding a value.
            embedding_dimension = len(embeddings.embed_query("dimension probe"))

            if index_name not in pc.list_indexes().names():
                # Only create the index when it is missing: create_index fails
                # for an index name that already exists.
                pc.create_index(
                    name=index_name,
                    dimension=embedding_dimension,
                    metric="cosine",
                    spec=ServerlessSpec(
                        cloud='aws',
                        region='us-east-1'
                    )
                )
            elif pc.describe_index(index_name).dimension != embedding_dimension:
                st.error(
                    f"Pinecone index '{index_name}' already exists with a different "
                    f"dimension than the selected embedding model ({embedding_dimension}). "
                    "Choose the other embedding method or recreate the index."
                )
                st.stop()

            db = PineconeVectorStore.from_documents(
                documents=text_chunks,
                index_name=index_name,
                embedding=embeddings,
            )
        else:
            db = Chroma.from_documents(
                documents=text_chunks,
                embedding=embeddings,
            )

        st.session_state["vector_store"] = db
        st.session_state["vector_store_key"] = store_key

    db = st.session_state["vector_store"]

    # Query input
    query = st.text_input("Enter your query:")
    if query:
        # Show the raw retrieved chunks first
        result = db.similarity_search(query)
        st.write(result)

        # Integrate with LLM (only built once there is a question to answer,
        # which avoids pulling the prompt from the hub on every rerun)
        retriever = db.as_retriever()
        llm = AzureChatOpenAI(
            azure_deployment="gpt-4o",  # or your deployment
            api_version="2023-09-01-preview",  # or your api version
            temperature=0.6,
        )
        retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
        combine_docs_chain = create_stuff_documents_chain(
            llm, retrieval_qa_chat_prompt
        )
        retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

        # Final result
        response = retrieval_chain.invoke({"input": query})
        answer = response.get("answer", "I don't know the answer.")
        st.write(answer)

In [7]:
from langchain_community.document_loaders import PyPDFLoader

# Raw string: "\H" is not a valid escape sequence, and Python warns about
# relying on it. The path value itself is unchanged.
# NOTE(review): the backslash separator makes this path Windows-specific.
loader = PyPDFLoader(r"Data\Hindustani_Music_ThBook1_Eng.pdf")
docs = loader.load()

In [None]:
import os
from dotenv import load_dotenv

True

In [122]:
# Load variables from a .env file into the process environment; override=True
# lets values from the file replace variables that are already set.
load_dotenv(override=True)

True

# 2. TEXT SPLITTING

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded pages into chunks of at most 500 units with a 20-unit
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(docs)




AIMessage(content='Music is an art form and cultural expression that uses sound as its primary medium. It is created through the organization of elements like pitch (melody and harmony), rhythm (tempo, meter, and articulation), dynamics (loudness and softness), and timbre (tone color or quality). Music can be vocal, instrumental, or a combination of both, and it exists in a wide variety of styles and genres across different cultures and traditions.\n\nAt its core, music serves as a means of communication and emotional expression. It can evoke feelings, tell stories, inspire movement, foster connection, and provide a sense of identity or belonging. Music can be experienced individually or collectively, and it plays a significant role in rituals, celebrations, entertainment, education, and even therapy.\n\nFrom simple folk tunes to complex symphonies, music is a universal human phenomenon that transcends language and cultural barriers, yet it is deeply shaped by the context in which it i

# 3. WORD EMBEDDINGS

## a. Using Pinecone Embedding

In [133]:
from langchain_pinecone import PineconeEmbeddings
import os

# Pinecone-hosted embedding model, authenticated via the PINECONE_API_KEY env var
model_name = 'multilingual-e5-large'
embeddings1 = PineconeEmbeddings(model=model_name, pinecone_api_key=os.environ.get('PINECONE_API_KEY'))

# Embed the plain text of every chunk
text_documents = [piece.page_content for piece in text_chunks]
word_embeddings2 = embeddings1.embed_documents(text_documents)


## b. Using Hugging Face Embedding

In [134]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model loaded through Hugging Face (no API key needed)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the plain text of every chunk
text_documents = [piece.page_content for piece in text_chunks]
word_embeddings = embeddings.embed_documents(text_documents)


# CREATING VECTOR STORE

## Using Pinecone

### Initializing Vector Index

In [140]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client, authenticated via the PINECONE_API_KEY environment variable
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "musicbot"

# Guard the creation so this cell can be re-run: create_index fails when an
# index with this name already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # 384 is the vector size of sentence-transformers/all-MiniLM-L6-v2;
        # the index dimension must match the embedding model used for upload.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )


### Uploading text Chunks

In [141]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks with `embeddings` and upsert them into the Pinecone index
docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

## Using Chroma

In [155]:
from langchain_chroma import Chroma

# Build a Chroma vector store from the chunks using the same embedding model
db = Chroma.from_documents(embedding=embeddings, documents=text_chunks)

In [156]:
# Sanity-check retrieval directly against the vector database
query = "What is the origin of Hindustani Music?"
result = db.similarity_search(query)
result

[Document(id='61f17a0e-1869-4d04-8e0b-63e5c60b01d1', metadata={'page': 23, 'page_label': '24', 'source': 'Data\\Hindustani_Music_ThBook1_Eng.pdf', 'text': '13\nHindustani Music\nNotes\nIntroduction of Hindustani Music\n17. creation – to make\n18. pronounce – to produce sound verbally\n19. mixed – combined\n20. elaboration – to spread\n21. omnipresent – forever present'}, page_content='13\nHindustani Music\nNotes\nIntroduction of Hindustani Music\n17. creation – to make\n18. pronounce – to produce sound verbally\n19. mixed – combined\n20. elaboration – to spread\n21. omnipresent – forever present'),
 Document(id='2bb7bb98-bd45-4cc6-89e1-8f2f502e9d86', metadata={'page': 98, 'page_label': '99', 'source': 'Data\\Hindustani_Music_ThBook1_Eng.pdf', 'text': 'Hindustani classical music among st masses 1. Re-establishing \nof theoretical aspect of Hindusta ni music and its co-ordination \nwith practical music, setting up of proper institutions for the \ngrowth and development of music.  \n2. Pu

# Integrating with LLM

In [157]:
from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

# Chat model served from an Azure OpenAI deployment
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",  # or your deployment
    api_version="2023-09-01-preview",  # or your api version
    temperature=0.6,
)

# Retriever over the vector store plus the shared retrieval-QA prompt from the hub
retriever = db.as_retriever()
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Stuff the retrieved documents into the prompt, then wire retrieval in front of it
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)




# Final Result

In [159]:
# Ask the chain a question and print its answer, with a fallback if the key is absent
question = "Which are the oldest forms of Hindustani classical music which are still in vogue?"
response = retrieval_chain.invoke({"input": question})
answer = response.get("answer", "I don't know the answer.")
print(answer)

The oldest forms of Hindustani classical music which are still in vogue are **Dhrupad** and **Dhamar**.
