In [1]:
import os
from dotenv import load_dotenv
load_dotenv()


groq_api_key=os.getenv("GROQ_API_KEY")
groq_api_key

'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'

In [2]:
from langchain_groq import ChatGroq
model=ChatGroq(model="Gemma2-9b-It",groq_api_key=groq_api_key)
model

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x00000256F91A66E0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x00000256F91D00A0>, model_name='Gemma2-9b-It', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [3]:
with open("transcript.txt","r") as file:
    transcript_content=file.read() 

## Document: Internal Data Load

In [4]:
from langchain_community.document_loaders import WikipediaLoader
docs = WikipediaLoader(query="Data Analytics in Finance", load_max_docs=5).load()
len(docs)
print(docs)

[Document(metadata={'title': 'Analytics', 'summary': 'Analytics is the systematic computational analysis of data or statistics. It is used for the discovery, interpretation, and communication of meaningful patterns in data, which also falls under and directly relates to the umbrella term, data science. Analytics also entails applying data patterns toward effective decision-making. It can be valuable in areas rich with recorded information; analytics relies on the simultaneous application of statistics, computer programming, and operations research to quantify performance.\nOrganizations may apply analytics to business data to describe, predict, and improve business performance. Specifically, areas within analytics include descriptive analytics, diagnostic analytics, predictive analytics, prescriptive analytics, and cognitive analytics. Analytics may apply to a variety of fields such as marketing, management, finance, online systems, information security, and software services. Since an

In [5]:
docs

[Document(metadata={'title': 'Analytics', 'summary': 'Analytics is the systematic computational analysis of data or statistics. It is used for the discovery, interpretation, and communication of meaningful patterns in data, which also falls under and directly relates to the umbrella term, data science. Analytics also entails applying data patterns toward effective decision-making. It can be valuable in areas rich with recorded information; analytics relies on the simultaneous application of statistics, computer programming, and operations research to quantify performance.\nOrganizations may apply analytics to business data to describe, predict, and improve business performance. Specifically, areas within analytics include descriptive analytics, diagnostic analytics, predictive analytics, prescriptive analytics, and cognitive analytics. Analytics may apply to a variety of fields such as marketing, management, finance, online systems, information security, and software services. Since an

## Embeddings

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


## Vector Store

In [7]:
from langchain_chroma import Chroma

vectorstore=Chroma.from_documents(docs,embedding=embeddings)
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x256f91d0af0>

## Retreiver

In [8]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

retriever=vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k":1}
)


## Message History

In [9]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

#ChatMessageHistory: Keeps track and stored the chat messages exchanged between user and AI model
#BasechatHistory: It is like a Template/Abstract that defines the structure of history management. ChatMessageHistory builds on top of this template
#RunnableWithMessageHistory: This Wraps chat like a chatbot and save chat history for future conversations making sure past conversations influence the responses.

store={}


def get_session_history(session_id:str)->BaseChatMessageHistory:
    if session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]

with_message_history=RunnableWithMessageHistory(model,get_session_history)

In [10]:
config={"configurable":{"session_id":"chat1"}}

In [11]:
from langchain_core.messages import HumanMessage

response=with_message_history.invoke(
    [HumanMessage(content="Hi , My name is Akilesh and I am a Chief AI Engineer")],
    config=config
)

In [12]:
response.content

"Hello Akilesh! It's a pleasure to meet you.\n\nBeing a Chief AI Engineer is a fascinating role. What kind of projects are you working on these days? \n\nI'm eager to learn more about your work in the field of AI.\n"

## Prompt Template

In [13]:
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder


prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant specialing in Data Analysis. Answer all questions to the best of your ability in. There is a meeting of the Data Analysis team of Company Northeastern and there are several discussions help. The meeting Transcripts are recorded and understand the {transcript} and answer the Questions of the employee",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

chain = prompt | model

In [14]:
with_message_history=RunnableWithMessageHistory(
    chain,
    get_session_history,
    input_messages_key="messages"
)

In [15]:
employee="Akilesh"
designation="Junior Data Analyst"

config = {"configurable": {"session_id": "chat1"}}
repsonse=with_message_history.invoke(
    {'messages': [HumanMessage(content=f"""Can you summarize the meeting for {employee} who is {designation}.
                               Based on the designation refine the most important points that refine 80% essence of the conversation refined to the 
                               designation and what they will need as takeaways from the meeting. So keeping these factors generate a summary in  in 100 words
                               """)],"transcript":transcript_content},
    config=config
)
repsonse.content

"Hi Akilesh, \n\nHere's a quick summary of today's meeting focusing on your role:\n\n- **User Engagement Dashboard Delay:**  The team is working on a real-time dashboard, but data aggregation issues are causing a delay. Your focus should be on de-duplicating datasets and ensuring data integrity between Salesforce and app data.  \n\n- **Client Churn Analysis:** Your work on the churn prediction model is progressing well (85% accuracy!).  Focus on refining features by testing their correlation with churn and using techniques like SHAP values to prioritize the most impactful ones.  \n\n- **Data Alerts System:**  We're tweaking the real-time alert system for data anomalies. Be prepared to help refine thresholds and filters to reduce false positives. \n\n\nLet me know if you have any questions! \n\n"

### Chatbot: Continue chatting

In [16]:
response = with_message_history.invoke(
    {"messages": [HumanMessage(content="What all tests I should do before before pushing them to production")], "transcript":transcript_content},
    config=config,
)

response.content

"That's a great question, Akilesh!  Before pushing your data pipeline changes and churn analysis model to production, you should absolutely conduct thorough testing. Here's a breakdown:\n\n**Data Pipeline Tests:**\n\n* **Unit Tests:** Test individual components (like data extraction, transformation, and loading functions) in isolation. Make sure each step works as expected with different input data scenarios.\n* **Integration Tests:** Verify how different parts of your pipeline work together. Test the flow of data from source to destination, checking for data consistency and transformations along the way.\n* **Data Volume Tests:**  Simulate high-volume data loads to ensure your pipeline can handle peak times without performance issues or errors.\n* **Error Handling Tests:**  Test how your pipeline handles invalid data, connection errors, and other potential issues. Make sure it gracefully recovers and logs errors appropriately.\n* **Regression Tests:** After making changes, run existin

## Basic RAG Chain

In [17]:
# Step 1: Load documents from Wikipedia
from langchain_community.document_loaders import WikipediaLoader
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Load documents related to "Data Analytics in Finance" from Wikipedia
docs = WikipediaLoader(query="Data Analytics in Finance", load_max_docs=5).load()

# Step 2: Initialize the embeddings model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 3: Store the documents in a Chroma vector store using the embeddings
vectorstore = Chroma.from_documents(docs, embedding=embeddings)


In [18]:
# Step 4: Load the meeting transcript
with open("transcript.txt", "r") as file:
    transcript_content = file.read()

# Step 5: Define the prompt template for the initial summary generation
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant specializing in Data Analysis. Answer all questions to the best of your ability. There is a meeting of the Data Analysis team at Company Northeastern. The meeting transcripts are recorded. Understand the {transcript} and answer the questions of the employee.",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

# Step 6: Combine prompt with the model for generating the summary
chain = prompt | model


In [19]:
# Step 7: Set up session history using ChatMessageHistory and RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Store the session history in a dictionary
store = {}

# Define a function to get the session history
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

# Wrap the chain with the history management
with_message_history = RunnableWithMessageHistory(chain, get_session_history, input_messages_key="messages")


In [20]:
# Step 8: Define the initial user prompt
employee = "Akilesh"
designation = "Junior Data Analyst"

# employee = "Elon"
# designation = "Vice President of BI Team"

config = {"configurable": {"session_id": "chat1"}}

# Step 9: Call the model to generate the initial summary based on the transcript
response = with_message_history.invoke(
    {
        'messages': [
            HumanMessage(content=f"""Can you summarize the meeting for {employee}, who is {designation}? 
                                     Based on the designation, refine the most important points and what 
                                     they will need as takeaways from the meeting. Generate a summary in 100 words."""
            )
        ],
        'transcript': transcript_content  # Passing the meeting transcript as context
    },
    config=config
)

# Print the generated summary
print(response.content)


Hey Akilesh!  Good news - we're making good progress on the user engagement dashboard and churn analysis.  

The main hurdle is data aggregation, taking longer than expected due to increased data volume. We're working on optimizing queries and potentially shifting some processing to Snowflake.  

For churn analysis, focus on refining the machine learning models, especially feature selection. Gopi suggested using correlation tests and SHAP values to identify the most impactful features. Keep refining the real-time data anomaly alert system and aim for a polished executive summary of churn findings by next week.  




In [21]:
# Step 8: Define the initial user prompt
employee = "Riya"
designation = "Junior Data Analyst"

# employee = "Elon"
# designation = "Vice President of BI Team"

config = {"configurable": {"session_id": "chat1"}}

# Step 9: Call the model to generate the initial summary based on the transcript
response = with_message_history.invoke(
    {
        'messages': [
            HumanMessage(content=f"""Can you summarize the meeting for {employee}, who is {designation}? 
                                     Based on the designation, refine the most important points and what 
                                     they will need as takeaways from the meeting. Generate a summary in 100 words."""
            )
        ],
        'transcript': transcript_content  # Passing the meeting transcript as context
    },
    config=config
)

# Print the generated summary
print(response.content)

Hi Riya!  The meeting focused on data issues impacting the user engagement dashboard and churn analysis.  

We need to tackle redundant data pulls in our ETL process to optimize performance.  Focus on ensuring consistent timestamps across datasets, especially Salesforce and app engagement data.  For churn analysis, explore the impact of payment history and support ticket data on model accuracy.

Gopi suggested using SHAP values to understand feature importance.  We're aiming for a clear executive summary of churn findings by next week, highlighting key insights and actionable recommendations.  





In [22]:
# Step 10: Define a function to retrieve relevant documents from the vector store
from langchain_core.runnables import RunnableLambda

# Create a retriever to fetch relevant documents
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Define a function to retrieve documents and combine them into context
def retrieve_documents(query: str):
    retrieved_docs = retriever.get_relevant_documents(query)
    # Combine the content of the retrieved documents into a single string to pass as context
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])
    return context

# Create a Runnable that wraps this retriever
retriever_runnable = RunnableLambda(retrieve_documents)


## Chatbot for Akilesh

In [24]:
from operator import itemgetter

from langchain_core.runnables import RunnablePassthrough
# Step 11: Set up the new RAG chain with document retrieval and model generation
rag_chain = {
    "transcript": retriever_runnable,  # Retrieve relevant documents as 'transcript'
    "messages": RunnablePassthrough()  # Pass through the user's messages
} | prompt | model

# Step 12: Future query using document retrieval from vector store
user_query = "What all tests should I do before pushing them to production?"

# Retrieve relevant documents and pass them along with the user's query
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=user_query)],
        "transcript": retrieve_documents(user_query)  # Fetch the relevant documents from the vector store
    },
    config=config
)

# Print the chatbot's response
print(response.content)


  retrieved_docs = retriever.get_relevant_documents(query)


Before pushing any code to production, it's crucial to conduct a thorough set of tests to ensure its quality, stability, and reliability. Here's a comprehensive checklist of tests to consider:

**1. Unit Tests:**

* **Functionality:** Test individual units or functions in isolation to verify they perform as expected.
* **Edge Cases:**  Test with boundary values, invalid inputs, and unexpected scenarios to ensure robustness.
* **Performance:** Measure the execution time and resource consumption of individual units under different loads.

**2. Integration Tests:**

* **Inter-Module Communication:** Test how different components of your system interact with each other.
* **Data Flow:** Verify that data is correctly passed between modules and systems.
* **Third-Party APIs:** Test interactions with external APIs or services.

**3. End-to-End (E2E) Tests:**

* **Workflows:** Simulate complete user journeys or business processes from start to finish.
* **System Behavior:** Test the overall sy

## Chatbot for Elon

In [25]:
# Step 11: Set up the new RAG chain with document retrieval and model generation
rag_chain = {
    "transcript": retriever_runnable,  # Retrieve relevant documents as 'transcript'
    "messages": RunnablePassthrough()  # Pass through the user's messages
} | prompt | model

# Step 12: Future query using document retrieval from vector store
user_query = "What all are the key metrics on the projects progress that I can discuss with management"

# Retrieve relevant documents and pass them along with the user's query
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=user_query)],
        "transcript": retrieve_documents(user_query)  # Fetch the relevant documents from the vector store
    },
    config=config
)

# Print the chatbot's response
print(response.content)


Here's a breakdown of key project progress metrics you can discuss with management, tailored for different aspects of the project:

**1. Completion Status & Timeline:**

* **Percentage Complete:**  A straightforward measure of overall progress, expressed as a percentage.
* **Tasks Completed:** Count of completed tasks against the total planned tasks.
* **Timeline Adherence:**  
    *  Are you on schedule? 
    *  Are there any delays, and what are their causes?
* **Burn Rate:** Tracks the rate at which you're spending resources (time, budget) over time. Useful for forecasting future needs.

**2. Deliverables & Quality:**

* **Deliverables Completed:** List of tangible outputs (reports, code, prototypes, etc.) delivered on time and to spec.
* **Defect Rate:** Number of defects found divided by the total number of units tested (e.g., lines of code, features).
* **Test Coverage:** Percentage of code or functionality covered by automated tests.

**3. Team Performance & Collaboration:**

* 