In [1]:
import os
from dotenv import load_dotenv
load_dotenv()


groq_api_key=os.getenv("GROQ_API_KEY")
groq_api_key

'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'

In [2]:
from langchain_groq import ChatGroq
model=ChatGroq(model="Gemma2-9b-It",groq_api_key=groq_api_key)
model

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x00000223724CA6E0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x00000223724F80A0>, model_name='Gemma2-9b-It', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [19]:
with open("transcript.txt","r") as file:
    transcript_content=file.read() 

## Document: Internal Data Load

In [27]:
from langchain_community.document_loaders import WikipediaLoader
docs = WikipediaLoader(query="Data Analytics in Finance", load_max_docs=5).load()
len(docs)
print(docs)

[Document(metadata={'title': 'Analytics', 'summary': 'Analytics is the systematic computational analysis of data or statistics. It is used for the discovery, interpretation, and communication of meaningful patterns in data, which also falls under and directly relates to the umbrella term, data science. Analytics also entails applying data patterns toward effective decision-making. It can be valuable in areas rich with recorded information; analytics relies on the simultaneous application of statistics, computer programming, and operations research to quantify performance.\nOrganizations may apply analytics to business data to describe, predict, and improve business performance. Specifically, areas within analytics include descriptive analytics, diagnostic analytics, predictive analytics, prescriptive analytics, and cognitive analytics. Analytics may apply to a variety of fields such as marketing, management, finance, online systems, information security, and software services. Since an

In [28]:
docs

[Document(metadata={'title': 'Analytics', 'summary': 'Analytics is the systematic computational analysis of data or statistics. It is used for the discovery, interpretation, and communication of meaningful patterns in data, which also falls under and directly relates to the umbrella term, data science. Analytics also entails applying data patterns toward effective decision-making. It can be valuable in areas rich with recorded information; analytics relies on the simultaneous application of statistics, computer programming, and operations research to quantify performance.\nOrganizations may apply analytics to business data to describe, predict, and improve business performance. Specifically, areas within analytics include descriptive analytics, diagnostic analytics, predictive analytics, prescriptive analytics, and cognitive analytics. Analytics may apply to a variety of fields such as marketing, management, finance, online systems, information security, and software services. Since an

## Embeddings

In [29]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


## Vector Store

In [30]:
from langchain_chroma import Chroma

vectorstore=Chroma.from_documents(docs,embedding=embeddings)
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x22317c76ef0>

## Retreiver

In [31]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

retriever=vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k":1}
)


## Message History

In [None]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

#ChatMessageHistory: Keeps track and stored the chat messages exchanged between user and AI model
#BasechatHistory: It is like a Template/Abstract that defines the structure of history management. ChatMessageHistory builds on top of this template
#RunnableWithMessageHistory: This Wraps chat like a chatbot and save chat history for future conversations making sure past conversations influence the responses.

store={}


def get_session_history(session_id:str)->BaseChatMessageHistory:
    if session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]

with_message_history=RunnableWithMessageHistory(model,get_session_history)

In [None]:
config={"configurable":{"session_id":"chat1"}}

In [None]:
from langchain_core.messages import HumanMessage

response=with_message_history.invoke(
    [HumanMessage(content="Hi , My name is Akilesh and I am a Chief AI Engineer")],
    config=config
)

In [None]:
response.content

## Prompt Template

In [37]:
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder


prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant specialing in Data Analysis. Answer all questions to the best of your ability in. There is a meeting of the Data Analysis team of Company Northeastern and there are several discussions help. The meeting Transcripts are recorded and understand the {transcript} and answer the Questions of the employee",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

chain = prompt | model

In [38]:
with_message_history=RunnableWithMessageHistory(
    chain,
    get_session_history,
    input_messages_key="messages"
)

In [39]:
employee="Akilesh"
designation="Junior Data Analyst"

config = {"configurable": {"session_id": "chat1"}}
repsonse=with_message_history.invoke(
    {'messages': [HumanMessage(content=f"""Can you summarize the meeting for {employee} who is {designation}.
                               Based on the designation refine the most important points that refine 80% essence of the conversation refined to the 
                               designation and what they will need as takeaways from the meeting. So keeping these factors generate a summary in  in 100 words
                               """)],"transcript":transcript_content},
    config=config
)
repsonse.content

"Hey Akilesh,  \n\nGood news! Your work on the ETL process and churn analysis is important to the team. We're fixing data consistency issues, so make sure to thoroughly test your changes before going live. Double-check data flow, transformations, and accuracy.\n\nFor churn analysis, keep refining the machine learning models. Gopi can help you prioritize important features using correlation tests and SHAP values.\n\nWe need a preliminary user engagement dashboard by Friday and a churn analysis summary by next week. Keep up the great work!\n\n\n\n"

### Chatbot: Continue chatting

In [40]:
response = with_message_history.invoke(
    {"messages": [HumanMessage(content="What all tests I should do before before pushing them to production")], "transcript":transcript_content},
    config=config,
)

response.content

"Great question, Akilesh!  You want to be extra sure your ETL changes are rock solid before they go live. Here's a checklist to guide your testing:\n\n**Functionality:**\n\n* **Data Flow:**  Make sure data moves smoothly from source to destination. Test each step of your pipeline.\n* **Transformations:** Verify your data transformations are accurate. Compare outputs to expected results.\n* **Data Quality:**\n\n    * **Completeness:**  All required fields have data.\n    * **Accuracy:**  Values are correct and error-free.\n    * **Consistency:**  Data follows format and standard rules.\n\n**Performance:**\n\n* **Load Testing:** Simulate high data volumes to see how your pipeline handles peak loads.\n* **Latency Testing:** Measure how long it takes data to move through each pipeline stage.\n\n**Regression:**\n\n* **Previous Functionality:**  Test that your changes didn't break anything that was working before.\n* **Data Consistency:**  Re-run tests on historical data to ensure your chang

## Basic RAG Chain

In [46]:
# Step 1: Load documents from Wikipedia
from langchain_community.document_loaders import WikipediaLoader
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Load documents related to "Data Analytics in Finance" from Wikipedia
docs = WikipediaLoader(query="Data Analytics in Finance", load_max_docs=5).load()

# Step 2: Initialize the embeddings model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 3: Store the documents in a Chroma vector store using the embeddings
vectorstore = Chroma.from_documents(docs, embedding=embeddings)


In [47]:
# Step 4: Load the meeting transcript
with open("transcript.txt", "r") as file:
    transcript_content = file.read()

# Step 5: Define the prompt template for the initial summary generation
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant specializing in Data Analysis. Answer all questions to the best of your ability. There is a meeting of the Data Analysis team at Company Northeastern. The meeting transcripts are recorded. Understand the {transcript} and answer the questions of the employee.",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

# Step 6: Combine prompt with the model for generating the summary
chain = prompt | model


In [48]:
# Step 7: Set up session history using ChatMessageHistory and RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Store the session history in a dictionary
store = {}

# Define a function to get the session history
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

# Wrap the chain with the history management
with_message_history = RunnableWithMessageHistory(chain, get_session_history, input_messages_key="messages")


In [56]:
# Step 8: Define the initial user prompt
employee = "Akilesh"
designation = "Junior Data Analyst"

# employee = "Elon"
# designation = "Vice President of BI Team"

config = {"configurable": {"session_id": "chat1"}}

# Step 9: Call the model to generate the initial summary based on the transcript
response = with_message_history.invoke(
    {
        'messages': [
            HumanMessage(content=f"""Can you summarize the meeting for {employee}, who is {designation}? 
                                     Based on the designation, refine the most important points and what 
                                     they will need as takeaways from the meeting. Generate a summary in 100 words."""
            )
        ],
        'transcript': transcript_content  # Passing the meeting transcript as context
    },
    config=config
)

# Print the generated summary
print(response.content)


Hi Akilesh,

Here's a summary of today's meeting focusing on points relevant to you as a Junior Data Analyst:

**Churn Analysis:**

* **Great progress!** Your work on the churn prediction model is progressing well, reaching 85% accuracy. 
* **Feature importance:**  Focus on refining features for the model. Test and prioritize features using correlation tests and SHAP values to identify the most impactful ones for churn prediction.
* **Executive summary:** Prepare a detailed executive summary of your findings for the end of next week, highlighting key insights and potential business impact of the model on customer retention.

**User Engagement Dashboard:**

* **De-duplication:** Your work on removing redundant data in the ETL process is crucial for improving the dashboard's performance. Keep up the great work on that!
* **Data integrity:** The team is addressing data inconsistencies across sources, especially with timestamps. Ensure data integrity is maintained for accurate metrics.

**

In [57]:
# Step 8: Define the initial user prompt
employee = "Riya"
designation = "Junior Data Analyst"

# employee = "Elon"
# designation = "Vice President of BI Team"

config = {"configurable": {"session_id": "chat1"}}

# Step 9: Call the model to generate the initial summary based on the transcript
response = with_message_history.invoke(
    {
        'messages': [
            HumanMessage(content=f"""Can you summarize the meeting for {employee}, who is {designation}? 
                                     Based on the designation, refine the most important points and what 
                                     they will need as takeaways from the meeting. Generate a summary in 100 words."""
            )
        ],
        'transcript': transcript_content  # Passing the meeting transcript as context
    },
    config=config
)

# Print the generated summary
print(response.content)

Hi Riya,

Here's a summary of today's meeting, focusing on points relevant to your role as a Junior Data Analyst:

**Churn Analysis:**

* **Great progress!**  You and Akilesh are doing a great job on the churn prediction model, achieving 85% accuracy.
* **Data Consistency is Key:**  Ensure data integrity across Salesforce and user engagement data is consistent, especially with timestamps. This is critical for accurate churn analysis.

**User Engagement Dashboard:**
* **Data Deduplication:** The team is working on removing redundant data in the ETL process, which is crucial for the dashboard's performance.

**General:**

* **Teamwork:** Everyone is working together to solve challenges and deliver on projects.

**Action Items:**
* **Data Integrity:** Make sure timestamps are consistent across Salesforce and user engagement data.
* **Executive Summary:** Help Akilesh on the executive summary due next week. 
* **Data Quality:**  Remember to test thoroughly before pushing code to production

In [53]:
# Step 10: Define a function to retrieve relevant documents from the vector store
from langchain_core.runnables import RunnableLambda

# Create a retriever to fetch relevant documents
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Define a function to retrieve documents and combine them into context
def retrieve_documents(query: str):
    retrieved_docs = retriever.get_relevant_documents(query)
    # Combine the content of the retrieved documents into a single string to pass as context
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])
    return context

# Create a Runnable that wraps this retriever
retriever_runnable = RunnableLambda(retrieve_documents)


## Chatbot for Akilesh

In [51]:
# Step 11: Set up the new RAG chain with document retrieval and model generation
rag_chain = {
    "transcript": retriever_runnable,  # Retrieve relevant documents as 'transcript'
    "messages": RunnablePassthrough()  # Pass through the user's messages
} | prompt | model

# Step 12: Future query using document retrieval from vector store
user_query = "What all tests should I do before pushing them to production?"

# Retrieve relevant documents and pass them along with the user's query
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=user_query)],
        "transcript": retrieve_documents(user_query)  # Fetch the relevant documents from the vector store
    },
    config=config
)

# Print the chatbot's response
print(response.content)


  retrieved_docs = retriever.get_relevant_documents(query)


That's a great question! Before pushing any data analysis model or system to production, you should conduct a thorough battery of tests to ensure accuracy, reliability, and robustness. Here's a breakdown of essential tests:

**1. Unit Tests:**

* **Individual Component Testing:**  Test each piece of your code (functions, modules) in isolation to ensure they perform as expected. 
* **Data Validation:** Verify that your code correctly handles different data types, formats, and edge cases (e.g., missing values, outliers).

**2. Integration Tests:**

* **System-Level Testing:** Test how different parts of your system work together. For example, if you have data processing, modeling, and visualization components, test their interactions.

**3. Performance Tests:**

* **Load Testing:**  Simulate a high volume of user requests or data processing to ensure your system can handle peak loads without crashing or degrading performance.
* **Stress Testing:** Push your system beyond its normal limit

## Chatbot for Elon

In [54]:
# Step 11: Set up the new RAG chain with document retrieval and model generation
rag_chain = {
    "transcript": retriever_runnable,  # Retrieve relevant documents as 'transcript'
    "messages": RunnablePassthrough()  # Pass through the user's messages
} | prompt | model

# Step 12: Future query using document retrieval from vector store
user_query = "What all are the key metrics on the projects progress that I can discuss with management"

# Retrieve relevant documents and pass them along with the user's query
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=user_query)],
        "transcript": retrieve_documents(user_query)  # Fetch the relevant documents from the vector store
    },
    config=config
)

# Print the chatbot's response
print(response.content)


Here are key metrics you can discuss with management about the progress of your data analysis projects, categorized for clarity:

**1. Churn Analysis Project:**

* **Model Accuracy:** Report the current accuracy of the churn prediction model. Highlight any improvements made since last reporting.
* **Feature Importance:** Share insights on the most influential features driving churn predictions. This helps understand customer segments at highest risk.
* **Business Impact:** Estimate the potential impact of the model on churn rate reduction and revenue retention. Quantify this impact whenever possible (e.g., "Model predicts X% reduction in churn, saving Y dollars annually").
* **Deployment Timeline:**  Provide a realistic timeline for deploying the model into production, including key milestones.

**2. User Engagement Dashboard:**

* **Data Coverage:** Report the percentage of data successfully integrated into the dashboard. Specify any remaining data sources to be incorporated.
* **Real