# Video Indexer transcripts analysis with Langchain + Azure OpenAI + Azure Cognitive Search (vector store)

In [1]:
import os
import json
import gradio as gr
import requests
import sys
import time

from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain.retrievers import AzureCognitiveSearchRetriever
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import AzureSearch

In [2]:
sys.version

'3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]'

In [3]:
load_dotenv("azure.env")

True

In [None]:
acs_endpoint = os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT")
acs_key = os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY")

In [4]:
index_name = "videoindexer-transcripts"
index_name

'videoindexer-transcripts'

## Documents

In [5]:
DOCS_DIR = "transcripts"

In [6]:
!ls $DOCS_DIR/*.* -lh

-rwxrwxrwx 1 root root 6.3K Sep  6 12:44 transcripts/Saint_Gobain_2023.csv
-rwxrwxrwx 1 root root 4.5K Sep  6 13:20 transcripts/St_Gobain_materials.csv


In [7]:
# Loop through the folders
docs = []
for dirpath, dirnames, filenames in os.walk(DOCS_DIR):
    for file in filenames:
        print(file)
        try:
            loader = TextLoader(os.path.join(dirpath, file), encoding="utf-8")
            docs.extend(loader.load_and_split())
        except Exception as e:
            pass

Saint_Gobain_2023.csv
St_Gobain_materials.csv


In [8]:
# Split into chunk of texts
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

## Embeddings and loading the documents into the Azure Cognitive Search

In [9]:
# Azure Cognitive Search index name to create
index_name

'videoindexer-transcripts'

In [10]:
# Initialize our embedding model
embeddings = OpenAIEmbeddings(
    deployment=os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME"),
    model=os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME"),
    openai_api_base=os.getenv("OPENAI_API_BASE"),
    openai_api_type="azure",
    chunk_size=1,
)


# Set our Azure Search
acs = AzureSearch(
    azure_search_endpoint=os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT"),
    azure_search_key=os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY"),
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

# Add documents to Azure Search
acs.add_documents(documents=texts)

['ZGU3OTZjMzAtYzQ0ZC00NDNhLTg3Y2EtMmI0YzMzNGM0ZjY5',
 'ZGEwMTM2NGUtMTA3OC00MzMyLWI1YmEtODk1Y2JmMTdjMWU1',
 'ZDNhOWU5YTgtOWM1Yi00ODQ2LWI5NGYtYWNkNGJhZDVjNzdh',
 'NzZiZGIyYzQtOGZiOS00MDA4LWI3ZDQtMmQ3OTcwNzg2YWRj']

In [11]:
# Define Azure Cognitive Search as our retriever
retriever = AzureCognitiveSearchRetriever(
    content_key="content", top_k=10, index_name=index_name
)

In [12]:
# Set chatGPT 3.5 as our LLM
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo-16k", temperature=0.7)

In [13]:
retriever

AzureCognitiveSearchRetriever(tags=None, metadata=None, service_name='azurecogsearcheastussr', index_name='videoindexer-transcripts', api_key='ViHEHiP4CdH3zH0BYLDgHG0DKr6yHoTwbWXR4F90ujAzSeDP6Y0a', api_version='2020-06-30', aiosession=None, content_key='content', top_k=10)

In [14]:
llm

AzureChatOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', temperature=0.7, model_kwargs={}, openai_api_key='8d0786663aa1480f9dee3c9edd842b1a', openai_api_base='https://azure-openai-serge.openai.azure.com', openai_organization='', openai_proxy='', request_timeout=None, max_retries=6, streaming=False, n=1, max_tokens=None, tiktoken_model_name=None, deployment_name='gpt-35-turbo-16k', model_version='', openai_api_type='azure', openai_api_version='2023-05-15')

## Testing

In [15]:
# Define a template message
template = """You are analyzing a transcript text file that contains the speech to text results from a video file. 
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Set the Retrieval QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [16]:
questions = ["Could you summarize in a couple of lines the document Saint_Gobain_2023.csv?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Could you summarize in a couple of lines the document Saint_Gobain_2023.csv?
[1;31;32m
Answer: The document Saint_Gobain_2023.csv is a transcript of a video presentation highlighting the achievements and innovations of Saint-Gobain in the year 2022. It emphasizes the company's commitment to sustainability, environmental responsibility, and social impact. The transcript mentions the development of low-carbon glass and plaster products, expansion into new markets, and investments in industrial capacity. Overall, it showcases Saint-Gobain's focus on transforming interior spaces and its dedication to being a leader in the construction and manufacturing industry.


In [17]:
questions = ["What is the language of the document Saint_Gobain_2023.csv"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: What is the language of the document Saint_Gobain_2023.csv
[1;31;32m
Answer: Based on the content of the document, the language of the document Saint_Gobain_2023.csv appears to be French.


In [18]:
questions = ["Can you generate 10 keywords from Saint_Gobain_2023.csv?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Can you generate 10 keywords from Saint_Gobain_2023.csv?
[1;31;32m
Answer: 1. Interior materials
2. Luxury
3. Planet Lac Evolution
4. Premium lacquered glass
5. Vibrant living spaces
6. Design possibilities
7. Manufacturing process
8. Bonding
9. Quality parameters
10. Innovation


In [19]:
questions = ["Can you generate some hashtags from Saint_Gobain_materials.csv?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Can you generate some hashtags from Saint_Gobain_materials.csv?
[1;31;32m
Answer: #InteriorMaterials #LuxuryInteriors #PlanetLacEvolution #ArtisticExcellence #VibrantLivingSpaces #UnlimitedDesignPossibilities #InteriorSurfaceDesign #GlassManufacturing #BondingProcess #HighQualityGlass #LabTesting #SaintGobainMaterials


In [35]:
questions = ["You are a twitter redactor. Write a tweeter post about the content of Saint_Gobain_materials.csv?\
Use some smileys"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: You are a twitter redactor. Write a tweeter post about the content of Saint_Gobain_materials.csv?Use some smileys
[1;31;32m
Answer: Just analyzed the transcript from Saint_Gobain_materials.csv 📑🔍 So many amazing interior materials and glass products! 🏢✨ They redefine luxury and create vibrant living spaces. 🌈🏠 Saint-Gobain is all about innovation and quality. 👏🌟 #InteriorDesign #LuxuryLiving #Innovation


In [21]:
questions = ["What are the countries names from Saint_Gobain_2023.csv"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: What are the countries names from Saint_Gobain_2023.csv
[1;31;32m
Answer: Based on the provided transcript, the following country names are mentioned in the Saint_Gobain_2023.csv file: Canada, Mexico, Brazil, Egypt, Finland, Norway, Germany.


In [22]:
questions = ["Do we have a mention of 'Planilac lacquer' in the St_Gobain.csv file?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Do we have a mention of 'Planilac lacquer' in the St_Gobain.csv file?
[1;31;32m
Answer: Yes, there are several mentions of 'Planilac lacquer' in the St_Gobain.csv file.


In [23]:
questions = ["How many different stages do we have in the process of curing of glass in the St_Gobain.csv file?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: How many different stages do we have in the process of curing of glass in the St_Gobain.csv file?
[1;31;32m
Answer: There are 12 different stages in the process of curing of glass in the St_Gobain.csv file.


In [24]:
questions = ["Display the timeframe of the curing of glass process in the St_Gobain.csv file?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Display the timeframe of the curing of glass process in the St_Gobain.csv file?
[1;31;32m
Answer: The timeframe of the curing of glass process in the St_Gobain.csv file is from 00:02:49.120 to 00:02:56.400.


In [25]:
questions = ["What are the different tests in the St_Gobain_materials.csv file?"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: What are the different tests in the St_Gobain_materials.csv file?
[1;31;32m
Answer: The different tests mentioned in the St_Gobain_materials.csv file are Tabor tests, Color value test, and high humidity test.


In [26]:
questions = ["Display the timeframe of the the 'Color value tests' in the St_Gobain_materials.csv file? Just print the values \
             like a json file"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Display the timeframe of the the 'Color value tests' in the St_Gobain_materials.csv file? Just print the values              like a json file
[1;31;32m
Answer: {
  "Color value tests": {
    "start_time": "00:03:23.428",
    "end_time": "00:03:28.158"
  }
}


In [27]:
questions = ["Display the timeframe of the curing of glass process in the St_Gobain.csv file? Just print the values \
             like a json file"]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Display the timeframe of the curing of glass process in the St_Gobain.csv file? Just print the values              like a json file
[1;31;32m
Answer: {
  "start_time": "00:01:38.390",
  "end_time": "00:01:46.276"
}


In [28]:
questions = ["Display the timeframe of the mention of Canada in the St_Gobain.csv file? Just print the values \
like a json file."]

chat_history = []

for question in questions:
    result = qa_chain({"query": question, "chat_history": chat_history})
    #chat_history.append((question, result))
    print("\033[1;31;34m")
    print(f"Question: {question}")
    print("\033[1;31;32m")
    print(f"Answer: {result['result']}")

[1;31;34m
Question: Display the timeframe of the mention of Canada in the St_Gobain.csv file? Just print the values like a json file.
[1;31;32m
Answer: {"start_time": "00:01:20.270", "end_time": "00:01:25.945"}


## Webapp

In [29]:
def webapp_function(prompt):
    """
    Gradio function
    """
    template = """You are analyzing a transcript text file that contains the speech to text results from a video file. 
    {context}
    Question: {question}
    Helpful Answer:"""

    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )
    questions = [prompt]
    chat_history = []

    for question in questions:
        result = qa_chain({"query": question, "chat_history": chat_history})
        answer = result['result']
        
    return answer

In [30]:
logo = "https://github.com/retkowsky/images/blob/master/aoai.jpg?raw=true"
image = "<center> <img src= {} width=240px></center>".format(logo)
title = "Azure Open AI Chat GPT on Video Indexer transcripts"

samples = [
    "Display the timeframe of the curing of glass process in the St_Gobain.csv file?",
    "In which documents are we talking about the process of curing of glass?",
    "How many different stages do we have in the process of curing of glass in the St_Gobain.csv file?",
    "Display the timeframe of the mention of Canada in the St_Gobain.csv file? Just print the values like a json file.",
    "You are a twitter redactor. Write a tweeter post about the content of St_Gobain.csv? Use some smileys.",
    "Could you summarize in a couple of lines the document Saint_Gobain_2023.csv?",
    "Could you summarize St_Gobain.csv?",
    "Can you generate 10 keywords from Saint_Gobain_2023.csv?",
]

inputs = [gr.Textbox(label="Your prompt:")]
outputs = "text"

# Webapp
webapp = gr.Interface(fn=webapp_function,
                      inputs=inputs,
                      outputs=outputs,
                      description=image,
                      title=title,
                      examples = samples,
                      theme="Taithrah/Minimal",  # https://huggingface.co/spaces/gradio/theme-gallery
                     )

# Running the webapp
webapp.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://8824376493967d51e6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




## Post Processing

In [31]:
def acs_delete_index(index_name):
    """
    Deleting an Azure Cognitive Search index
    Input: an Azure Cognitive Search index name
    """
    start = time.time()
    
    search_client = SearchIndexClient(
        endpoint=os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT"),
        credential=AzureKeyCredential(os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY"))
    )
    
    try:
        print("Deleting the Azure Cognitive Search index:", index_name)
        search_client.delete_index(index_name)
        print("Done. Elapsed time:", round(time.time() - start, 2), "secs")
    except:
        print("Cannot delete index. Check the index name.")

In [40]:
def acs_index_stats(index_name):
    """
    Get statistics about Azure Cognitive Search index
    Input: an Azure Cognitive Search index name
    """
    url = (
        os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT")
        + "/indexes/"
        + index_name
        + "/stats?api-version=2021-04-30-Preview"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY"),
    }
    response = requests.get(url, headers=headers)
    print("Azure Cognitive Search index status for:", index_name, "\n")

    if response.status_code == 200:
        res = response.json()
        print(json.dumps(res, indent=2))
        document_count = res["documentCount"]
        storage_size = res["storageSize"]

    else:
        print("Request failed with status code:", response.status_code)

    return document_count, storage_size

In [41]:
def acs_index_status(index_name):
    """
    Azure Cognitive Search index status
    Input: an Azure Cognitive Search index name
    """
    print("Azure Cognitive Search Index:", index_name, "\n")

    headers = {"Content-Type": "application/json", "api-key": os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY")}
    params = {"api-version": "2021-04-30-Preview"}
    index_status = requests.get(
        os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT") + "/indexes/" + index_name, headers=headers, params=params
    )

    try:
        print(json.dumps((index_status.json()), indent=5))
    except:
        print("Request failed with status code:", response.status_code)

In [42]:
acs_index_status(index_name)

Azure Cognitive Search Index: videoindexer-transcripts 

{
     "@odata.context": "https://azurecogsearcheastussr.search.windows.net/$metadata#indexes/$entity",
     "@odata.etag": "\"0x8DBAEE04B64F416\"",
     "name": "videoindexer-transcripts",
     "defaultScoringProfile": null,
     "fields": [
          {
               "name": "id",
               "type": "Edm.String",
               "searchable": false,
               "filterable": true,
               "retrievable": true,
               "sortable": false,
               "facetable": false,
               "key": true,
               "indexAnalyzer": null,
               "searchAnalyzer": null,
               "analyzer": null,
               "normalizer": null,
               "synonymMaps": []
          },
          {
               "name": "content",
               "type": "Edm.String",
               "searchable": true,
               "filterable": false,
               "retrievable": true,
               "sortable": false,
   

In [43]:
acs_index_stats(index_name)

Azure Cognitive Search index status for: videoindexer-transcripts 

{
  "@odata.context": "https://azurecogsearcheastussr.search.windows.net/$metadata#Microsoft.Azure.Search.V2021_04_30_Preview.IndexStatistics",
  "documentCount": 4,
  "storageSize": 139872
}


(4, 139872)

In [44]:
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

In [45]:
acs_delete_index(index_name)

Deleting the Azure Cognitive Search index: videoindexer-transcripts
Done. Elapsed time: 0.62 secs
