In [None]:
import os
import re
import wget
from langchain.text_splitter import SpacyTextSplitter

%reload_ext dotenv
%dotenv

from langchain.agents import load_tools, Tool
from langchain.utilities.google_search import GoogleSearchAPIWrapper
from langchain.agents import initialize_agent
from langchain.llms import OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from random import sample

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain

from langchain.prompts import PromptTemplate

from langchain.chains import LLMRequestsChain, LLMChain

from langchain.docstore.document import Document


# First, let's load the language model we're going to use to control the agent.
# The same instance is also used by the question-answering chain in
# google_search_about_insurance. temperature=0 is passed to the OpenAI wrapper.
llm = OpenAI(temperature=0)


# Google Search client whose raw results are consumed by
# google_search_about_insurance.
# NOTE(review): presumably its API credentials come from the environment loaded
# by the %dotenv magic above — confirm.
google_search = GoogleSearchAPIWrapper()

In [None]:
import requests
from tqdm.auto import tqdm
from bs4 import BeautifulSoup


def select_snippets_with_keywords(keyword: str, text: str, window_size: int = 500):
    """Return a window of text around occurrences of ``keyword`` in ``text``.

    An occurrence only counts when a period follows it on the same line
    (``.`` in the pattern does not cross newlines). The match then runs
    greedily to the last period of that line, so at most one occurrence per
    line is reported.

    Args:
        keyword: Literal text to look for. It is escaped before being placed in
            the regular expression, so characters such as ``+`` or ``.`` are
            matched verbatim instead of being interpreted as regex syntax.
        text: The document to search.
        window_size: Number of characters kept on each side of the position
            where the keyword starts.

    Returns:
        A list of snippets (``str``), one per match, in document order. The
        list is empty when ``keyword`` is empty or nothing matches.
    """
    # An empty keyword would match at the start of every line that contains a
    # period, producing windows unrelated to any keyword.
    if not keyword:
        return []

    pattern = re.compile(re.escape(keyword) + r".*\.")

    snippets = []
    for match in pattern.finditer(text):
        start = match.start()
        # Window size is in characters; clamp to the bounds of the text.
        left = max(0, start - window_size)
        right = min(len(text), start + window_size)
        snippets.append(text[left:right])

    return snippets


def get_keywords(query):
    """Ask the language model for the most meaningful keywords of ``query``.

    The model is prompted to answer with at most six lowercase keywords
    separated by commas; its reply is split on commas and each piece is
    stripped of surrounding whitespace.

    Args:
        query: The text to extract keywords from.

    Returns:
        A list of non-empty keyword strings. Empty pieces (for example from a
        trailing comma or a blank reply) are dropped so callers never receive
        an empty keyword.
    """
    template = "Select at most 6 most meaningful keywords in lowercase and separated by a comma from this text : {text}"
    prompt = PromptTemplate(template=template, input_variables=["text"])
    get_keywords_chain = LLMChain(
        prompt=prompt, llm=OpenAI(temperature=0), verbose=True)

    raw_keywords = get_keywords_chain.run(query)
    stripped_keywords = (piece.strip() for piece in raw_keywords.split(","))
    return [keyword for keyword in stripped_keywords if keyword]


def _extract_page_text(html: str) -> str:
    """Return the text of an HTML page with one non-empty phrase per line."""
    soup = BeautifulSoup(html, features="html.parser")
    # Script and style bodies are not readable content; remove them first.
    for element in soup(["script", "style"]):
        element.extract()

    text = soup.get_text()
    # Strip each line, then split multi-headlines (separated by two spaces)
    # into a line each, and finally drop the blank pieces.
    lines = (line.strip() for line in text.splitlines())
    phrases = (phrase.strip()
               for line in lines for phrase in line.split("  "))
    return "\n".join(phrase for phrase in phrases if phrase)


def _rank_snippets(snippets, search_query, embeddings, k):
    """Return the text of the ``k`` snippets most similar to ``search_query``.

    The vector store returns ``Document`` objects; they are converted back to
    plain strings so every snippet collected by the caller has the same type.
    """
    if not snippets or k <= 0:
        return []
    vectorstore = FAISS.from_texts(snippets, embeddings)
    return [doc.page_content
            for doc in vectorstore.similarity_search(search_query, k)]


def google_search_about_insurance(query: str) -> str:
    """Answer ``query`` from the content of web pages found via Google Search.

    Steps:
      1. Extract keywords from ``query`` with ``get_keywords``.
      2. Run a Google search and download up to 20 result pages.
      3. For every page, collect text windows around each keyword and keep at
         most ``max_snippets`` of them, ranked by similarity to the search
         query when there are more.
      4. Index all kept snippets, retrieve the 3 most similar to ``query`` and
         let the question-answering chain produce the answer.

    Args:
        query: The question to answer.

    Returns:
        The answer produced by the question-answering chain, ``"No link"`` when
        the search returns no result, or ``"No relevant content found"`` when
        no snippet could be extracted from any page.
    """
    keywords = get_keywords(query)
    print(keywords)

    # NOTE(review): the search query is fixed and ignores both `query` and the
    # extracted keywords — confirm this is intended.
    google_query = "Cyber attack company financial loss business interruption"
    results = google_search._google_search_results(
        google_query
    )
    if not results:
        return "No link"

    embeddings = HuggingFaceEmbeddings()
    text_splitter = SpacyTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=0, pipeline="en_core_web_sm"
    )

    min_snippets = 1
    max_snippets = 5
    max_vector_store = 20
    request_timeout = 30  # seconds; keeps one slow site from blocking the run

    all_relevant_snippets = []

    for result in (pbar := tqdm(results[:20])):
        result_url = result.get("link")
        if not result_url:
            continue
        pbar.set_description(result_url)

        # Download the page; an unreachable or failing site is skipped so that
        # it does not abort the whole search.
        try:
            response = requests.get(result_url, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException:
            continue

        document_text = _extract_page_text(response.text)
        if len(document_text) < 100:
            continue

        # Accumulate the windows of every keyword (not only the last one).
        keyword_snippets = []
        for keyword in keywords:
            if not keyword:
                continue
            keyword_snippets += select_snippets_with_keywords(
                keyword, document_text, window_size=300)
        keyword_snippets = text_splitter.split_text(
            "\n".join(keyword_snippets))

        if len(keyword_snippets) > max_snippets:
            # Too many candidates: index a bounded random subset and keep the
            # ones closest to the search query.
            if len(keyword_snippets) > max_vector_store:
                keyword_snippets = sample(keyword_snippets, max_vector_store)
            relevant_snippets = _rank_snippets(
                keyword_snippets, google_query, embeddings, max_snippets)
        elif 0 < len(keyword_snippets) < min_snippets:
            # Too few candidates: top them up with the best of a random sample
            # of the whole page. With min_snippets = 1 this branch cannot
            # trigger; it becomes active if min_snippets is raised.
            all_snippets = text_splitter.split_text(document_text)
            random_snippets = sample(all_snippets, min(
                max_vector_store - len(keyword_snippets), len(all_snippets)))
            relevant_snippets = keyword_snippets + _rank_snippets(
                random_snippets, google_query, embeddings,
                max_snippets - len(keyword_snippets))
        else:
            relevant_snippets = keyword_snippets

        all_relevant_snippets += relevant_snippets

    # An empty collection cannot be indexed; report it instead of failing.
    if not all_relevant_snippets:
        return "No relevant content found"

    # Select the most relevant snippets from the collection
    vectorstore = FAISS.from_texts(all_relevant_snippets, embeddings)
    docs = vectorstore.similarity_search(query, k=3)

    # Reply to the question
    chain = load_qa_chain(llm, chain_type="stuff", verbose=True)
    return chain.run(input_documents=docs, question=query)


# Expose the search-and-answer function to the agent as a tool.
google_search_insurance = Tool(
    name="Google Search about insurance",
    func=google_search_about_insurance,
    description="A wrapper around Google Search.",
)

# The agent only gets this single tool.
tools = [google_search_insurance]


# Build the agent from the tools, the language model and the agent type.
agent = initialize_agent(
    tools,
    llm,
    agent="zero-shot-react-description",
    verbose=True,
)

# Question submitted to the agent, with one worked example of the expected
# answer format.
historical_losses_question = """List several historical losses due to business interruption caused by cyber-attacks.
For each example, list the year, the company name, the duration of business interruption and the financial cost.
For instance: In 2018, an attack at company Saint-Gobain caused 10 days of business interruption, with a financial loss of 700M$.
"""

agent.run(historical_losses_question)


# End of notebook