In [1]:
!pip install -q faiss-cpu langchain langchain-community langchain-experimental langchain-cohere

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.9/291.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# @title Imports

import os
import re
import requests
import numpy as np
import pandas as pd
from google.colab import userdata
from IPython.display import display

# LangChain & LangChain Community
import faiss
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.vectorstores.faiss import FAISS
from langchain.chains import LLMChain, RetrievalQA, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_experimental.agents import create_pandas_dataframe_agent

In [None]:
# @title Download excel files

# Headers sent with every download request.
# NOTE(review): the User-Agent below looks like a placeholder identity;
# presumably it should name the real requester - confirm before sharing.
headers = {
    'User-Agent': 'microsoft bill.gates@microsoft.com',
    'Accept': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
}

# Seconds to wait for the server before giving up on a single download.
DOWNLOAD_TIMEOUT_SECONDS = 60

# Local destination -> source URL of each financial report workbook.
REPORT_SOURCES = {
    "/content/Apple_Financial_2021.xlsx": "https://www.sec.gov/Archives/edgar/data/0000320193/000032019321000105/Financial_Report.xlsx",
    "/content/Moderna_Financial_2025.xlsx": "https://www.sec.gov/Archives/edgar/data/0001682852/000168285225000022/Financial_Report.xlsx",
    "/content/CVS_Health_Financial_2024.xlsx": "https://www.sec.gov/Archives/edgar/data/0000064803/000006480324000007/Financial_Report.xlsx",
    "/content/Tesla_Financial_2023.xlsx": "https://www.sec.gov/Archives/edgar/data/0001318605/000095017023001409/Financial_Report.xlsx",
}


def download_report(url: str, destination: str) -> bool:
    """Download ``url`` to ``destination``; return True when the file was written.

    A failed request (connection error, timeout or non-2xx status) is reported
    and nothing is written, so an HTML error page is never saved under an
    ``.xlsx`` name.
    """
    try:
        response = requests.get(url, headers=headers, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Download failed for {url}: {error}")
        return False

    with open(destination, 'wb') as f:
        f.write(response.content)
    return True


for destination, url in REPORT_SOURCES.items():
    download_report(url, destination)

In [None]:
# @title excel_to_df(excel_file_path: str) - Convert excel data to dataframe
def excel_to_df(excel_file_path: str):
    """Read the consolidated operations/income statement sheet of a workbook.

    The first sheet whose lower-cased name contains
    'consolidated statements of oper' or 'consolidated statements of incom'
    is loaded into a DataFrame.

    Side effects:
        Appends ``excel_file_path`` to the module-level list ``files_parsed``
        on success, or to ``files_not_parsed`` on failure. Both lists must
        already exist when this function is called.

    Args:
        excel_file_path: Path of the ``.xlsx`` workbook to read.

    Returns:
        The sheet as a ``pd.DataFrame`` on success; otherwise the string
        ``'Could not read the excel file <path>!'``.
    """
    sheet_name_fragments = (
        'consolidated statements of oper',
        'consolidated statements of incom',
    )
    global files_parsed, files_not_parsed
    try:
        # The context manager closes the workbook handle, and parsing from the
        # already-open workbook avoids reading the file from disk a second time.
        with pd.ExcelFile(excel_file_path, engine = 'openpyxl') as excel_content:
            matched_sheet = next(
                (
                    sheet_name
                    for sheet_name in excel_content.sheet_names
                    if any(fragment in sheet_name.lower() for fragment in sheet_name_fragments)
                ),
                None,
            )
            if matched_sheet is None:
                raise ValueError('no consolidated statements of operations/income sheet found')
            df = excel_content.parse(sheet_name = matched_sheet)
        files_parsed.append(excel_file_path)
    except Exception as error:
        # Best effort: report why the workbook was skipped, then hand back the
        # message string that callers detect with an isinstance check.
        print(f'Skipping {excel_file_path}: {error}')
        df = f'Could not read the excel file {excel_file_path}!'
        files_not_parsed.append(excel_file_path)

    return df

In [None]:
# @title def process_df_values(df: pd.DataFrame, share_multiplier: int, usd_multiplier: int) -> Correct the values in the dataframe according to the share and USD multipliers

def process_df_values(df: pd.DataFrame, share_multiplier: int, usd_multiplier: int) -> pd.DataFrame:
    """Scale the numeric rows of a statement table and tag them with a category.

    The frame must have the category column first, a 'Description' column and
    the value columns from the third position on. Consecutive header rows are
    detected through ``index - 1``, so an integer index is expected.

    Row handling:
      * Rows whose 'Description' is not a non-empty string are left untouched.
      * A row whose value columns are all null is a header: its description
        (lower-cased, trailing ':' removed) becomes the current category.
        A header directly following another header is appended with ' | '.
      * Any other row must have a float-convertible first value. Its category
        cell is set to the current category (if any) and each value is
        multiplied by ``share_multiplier`` when the description or category
        mentions a share count, otherwise by ``usd_multiplier`` unless a
        per-share / plain-dollar term is present (then it is left as is).
        Rows with non-numeric values are dropped.

    Finally, rows with a null in 'Description' or any value column are dropped
    and the index is reset.

    Args:
        df: Statement table laid out as described above. It is not modified.
        share_multiplier: Factor applied to share-count rows.
        usd_multiplier: Factor applied to monetary rows.

    Returns:
        A new DataFrame with scaled values and a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame is never half-updated.
    df = df.copy()

    category = ''
    header_rows, text_rows = set(), []

    share_terms = ["in share", "average share", "average common share"]
    unscaled_usd_terms = ["in usd", "in dollars", "income per share"]

    category_col = df.columns[0]
    value_cols = df.columns[2:]

    for index, row in df.iterrows():
        row_desc = row.get('Description')
        if not isinstance(row_desc, str) or not row_desc:
            continue

        row_desc = row_desc.lower()
        if row_desc.endswith(":"):
            row_desc = row_desc[:-1]

        if row.iloc[2:].isnull().all():
            # Header row: start a new category, or extend the previous one
            # when the row right above was a header as well.
            if index - 1 in header_rows:
                category = category + " | " + row_desc
            else:
                category = row_desc
            header_rows.add(index)
            continue

        try:
            # A row whose first value is not numeric is treated as text.
            float(row.iloc[2])

            if category:
                # Label-based, consistent with the df.at writes below.
                df.at[index, category_col] = category

            is_share_count = any(term in row_desc or term in category for term in share_terms)
            is_unscaled_usd = any(term in row_desc or term in category for term in unscaled_usd_terms)

            for col in value_cols:
                val = row[col]
                if is_share_count:
                    val = float(val) * share_multiplier
                elif not is_unscaled_usd:
                    val = float(val) * usd_multiplier
                df.at[index, col] = val
        except (TypeError, ValueError):
            # Non-numeric cell somewhere in the row: drop the whole row later.
            text_rows.append(index)

    if text_rows:
        df = df.drop(text_rows)

    # Header rows and rows with missing values have nulls in these columns.
    df = df.dropna(subset=df.columns[1:])

    return df.reset_index(drop = True)

In [None]:
# @title def process_dataframe(df: pd.DataFrame) -> Set column headings and find the share and USD multipliers

def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise a raw statement sheet and scale its values.

    Steps:
      1. Read the share and USD scale from the first column's header text
         ('shares in billion/million/thousand', '$ in billion/million/thousand');
         each defaults to 1 when no such phrase is present.
      2. Replace empty, whitespace-only and None cells with NaN.
      3. Drop every column with fewer than 5 distinct non-null values.
      4. Rename the first remaining column to 'Description' and each value
         column to the integer formed by the last four characters of its
         first-row cell.
      5. Drop that first row, prepend an empty 'Category' column, reset the
         index and pass the frame to ``process_df_values``.

    Any number of value columns is accepted. Up to the third value column a
    header cell that does not end in a four-digit number raises, as before;
    further columns keep their label when the header cannot be parsed.

    Args:
        df: Raw sheet as read from Excel. It is not modified.

    Returns:
        The frame returned by ``process_df_values``.
    """
    multiplier_identifier = df.columns[0].lower()

    # Checked in this order, largest scale first.
    scale_words = (('billion', 1000000000), ('million', 1000000), ('thousand', 1000))

    def _detect_multiplier(unit: str) -> int:
        """Return the scale announced for ``unit`` in the header, else 1."""
        for word, factor in scale_words:
            if f'{unit} in {word}' in multiplier_identifier:
                return factor
        return 1

    share_multiplier = _detect_multiplier('shares')
    usd_multiplier = _detect_multiplier('$')

    # Copy first so that relabelling the columns does not touch the caller's frame.
    df = df.copy()
    df.columns = [f'new_column_{i}' if col is None else col for i, col in enumerate(df.columns)]

    def _blank_to_nan(x):
        """Map None and empty / whitespace-only strings to NaN."""
        if x is None or (isinstance(x, str) and x.strip() == ""):
            return np.nan
        return x

    # DataFrame.map only exists in newer pandas; applymap is the older name.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    df = elementwise(_blank_to_nan)

    sparse_columns = [column for column in df.columns if df[column].nunique() < 5]
    df = df.drop(columns=sparse_columns)

    # The first row holds the period label of each value column.
    column_labels = {df.columns[0]: "Description"}
    for position, column in enumerate(df.columns[1:], start=1):
        header_cell = df.iloc[0, position]
        try:
            column_labels[column] = int(header_cell[-4:])
        except (TypeError, ValueError):
            # The first three value columns were always required to carry a
            # year; keep failing there. Extra columns were never renamed, so
            # an unparseable header simply leaves their label unchanged.
            if position <= 3:
                raise

    df = df.rename(columns = column_labels)

    df = df.iloc[1:].copy()
    df.insert(0, 'Category', '')
    df = df.reset_index(drop = True)      # To make sure indices are in sync

    return process_df_values(df = df, share_multiplier = share_multiplier, usd_multiplier = usd_multiplier)


In [None]:
# @title def df_to_document(df:  pd.DataFrame, company_name: str) -> Convert dataframe to langchain Documents
def df_to_document(df:  pd.DataFrame, company_name: str):
    """Turn every (row, value column) cell of ``df`` into one ``Document``.

    The first column is read as the category, the second as the description
    and all remaining columns as value columns whose label is used as the
    year. Each document's text is a sentence naming the company, the
    description, the category (when it is truthy), the year and the value;
    its metadata holds the company name and the year.

    Args:
        df: Processed statement table.
        company_name: Name inserted into every sentence and into the metadata.

    Returns:
        A list of ``Document`` objects, ordered row by row and, within a row,
        by value column.
    """
    year_columns = df.columns[2:]

    def _sentence_prefix(record) -> str:
        """Build the part of the sentence that is shared by all years of a row."""
        category, description = record.iloc[0], record.iloc[1]
        if category:
            return f"For {company_name} {description} in {category} category for year"
        return f"For {company_name} {description} for year"

    documents = []
    for _, record in df.iterrows():
        prefix = _sentence_prefix(record)
        documents += [
            Document(
                page_content = f"{prefix} {year} is {record[year]}.",
                metadata = {"company": company_name, "year": year},
            )
            for year in year_columns
        ]

    return documents

In [None]:
# @title MAIN

file_list = ['/content/Apple_Financial_2021.xlsx', '/content/Moderna_Financial_2025.xlsx', '/content/CVS_Health_Financial_2024.xlsx', '/content/Tesla_Financial_2023.xlsx']

# Filled by excel_to_df, which records every path it could / could not read.
files_parsed, files_not_parsed = [], []

docs = []

# Convert each workbook to a dataframe, clean it and turn it into documents.
for excel_file_path in file_list:

    # The company name is the part of the file name before the first underscore.
    company_name = (excel_file_path.split('/')[-1]).split('_')[0]
    df = excel_to_df(excel_file_path = excel_file_path)
    if not isinstance(df, pd.DataFrame):
        # excel_to_df returned its error string and has already recorded the
        # path in files_not_parsed; appending here again would duplicate it.
        continue

    df = process_dataframe(df = df)
    docs.extend(df_to_document(df = df, company_name = company_name))


In [None]:
# @title This cell restarts this notebook. Re-run all cells after restart
!pip uninstall -qy langchain-cohere
!pip install -q "langchain-cohere>=0.1.0,<0.2.0"

# Cohere integrations
from langchain_cohere import ChatCohere
from langchain_cohere import CohereEmbeddings
from langchain_community.llms import Cohere as LangChainCohere

In [None]:
# @title Embedding and Cohere Configuration

# Stop early with a clear message when the parsing step produced nothing,
# instead of calling the embedding service with an empty document list.
if not docs:
    raise ValueError("No documents to index: none of the Excel files produced any rows.")

# The API key is read from the Colab secret store, not hard-coded.
cohere_api_key = userdata.get('COHERE_API_KEY')
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key, user_agent="langchain")

# Embed every document and index the vectors in FAISS.
vectorstore = FAISS.from_documents(docs, embeddings)
# Each query retrieves the 15 closest documents.
retriever = vectorstore.as_retriever(search_kwargs={"k": 15})

llm = ChatCohere(cohere_api_key=cohere_api_key)

# The retrieved source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

In [None]:
# @title Query 1 to Cohere

# This query works well.
query = "Compare Moderna and Tesla's net income"
# `invoke` replaces the deprecated direct call on the chain object.
response = qa_chain.invoke({"query": query})

print("\nAnswer:")
print(response['result'])

In [None]:
# @title Query 2 to Cohere

# This query does not work well.
# NOTE(review): presumably the retrieved documents do not include the matching
# CVS / 2022 row; filtering on the `company` and `year` metadata may help - TODO confirm.
query = "What was CVS's net income in 2022?"
# `invoke` replaces the deprecated direct call on the chain object.
response = qa_chain.invoke({"query": query})

print("\nAnswer:")
print(response['result'])