In [19]:
from dotenv import dotenv_values
# Settings read from the local ".env" file; a later cell looks up
# 'GEMINI_API_KEY' in this mapping.
ENV = dotenv_values(".env")

In [9]:
import json
def load_input_file(input_file_location):
  """Read a JSON file and return its parsed contents.

  Args:
    input_file_location: Path of the JSON file to read.

  Returns:
    Whatever ``json.load`` produces for the file (the callers in this
    notebook iterate over it as a list of dicts).

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
  """
  # JSON text is UTF-8 by specification; relying on the platform default
  # encoding breaks on non-ASCII content where that default is not UTF-8.
  with open(input_file_location, encoding='utf-8') as file:
    return json.load(file)

In [8]:
def basic_convert_10Q_to_txt(input_file_location, output_file_location):
    """Flatten a preprocessed 10-Q JSON file into a plain-text file.

    Each element of the input is a dict with a ``'type'`` key. Elements of
    type ``"Title"`` or ``"NarrativeText"`` contribute their ``'text'``
    surrounded by newlines; ``"Table"`` elements contribute the HTML stored
    under ``['metadata']['text_as_html']``. Every other type is skipped.

    Args:
      input_file_location: Path of the JSON file to read.
      output_file_location: Path of the text file to create or overwrite.

    Raises:
      KeyError: If an element lacks one of the keys read for its type.
    """
    data = load_input_file(input_file_location)
    # Explicit UTF-8: filings contain non-ASCII characters, and the platform
    # default encoding may be unable to represent them.
    with open(output_file_location, 'w', encoding='utf-8') as file:
      for item in data:
        item_type = item['type']
        if item_type in ("Title", "NarrativeText"):
          file.write('\n' + item['text'] + '\n')
        elif item_type == "Table":
          file.write(item['metadata']['text_as_html'])

In [30]:
# Basic conversion of the Apple Q2 2024 10-Q: reads the preprocessed JSON and
# writes the text file under ./data/final_data/.
basic_convert_10Q_to_txt(
  './data/preprocessed_pdf_data/apple_10-Q-Q2-2024-As-Filed-Copy1.pdf.json',
  './data/final_data/apple_10-Q-Q2-2024-As-Filed-Copy1.pdf.json.txt'
)

In [28]:
# Basic conversion of the Apple Q3 2024 10-Q: reads the preprocessed JSON and
# writes the text file under ./data/final_data/.
basic_convert_10Q_to_txt(
  './data/preprocessed_pdf_data/apple_10-Q-Q3-2024-As-Filed-Copy1.pdf.json',
  './data/final_data/apple_10-Q-Q3-2024-As-Filed-Copy1.pdf.json.txt'
)

In [41]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage
# API key handed to the Gemini client, looked up in the ENV settings.
# NOTE(review): presumably None when the key is absent from .env — confirm,
# and consider failing early in that case.
GEMINI_API_KEY = ENV.get('GEMINI_API_KEY')

#ADAPTED FROM https://medium.com/kx-systems/high-precision-rag-for-table-heavy-documents-using-langchain-unstructured-io-kdb-ai-22f7830eac9a
def get_table_description(table_content, table_context):
    """Ask the Gemini model to describe a table, given its surrounding text.

    Args:
      table_content: The table itself, interpolated verbatim into the prompt.
      table_context: Text from the original document that accompanies the
        table, interpolated verbatim into the prompt.

    Returns:
      The value returned by the model's ``invoke`` call.
      NOTE(review): the caller in this notebook concatenates it with strings,
      so it is presumably a ``str`` — confirm.
    """
    request_text = f"""
    Given the following table and its context from the original document,
    provide a short description of the table.

    Original Document Context:
    {table_context}

    Table Content:
    {table_content}

    Please provide:
    1. A comprehensive description of the table.
    """

    # Changed to Gemini.
    # src: https://pypi.org/project/langchain-google-genai/
    conversation = [
      SystemMessage(content="You are a helpful assistant that describes tables."),
      HumanMessage(content=request_text),
    ]

    # A new client is built on every call with a fixed model name.
    gemini = GoogleGenerativeAI(model="models/gemini-1.5-flash", api_key=GEMINI_API_KEY)
    return gemini.invoke(conversation)
  
  
def extract_related_table_context(table_index, data):
  """Build the context for a table from the elements that precede it.

  Walks backwards from the element just before ``data[table_index]`` until
  an element of type ``'Title'`` is reached, collecting the ``'text'`` of
  every element passed on the way.

  Args:
    table_index: Position of the table element within ``data``.
    data: Indexable sequence of dicts, each with ``'type'`` and ``'text'``.

  Returns:
    A string that starts with a newline and ``Title: <title text>``. When
    there are elements between the title and the table, their texts follow
    on separate lines in document order, each terminated by a newline.

  Raises:
    ValueError: If no ``'Title'`` element exists before ``table_index``
      (including when the table is the first element).
  """
  preceding_texts = []  # collected nearest-to-the-table first
  # Stop at index 0: going further would wrap around to negative indices and
  # silently pull "context" from the end of the document.
  for i in range(table_index - 1, -1, -1):
    item = data[i]
    if item['type'] == 'Title':
      header = '\nTitle: ' + item['text']
      if not preceding_texts:
        return header
      # Separate the title from the first context line so they do not run
      # together, then restore document order.
      body = ''.join(text + '\n' for text in reversed(preceding_texts))
      return header + '\n' + body
    preceding_texts.append(item['text'])
  raise ValueError(
    f'Could not find a Title before the table at index {table_index}'
  )

In [62]:
def advanced_convert_10Q_to_txt(input_file_location, output_file_location):
  """Flatten a preprocessed 10-Q JSON file to text, describing each table.

  ``"Title"`` and ``"NarrativeText"`` elements contribute their ``'text'``
  surrounded by newlines. For every ``"Table"`` element the preceding
  context is gathered with ``extract_related_table_context``, a description
  is requested from ``get_table_description``, and the description is
  written on the line before the table's HTML
  (``['metadata']['text_as_html']``).

  Args:
    input_file_location: Path of the JSON file to read.
    output_file_location: Path of the text file to create or overwrite.

  Raises:
    KeyError: If an element lacks one of the keys read for its type.
  """
  data = load_input_file(input_file_location)
  # Explicit UTF-8: both the filing text and the generated descriptions may
  # contain characters the platform default encoding cannot represent.
  with open(output_file_location, 'w', encoding='utf-8') as file:
    for i, item in enumerate(data):
      item_type = item['type']
      if item_type in ("Title", "NarrativeText"):
        file.write('\n' + item['text'] + '\n')
      elif item_type == "Table":
        table_content = item['metadata']['text_as_html']
        table_context = extract_related_table_context(i, data)
        description = get_table_description(table_content, table_context)
        file.write(description + '\n' + table_content)
      # Every other element type (e.g. "Footer", "ListItem") is dropped.

In [57]:
# table_content = '<table><thead><tr><th></th><th colspan="2">Three Months Ended</th><th colspan="2">Six Months Ended</th></tr><tr><th></th><th>March 30, 2024</th><th>April 1, 2023</th><th>March 30, 2024</th><th>April 1, 2023</th></tr></thead><tbody><tr><td>Net income</td><td>23,636 $</td><td>$ 24,160</td><td>$ 57,552</td><td>$ 54,158</td></tr><tr><td colspan="5">Other comprehensive income/(loss):</td></tr><tr><td>Change in foreign currency translation, net of tax</td><td>(322)</td><td>(95)</td><td>(14)</td><td>(109)</td></tr><tr><td colspan="5">Change in unrealized gains/losses on derivative instruments, net of tax:</td></tr><tr><td>Change in fair value of derivative instruments</td><td>456</td><td>(13)</td><td>(75)</td><td>(1,001)</td></tr><tr><td>Adjustment for net (gains)/losses realized and included in net income</td><td>232</td><td>(191)</td><td>(591)</td><td>(1,957)</td></tr><tr><td>Total change in unrealized gains/losses on derivative instruments</td><td>688</td><td>(204)</td><td>(666)</td><td>(2,958)</td></tr><tr><td colspan="5">Change in unrealized gains/losses on marketable debt securities, net of tax:</td></tr><tr><td>Change in fair value of marketable debt securities</td><td>@)</td><td>1,403</td><td>3,038</td><td>2,303</td></tr><tr><td>Adjustment for net (gains)/losses realized and included in net income</td><td>59</td><td>62</td><td>134</td><td>127</td></tr><tr><td>Total change in unrealized gains/losses on marketable debt securities</td><td>52</td><td>1,465</td><td>3,172</td><td>2,430</td></tr><tr><td>Total other comprehensive income/(loss)</td><td>418</td><td>1,166</td><td>2,492</td><td>(637)</td></tr><tr><td>Total comprehensive income</td><td>24,054 $</td><td>$ 25,326</td><td>$ 60,044</td><td>$ 53,521</td></tr></tbody></table>'
# title = 'CONDENSED CONSOLIDATED STATEMENTS OF COMPREHENSIVE INCOME (Unaudited)'
# table_context = 'Title: ' + title + 'from the company Apple'
# table_description = get_table_description(table_content, table_context)

In [None]:
# Advanced conversion of the Apple Q2 2024 10-Q; output goes to ./data/final_v2_data/.
# Each table element triggers one get_table_description call.
advanced_convert_10Q_to_txt('./data/preprocessed_pdf_data/apple_10-Q-Q2-2024-As-Filed-Copy1.pdf.json', './data/final_v2_data/apple_10-Q-Q2-2024-As-Filed-Copy1.pdf.json.txt')

In [None]:
# Advanced conversion of the Apple Q3 2024 10-Q; output goes to ./data/final_v2_data/.
# Each table element triggers one get_table_description call.
advanced_convert_10Q_to_txt('./data/preprocessed_pdf_data/apple_10-Q-Q3-2024-As-Filed-Copy1.pdf.json', './data/final_v2_data/apple_10-Q-Q3-2024-As-Filed-Copy1.pdf.json.txt')

In [64]:
def convert_news_to_txt(input_file_location, output_file_location):
  """Write the title and description of each news item to a text file.

  Items whose ``'title'`` equals ``'[Removed]'`` are skipped. Every other
  item is written as a blank line, the title, and the description, each on
  its own line.

  Args:
    input_file_location: Path of the JSON file holding the news items
      (an iterable of dicts with a ``'title'`` key).
    output_file_location: Path of the text file to create or overwrite.

  Raises:
    KeyError: If an item has no ``'title'`` key.
  """
  data = load_input_file(input_file_location)
  # Explicit UTF-8 so headlines with non-ASCII characters can be written
  # regardless of the platform default encoding.
  with open(output_file_location, 'w', encoding='utf-8') as file:
    for item in data:
      title = item['title']
      if title == '[Removed]':
        continue
      # A missing or null description would make the concatenation raise;
      # write an empty description line instead of aborting the export.
      description = item.get('description') or ''
      file.write('\n' + title + '\n' + description + '\n')

In [38]:
# Export the raw Apple news JSON to a text file under ./data/final_data/.
convert_news_to_txt('./data/_raw_data/apple-recent-news.json', './data/final_data/apple-recent-news.json.txt')