# In [1]:
import os
import pdfplumber
import openai
import time
from openai import ChatCompletion
from openai.error import RateLimitError, ServiceUnavailableError, APIError
import re


def extract_dynamic_string(string):
    """Return the alphanumeric token that follows a known source prefix.

    The prefixes 'Culper_', 'NASDAQ_' and 'Glasshouse ' are tried in that
    order; the first one found anywhere in ``string`` decides the result.

    Args:
        string: Text to search, typically a file name.

    Returns:
        The run of ASCII letters/digits directly after the matching prefix,
        or the literal string 'No match found' when no prefix matches.
    """
    markers = ('Culper_', 'NASDAQ_', 'Glasshouse ')
    # Lazily evaluate one search per prefix so list order keeps its priority.
    searches = (re.search(marker + '([a-zA-Z0-9]+)', string) for marker in markers)
    hit = next((found for found in searches if found), None)
    if hit is None:
        return 'No match found'
    return hit.group(1)

# Maps each file name found in the summaries directory to the company name
# parsed from it with extract_dynamic_string(); it is filled after the
# summaries have been written and read when the final headers are produced.
Company_names={}

def remove_non_numeric_paragraphs(text):
    """Keep only the lines of ``text`` that begin with a digit.

    Leading/trailing whitespace is ignored when testing a line, but lines
    that pass are returned unmodified (their original whitespace is kept).

    Args:
        text: Newline-separated text.

    Returns:
        The surviving lines joined with '\\n' (an empty string if none).
    """
    numbered = [
        line
        for line in text.split('\n')
        if re.match(r'^\d+', line.strip())
    ]
    return '\n'.join(numbered)

# Set up the OpenAI GPT API key.
# SECURITY: the key is read from the environment so that it never lives in the
# source file. Any key that has ever been stored in this file must be treated
# as leaked and revoked in the OpenAI dashboard.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast with a clear message instead of failing on the first request.
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this script."
    )

# Specify the root of the directory you want to iterate through
root_dir = "/Users/ferdinandhgjesdahl/Documents/pdffiler"

# Create a new directory for the summaries (no error if it already exists)
summaries_dir = '/Users/ferdinandhgjesdahl/Documents/txtfiler'
os.makedirs(summaries_dir, exist_ok=True)



# Bounds for the retry loop around the chat API: give up after this many
# attempts and never sleep longer than this many seconds between attempts.
_MAX_CHAT_ATTEMPTS = 8
_MAX_BACKOFF_SECONDS = 60


def _chat_with_backoff(system_prompt, user_prompt, document_text):
    """Send one chat request and return the reply text, retrying transient errors.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message (the instruction).
        document_text: The text the instruction applies to.

    Returns:
        The content of the first choice in the API response.

    Raises:
        RateLimitError, ServiceUnavailableError, APIError, TimeoutError:
            re-raised once ``_MAX_CHAT_ATTEMPTS`` attempts have failed.
        Any other exception raised by the API call propagates immediately
        instead of being retried, since waiting would not make it succeed.
    """
    backoff_time = 1  # Initial wait; doubled after every failed attempt
    for attempt in range(1, _MAX_CHAT_ATTEMPTS + 1):
        try:
            chat = ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                    # NOTE(review): the document is sent under the "assistant"
                    # role, as before; it may belong in the user message
                    # instead - confirm the intended prompt layout.
                    {"role": "assistant", "content": document_text},
                ],
            )
        except (RateLimitError, ServiceUnavailableError, APIError, TimeoutError):
            if attempt == _MAX_CHAT_ATTEMPTS:
                raise
            print(f'Rate limit or service unavailable, or timeout occurred, waiting for {backoff_time} seconds...')
            time.sleep(backoff_time)
            backoff_time = min(backoff_time * 2, _MAX_BACKOFF_SECONDS)
        else:
            return chat['choices'][0]['message']['content']


# Iterate through all the subdirectories
for dirpath, dirnames, filenames in os.walk(root_dir):
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue

        pdf_path = os.path.join(dirpath, filename)  # Full path to the pdf file
        pdf_name = os.path.basename(pdf_path)  # Extract the file name from the path
        company_name = extract_dynamic_string(pdf_name)  # Extract the company name from the file name

        # Read the PDF file. A page without extractable text may yield None,
        # so substitute an empty string rather than crash on concatenation.
        with pdfplumber.open(pdf_path) as pdf:
            all_text = "".join(pdf_page.extract_text() or "" for pdf_page in pdf.pages)

        # Split the text into parts of at most 2048 characters
        text_parts = [all_text[i:i + 2048] for i in range(0, len(all_text), 2048)]

        # Summarize each part with GPT
        all_summaries = [
            _chat_with_backoff(
                "You are a helpful assistant.",
                "Summarize the document, give me the most important points.",
                part,
            )
            for part in text_parts
        ]

        # Merge all summaries into a single string, with the company title first
        final_summary = f"Title: {company_name}\n" + "\n".join(all_summaries)

        # Split the merged summary into parts of at most 4096 characters; a
        # shorter summary simply yields a single part.
        final_summary_parts = [
            final_summary[i:i + 4096] for i in range(0, len(final_summary), 4096)
        ]

        final_summaries = [
            _chat_with_backoff(
                "You are a financial analyst assistant.",
                "List 10 reasons why the stock might be a potential short sale based on the information.",
                final_summary_part,
            )
            for final_summary_part in final_summary_parts
        ]

        short_or_not_summary = '\n'.join(final_summaries)

        # Save the summary with the new prompt in a text file
        summary_filename = f'ShortOrNotsummary_{pdf_name}.txt'
        with open(os.path.join(summaries_dir, summary_filename), 'w') as f:
            f.write(short_or_not_summary)

def _keep_numbered_paragraphs(lines):
    """Return the lines of paragraphs that start with a digit, renumbered.

    A paragraph is kept when the line right after a blank line starts with a
    digit; it then runs until the next blank line. Inside a kept paragraph
    every line starting with a digit has its leading digits replaced by a
    running counter (1, 2, 3, ...). The separating blank lines are kept too.

    NOTE(review): a numbered list that is not preceded by a blank line (for
    example one starting on the very first line) is dropped - confirm this
    is intended.

    Args:
        lines: Lines as returned by ``readlines()`` (newline-terminated).

    Returns:
        The list of kept, renumbered lines.
    """
    keep = False
    kept_lines = []
    counter = 1  # Next number to assign to a point
    for i, line in enumerate(lines):
        # Outside a kept paragraph only blank lines are inspected further.
        if line != "\n" and not keep:
            continue

        if line[0].isdigit():
            # Replace the initial digits with the counter value
            line = re.sub(r'^\d+', str(counter), line)
            counter += 1
        elif line == "\n":
            # A blank line starts a kept paragraph only if the next line
            # starts with a digit; otherwise it ends the current one.
            keep = i + 1 < len(lines) and lines[i + 1][0].isdigit()

        kept_lines.append(line)
    return kept_lines


def _separate_numbered_points(text):
    """Insert a newline before every point number ("N." or "NN.") in ``text``.

    A point number is one or two digits that are not preceded by a digit,
    followed by '.', followed by a character that is not a digit (so
    decimals such as "2.5" are left alone). The first character of the text
    never receives a leading newline.

    Args:
        text: The text to reformat.

    Returns:
        ``text`` with the extra newlines inserted.
    """
    size = len(text)
    pieces = []
    for i, char in enumerate(text):
        starts_point = False
        if i > 0 and char.isdigit() and not text[i - 1].isdigit():
            one_digit = (
                i < size - 2
                and text[i + 1] == '.'
                and not text[i + 2].isdigit()
            )
            # The middle character must be a digit too; otherwise text such
            # as "5%." would wrongly be treated as a two-digit point number.
            two_digit = (
                i < size - 3
                and text[i + 1].isdigit()
                and text[i + 2] == '.'
                and not text[i + 3].isdigit()
            )
            starts_point = one_digit or two_digit
        if starts_point:
            pieces.append('\n')
        pieces.append(char)
    return ''.join(pieces)


# Remove all other txt files except those starting with "ShortOrNotsummary"
for file in os.listdir(summaries_dir):
    if file.endswith(".txt") and not file.startswith("ShortOrNotsummary"):
        os.remove(os.path.join(summaries_dir, file))

for file in os.listdir(summaries_dir):
    Company_names[file] = extract_dynamic_string(file)

# Iterate through all files in the summaries directory
for file in os.listdir(summaries_dir):
    if not (file.endswith(".txt") and file.startswith("ShortOrNotsummary")):
        continue

    file_path = os.path.join(summaries_dir, file)
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Filter and renumber in memory, then keep only the lines that start with
    # a digit. (No intermediate rewrite of the file is needed for this.)
    kept_lines = _keep_numbered_paragraphs(lines)
    text1 = remove_non_numeric_paragraphs(''.join(kept_lines))

    with open(file_path, 'w') as f:
        # Use the name parsed from THIS file, not a variable left over from
        # the PDF loop (which holds the last processed PDF, or nothing).
        centered_name = Company_names[file].center(80)  # Adjust the width (80) as needed
        f.write(centered_name)
        f.write('\n')
        f.write('\n')
        f.write(_separate_numbered_points(text1))
                

# Output: Rate limit or service unavailable, or timeout occurred, waiting for 1 seconds...
