In [None]:
import os
import re

import fitz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdfplumber
import seaborn as sns
from transformers import pipeline

def extract_text_and_tables_correctly(pdf_path):
    """Extract the text of every page of a PDF, rendering each page's table once.

    Text blocks are read with ``fitz`` and sorted by ``(block[1], block[0])``
    (the block's top and left coordinates in PyMuPDF's block tuple). If
    pdfplumber's ``extract_table()`` returns a table for the page, it is
    rendered with ``DataFrame.to_string`` and placed at the START of that
    page's output under an ``[Extracted Table]`` marker; it is not positioned
    where the table sits on the page. Text blocks whose stripped text exactly
    equals one rendered table line are skipped to limit duplication.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The per-page outputs joined with newlines.
    """
    final_output = []

    with fitz.open(pdf_path) as doc, pdfplumber.open(pdf_path) as pdf:
        for page_num in range(len(doc)):
            text_blocks = doc[page_num].get_text("blocks")
            text_blocks.sort(key=lambda block: (block[1], block[0]))

            table = pdf.pages[page_num].extract_table()

            final_text = []
            # Reset for every page: without this, a page with no table either
            # raised NameError (first page) or filtered its text against the
            # table lines left over from an earlier page.
            table_text_lines = set()

            if table:
                # First extracted row is used as the header row.
                df = pd.DataFrame(table[1:], columns=table[0])
                table_text = df.to_string(index=False)
                table_text_lines = set(table_text.split("\n"))
                # Emit the table up front, even when the page has no text blocks.
                final_text.append("\n[Extracted Table]\n" + table_text)

            for block in text_blocks:
                block_text = block[4].strip()  # block[4] holds the block's text

                # Only blocks identical to a rendered table line are dropped.
                if block_text in table_text_lines:
                    continue

                final_text.append(block_text)

            final_output.append("\n".join(final_text))

    return "\n".join(final_output)

# WORK IN PROGRESS (not fully completed)
def is_relevant_image(image):
    """Return True when an image looks like a graph/chart, False for tables or text.

    Two heuristics are applied in order:
      1. Edge contours are extracted; if more than 10 of them have a bounding
         box wider than 100 and taller than 20, the image is treated as a table.
      2. OCR is run on the grayscale image; more than 50 characters of
         (stripped) text means the image is treated as a document or table.

    Args:
        image: Image array passed to ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``
            (presumably a BGR array; confirm against callers).

    Returns:
        bool: True if neither heuristic rejects the image.
    """
    # NOTE(review): cv2 and pytesseract are not imported in the visible cells;
    # confirm they are imported before this function is called.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
    edge_map = cv2.Canny(smoothed, 50, 150)

    outlines, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Count contours whose bounding box is wide and tall enough to be a table cell/row.
    box_like = 0
    for outline in outlines:
        rect = cv2.boundingRect(outline)  # rect[2] = width, rect[3] = height
        if rect[2] > 100 and rect[3] > 20:
            box_like += 1
    if box_like > 10:
        return False  # Likely a table

    # Heavy OCR text indicates a document page or table rather than a chart.
    ocr_text = pytesseract.image_to_string(grayscale)
    return len(ocr_text.strip()) <= 50

# WORK IN PROGRESS (not fully completed)
def extract_graphs_and_charts(pdf_path, output_folder="extracted_graphs"):
    """Save and display the images of a PDF that pass ``is_relevant_image``.

    Every image returned by ``convert_from_path(pdf_path)`` (presumably one
    rendered image per page; confirm) is converted to an OpenCV-style array
    and checked with ``is_relevant_image``. Accepted images are written to
    ``output_folder`` as ``page_<n>.png`` (1-based), announced with a printed
    header, and shown with ``display``.

    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Directory for the saved PNG files; created if missing.

    Returns:
        list[str]: Paths of the PNG files that were saved, in page order.
        (Previously the list was built but never returned.)
    """
    # NOTE(review): convert_from_path and cv2 are not imported in the visible
    # cells, and `display` is only a builtin inside IPython/Jupyter; confirm
    # they are available before calling this function.
    os.makedirs(output_folder, exist_ok=True)  # no exists()/makedirs() race

    images = convert_from_path(pdf_path)
    image_paths = []

    for page_number, image in enumerate(images, start=1):
        img_path = os.path.join(output_folder, f"page_{page_number}.png")

        # Convert to OpenCV format (RGB -> BGR channel order).
        open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        if is_relevant_image(open_cv_image):
            image.save(img_path, "PNG")
            image_paths.append(img_path)
            print(f"\n[Graph/Chart from Page {page_number}]")
            display(image)  # Show the image

    return image_paths


def parse_financial_data(text):
    """Pull dollar figures for revenues, operating income and net income out of text.

    For each metric, every occurrence of the label followed by a ``$`` sign and
    a run of digits/commas is collected (only the first figure after each
    label). The matches are printed and returned as raw strings.

    Args:
        text: Text to search.

    Returns:
        dict: ``{"revenue": [...], "operating_income": [...], "net_income": [...]}``
        where each value is a list of matched number strings (commas kept).
    """
    patterns = {
        "revenue": r"Revenues\s*[\$]\s*([\d,]+)",
        "operating_income": r"Operating income\s*[\$]\s*([\d,]+)",
        "net_income": r"Net income\s*[\$]\s*([\d,]+)",
    }
    financial_data = {metric: re.findall(regex, text) for metric, regex in patterns.items()}

    print("Extracted Revenues:", financial_data["revenue"])
    print("Extracted Operating Income:", financial_data["operating_income"])
    print("Extracted Net Income:", financial_data["net_income"])

    return financial_data

def summarize_text(text):
    """Summarize text by summarizing fixed-size character chunks and joining them.

    The text is cut into consecutive 1000-character slices; each slice is sent
    to a ``facebook/bart-large-cnn`` summarization pipeline and the returned
    ``summary_text`` values are joined with single spaces.

    Args:
        text: Text to summarize.

    Returns:
        str: The combined summary; ``"No summary available."`` when nothing was
        produced; ``"Summarization failed."`` when any exception occurred
        (the exception is printed, not raised).
    """
    chunk_size = 1000
    try:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

        pieces = []
        for chunk in chunks:
            output = summarizer(chunk, max_length=200, min_length=50, do_sample=False)
            has_summary = output and isinstance(output, list) and "summary_text" in output[0]
            if not has_summary:
                print("Warning: Summarization did not return any results for a chunk.")
                continue
            pieces.append(output[0]["summary_text"])

        combined = " ".join(pieces).strip()
        return combined if combined else "No summary available."
    except Exception as e:
        print(f"Error during summarization: {e}")
        return "Summarization failed."
def plot_financial_trends(financial_data):
    """Plot revenue against operating income as two line series.

    The ``"revenue"`` and ``"operating_income"`` entries are lists of number
    strings (commas allowed). Both are converted to ints, truncated to the
    length of the shorter one, and plotted on one figure. Nothing is plotted
    if either series ends up empty. Any exception is printed, not raised.

    Args:
        financial_data: Mapping with ``"revenue"`` and ``"operating_income"`` keys.

    Returns:
        None
    """
    def _as_ints(key, missing_message):
        # Convert "1,234"-style strings to ints; report and return [] when empty.
        raw_values = financial_data[key]
        if not raw_values:
            print(missing_message)
            return []
        return [int(raw.replace(",", "")) for raw in raw_values]

    try:
        revenue_series = _as_ints("revenue", "No revenue data found to plot.")
        income_series = _as_ints("operating_income", "No operating income data found to plot.")

        # Align both series on the shorter length so the x-axes match.
        shared_length = min(len(revenue_series), len(income_series))
        revenue_series = revenue_series[:shared_length]
        income_series = income_series[:shared_length]
        if not (revenue_series and income_series):
            print("Insufficient data for plotting financial trends.")
            return

        plt.figure(figsize=(10, 6))
        plt.plot(revenue_series, marker='o', label="Revenue")
        plt.plot(income_series, marker='s', label="Operating Income")
        plt.legend()
        plt.title('Financial Trends (Revenue vs Operating Income)')
        plt.xlabel('Data Points (e.g., Quarters)')
        plt.ylabel('Amount ($ millions)')
        plt.show()
    except Exception as e:
        print(f"Error while plotting financial trends: {e}")

if __name__ == "__main__":
    # End-to-end run: extract text, parse figures, summarize, then plot.
    # pdf_text, financial_data and summary stay at module level, so later
    # notebook cells can read them.
    pdf_path = r"C:\Users\Aaarat\Downloads\2024q1-alphabet-earnings-release-pdf.pdf"
    pdf_text = extract_text_and_tables_correctly(pdf_path)

    if not pdf_text:
        print("No text extracted from PDF; please check the file.")
    else:
        financial_data = parse_financial_data(pdf_text)
        print("Financial Data Extracted:", financial_data)
        summary = summarize_text(pdf_text)
        print("Document Summary:", summary)
        plot_financial_trends(financial_data)


In [None]:
# Text to analyse: the string produced by the PDF-extraction cell.
text = pdf_text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Resources needed by sent_tokenize / word_tokenize and the stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' models; without it sent_tokenize raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Split text into sentences and remove English stop words from each one.

    Each sentence is word-tokenized, tokens whose lowercase form is in NLTK's
    English stop-word list are dropped, and the remaining tokens are re-joined
    with single spaces.

    Args:
        text: Text to process.

    Returns:
        list[str]: One filtered string per sentence, in original order.
    """
    sentence_list = sent_tokenize(text)
    english_stop_words = frozenset(stopwords.words('english'))
    return [
        ' '.join(
            token
            for token in word_tokenize(sentence)
            if token.lower() not in english_stop_words
        )
        for sentence in sentence_list
    ]

# Stop-word-filtered sentences; read by the SWOT classification cell.
processed_sentences = preprocess_text(text)
print(processed_sentences)


In [None]:
from transformers import pipeline

# Zero-shot classifier used to assign each sentence to one SWOT label.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

labels = ["Strength", "Weakness", "Opportunity", "Threat"]
# Candidate label -> plural key under which matching sentences are collected.
label_to_bucket = dict(zip(labels, ["Strengths", "Weaknesses", "Opportunities", "Threats"]))
swot_results = {bucket: [] for bucket in label_to_bucket.values()}

for sentence in processed_sentences:
    result = classifier(sentence, candidate_labels=labels)
    # Use the first label in the result (presumably the highest-scoring one;
    # confirm against the transformers pipeline docs).
    label = result['labels'][0]
    bucket = label_to_bucket.get(label)
    if bucket is not None:
        swot_results[bucket].append(sentence)

print(swot_results)


In [None]:
def display_swot_results(swot_results):
    """Print every SWOT category followed by its sentences as a dashed list.

    Args:
        swot_results: Mapping of category name to a list of sentences.

    Returns:
        None. Output goes to stdout: a blank line, ``<category>:``, a blank
        line, then one ``- <sentence>`` line per sentence.
    """
    for heading in swot_results:
        print(f"\n{heading}:\n")
        for entry in swot_results[heading]:
            print(f"- {entry}")

# Print the classified sentences grouped by SWOT category.
display_swot_results(swot_results)
