# MODEL BUILD

In [1]:
import os
import io
import json
import pandas as pd
from dotenv import load_dotenv
import PyPDF2
import time
from azure.storage.blob import ContainerClient
# from openai import AzureOpenAI

import google.generativeai as genai
import os

from concurrent.futures import ThreadPoolExecutor, as_completed

# Pull settings from a local .env file into the process environment
load_dotenv()

# Build the Azure Blob Storage container client from the environment settings.
# Both values are None when the corresponding variable is not set.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
container_name = os.environ.get("CONTAINER_NAME")
container_client = ContainerClient.from_connection_string(conn_str=connection_string, container_name=container_name)

# Function to extract text from a PDF file stream
def extract_text_from_pdf(file_stream):
    """Return the concatenated text of every page in a PDF stream.

    Pages are joined without any separator. Extraction is best-effort: if
    reading fails part-way, the error is printed and whatever text was
    collected so far (possibly the empty string) is returned.
    """
    page_texts = []
    try:
        for page in PyPDF2.PdfReader(file_stream).pages:
            # Pages that yield no text contribute nothing.
            page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    return "".join(page_texts)

# Function to truncate text to fit within the token limit
# def truncate_text(input_text, max_tokens):
#     tokens = input_text.split()  # Naive tokenization by splitting on spaces
#     if len(tokens) > max_tokens:
#         truncated_text = ' '.join(tokens[:max_tokens])
#         return truncated_text
#     return input_text


def _send_with_retry(chat, message, k, require_text=False, max_attempts=3, wait_seconds=15):
    """Send *message* on *chat*, retrying after a pause whenever it raises.

    Args:
        chat: Chat session exposing ``send_message``.
        message: Prompt text to send.
        k: API key index, used only in the log output.
        require_text: When true, the reply's ``.text`` is read inside the
            retried section, so a reply whose text cannot be read counts as
            a failed attempt.
        max_attempts: Total number of tries before giving up.
        wait_seconds: Pause between tries.

    Returns:
        The reply object, or None once every attempt has failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = chat.send_message(message)
            if require_text:
                _ = response.text  # an unreadable reply should trigger a retry
            return response
        except Exception as e:
            if attempt == max_attempts:
                print("!!!!!Couldn't Resolve!!!!!!")
                return None
            print(f"Error Occoured for API {k}: {e}\n Retrying...{attempt + 1}")
            time.sleep(wait_seconds)
    return None


def get_json(file_stream, k):
    """Ask Gemini for the company / year / quarter / capex of one PDF report.

    The PDF text is sent as a first chat message together with the list of
    metrics to look for; a second message then asks for the answer in a
    fixed JSON structure.

    Args:
        file_stream: Binary stream holding the PDF.
        k: Index of the API key to use; the key is read from the
            ``GEMINI_API_KEY_{k}`` environment variable.

    Returns:
        The raw text of the model's second reply (expected to contain a JSON
        object), or None when no text could be extracted from the PDF or
        when either request still fails after three attempts.
    """
    # NOTE(review): genai.configure appears to set library-wide state, and this
    # function is submitted to a thread pool with different values of k, so
    # one call may overwrite the key another is about to use — confirm.
    genai.configure(api_key=os.getenv(f"GEMINI_API_KEY_{k}"))

    # Extract text from PDF
    input_context = extract_text_from_pdf(file_stream)
    if not input_context.strip():
        # Nothing to analyse: do not spend API quota on an empty document.
        print(f"No text could be extracted from the PDF; skipping request for API {k}.")
        return None

    instructions = """
     As a Financial Analyst, you will leverage your expertise to generate tailored Financial Analysis 
     Reports that cater to the specific requirements of clients. 
    Key Objectives:

    Extract and Present: Provide detailed figures for each metric listed above, ensuring accurate and up-to-date data.
    Categorize Clearly: Organize the metrics under their respective categories as outlined.
    Ensure Completeness: Verify that all relevant metrics are included and presented with sufficient context for accurate interpretation.
    Highlight Capital Expenditure: Ensure that capital expenditure-related metrics are prominently detailed and clearly separated from other categories.
    """
    model = genai.GenerativeModel(model_name="gemini-1.5-flash", system_instruction=instructions)
    client = model.start_chat()

    user_content = f"""
    To extract and organize financial metrics from the report, including those related to capital expenditure, profitability, liquidity, solvency, cash flow, and other key indicators. The metrics should be categorized and detailed according to their relevance to different aspects of financial analysis. 

    Operating Activities "Change in Working Capital",
            "Net Cash from Operating Activities"
    Investing Activities has "Acquisition of Fixed Assets & Intangibles",
            "Net Cash from Investing Activities"
    Financing Activities includes "Dividends Paid",
            "Cash from (Repayment of) Debt",
            "Net Cash from Financing Activities",
    Net Change includes  Net Change in Cash
    Metadata includes "Report Date", "Publish Date", "Source"
    Profitability Metrics like "EBITDA",
            "Gross Profit Margin",
            "Operating Margin",
            "Net Profit Margin",
            "Return on Equity",
            "Return on Assets",
            "Return On Invested Capital",
    Liquidity Metrics include Current Ratio
    Solvency Metrics like  "Total Debt", "Liabilities to Equity Ratio", "Debt Ratio",
    Cash Flow Metrics like "Free Cash Flow", "Free Cash Flow to Net Income", "Cash Return On Invested Capital",
    Other Important Metrics like "Piotroski F-Score", "Net Debt / EBITDA", "Dividend Payout Ratio
    Capital Expenditure 
    ```Document
    {input_context}
    ```
    """

    # JSON output structure request
    final_content = """
    Please provide the company name, year and quarter, capex value in billions of US dollars and no need of use $ symbol 
    in JSON format.
    Use the following JSON structure:
    ```json
    {
        "company": "",
        "year": "",
        "quarter": "",
        "capex": ""    
    }
    ```

    Example:
    ```json
    {
        "company": "abc",
        "year": "2023",
        "quarter": "3",
        "capex": "4.9"    
    }
    ```

    **Note: Only JSON Data is required, no other text is required.**
    """

    print(f"*************API - {k} - IN USE***************")

    # First turn: hand the document to the model. The reply itself is not used.
    if _send_with_retry(client, user_content, k) is None:
        return None

    # Second turn: ask for the structured JSON answer.
    response_json = _send_with_retry(client, final_content, k, require_text=True)
    if response_json is None:
        return None
    print(f"done - {response_json.text}")
    return response_json.text

def _parse_json_object(text):
    """Decode the span from the first ``{`` to the last ``}`` of *text*.

    Returns the decoded dict, or None when there is no such span or it is
    not a valid JSON object.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        parsed = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def process_blob(blob, k=1, retries=2):
    """Download one PDF blob and extract its capex record via ``get_json``.

    Args:
        blob: Blob entry to download; its ``name`` is used in log messages.
        k: API key index forwarded to ``get_json``.
        retries: Total number of extraction attempts.

    Returns:
        The parsed dict when it carries a non-empty ``"capex"`` value.
        None when the download fails, ``get_json`` raises, or every attempt
        yields no reply, an unparsable reply, or an empty capex.
    """
    # Download once; every attempt re-reads the same bytes.
    try:
        stream = io.BytesIO()
        container_client.download_blob(blob).readinto(stream)
        pdf_bytes = stream.getvalue()
    except Exception as e:
        print(f"Error processing blob {blob.name}: {e}")
        return None

    for attempt in range(1, retries + 1):
        try:
            raw = get_json(io.BytesIO(pdf_bytes), k)
        except Exception as e:
            print(f"Error processing blob {blob.name}: {e}")
            return None

        if raw:
            record = _parse_json_object(raw)
            if record is None:
                # A malformed reply is worth another attempt, not an abort.
                print(f"Response is not valid JSON. Retrying for blob {blob.name}...")
            elif record.get("capex"):  # Check if capex is not empty
                return record
            else:
                print(f"Capex is empty. Retrying for blob {blob.name}...")

        if attempt < retries:
            time.sleep(5)  # Wait before retrying

    print(f"Max retries reached for blob {blob.name}.")
    return None

# function to process blobs in parallel
def get_capex_info(companies, num_api_keys=7, max_workers=4):
    """Extract capex records for the given companies' blobs in parallel.

    Blobs are kept when the first ``/``-separated segment of their name is in
    *companies*. Each kept blob is handed to ``process_blob`` on a thread
    pool, with API key indices assigned round-robin.

    Args:
        companies: Collection of company folder names to include.
        num_api_keys: Number of API key indices to rotate through
            (indices ``0 .. num_api_keys - 1``).
        max_workers: Size of the thread pool.

    Returns:
        ``(df, records)`` where *records* is the list of extracted dicts and
        *df* is a DataFrame built from them, or ``(None, None)`` when nothing
        was extracted.
    """
    blob_list = [
        blob for blob in container_client.list_blobs()
        if blob.name.strip().split('/')[0] in companies
    ]
    print(f"NUMBER OF FILES: {len(blob_list)}")

    final_json = []
    completed = 0
    # Using ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_blob = {
            executor.submit(process_blob, blob, k % num_api_keys): blob
            for k, blob in enumerate(blob_list)
        }
        # Handle results in completion order
        for future in as_completed(future_to_blob):
            blob = future_to_blob[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"Error processing result for blob {blob.name}: {e}")
                continue
            # Only non-empty dict records count as completed files.
            if result and isinstance(result, dict):
                completed += 1
                print(f"{completed} files completed.")
                print(result, "\n\n")
                final_json.append(result)

    if not final_json:
        print("No valid data to save to df.")
        return None, None

    df = pd.DataFrame(final_json)
    print(f"Total {completed} files completed.")
    print("Processing completed. Results saved to df.")
    return df, final_json



  from .autonotebook import tqdm as notebook_tqdm


# Cleaning the data 

In [2]:
#

# Read the original CSV file
# input_file = "all_1.csv"
# df = pd.read_csv(input_file)

def clean_df(df):
    """Normalise the ``company``, ``capex``, ``year`` and ``quarter`` columns.

    Known spelling variants are replaced by one canonical value per column
    (exact matches only), missing-capex phrases become ``'-'``, and any
    double quote left in a company name is replaced by a space. *df* is
    modified in place and also returned.
    """
    def _alias_table(groups):
        # {canonical: (variants...)}  ->  {variant: canonical}
        return {variant: canonical for canonical, variants in groups.items() for variant in variants}

    company_lookup = _alias_table({
        'Amazon': ('Amazon', 'Amazon.com, Inc.', 'Amazon.com Inc.'),
        'Alphabet': ('Alphabet Inc',),
        'Goldman Sachs Group': ('Goldman Sachs', 'The Goldman Sachs Group, Inc.', 'Goldman Sachs Group'),
        'Tesla': ('Tesla', 'Tesla, Inc.', 'Tesla Motors, Inc.', 'Tesla Motors'),
        'Valero Energy': ('Valero Energy Corporation',),
        'Cardinal Health': ('Cardinal Health', '"Cardinal Health, Inc."'),
        'Berkshire Hathaway': ('BERKSHIRE HATHAWAY INC.', 'Berkshire Hathaway Inc'),
        'Chevron': ('Chevron Corporation',),
    })

    # Phrases the model uses when no capex figure was found.
    missing_capex = dict.fromkeys(
        (
            'Not provided in the report',
            'nan',
            'Not provided',
            'Not Provided',
            'Not mentioned in the report',
            'X',
            'Not available',
            'Not specified',
            'n/a',
            'X.XX',
            'Not provided in the Report',
            'Not Provided in the Report',
            'Not available in the provided document',
            'Not disclosed',
        ),
        '-',
    )

    year_lookup = {
        'FY16': '2016',
        'FY19': '2019',
        'FY20': '2020',
        'FY21': '2021',
        'FY23': '2023',
        'FY24': '2024',
        'FY2024': '2024',
    }

    # Quarter variants include both integer and string spellings.
    quarter_lookup = _alias_table({
        'Q1': ('First Quarter', 'First', '1Q', 1, '1'),
        'Q2': ('Second Quarter', 'Second', 2, '2'),
        'Q3': ('Third Quarter', 'Third', 3, '3'),
        'Q4': ('Fourth Quarter', 'Fourth', 'fourth', '4Q', 4, '4'),
    })

    df['company'] = df['company'].replace(company_lookup)
    df['capex'] = df['capex'].replace(missing_capex)
    # Quotes that survived the alias lookup are turned into spaces.
    df['company'] = df['company'].str.replace('"', ' ', regex=False)
    df['year'] = df['year'].replace(year_lookup)
    df['quarter'] = df['quarter'].replace(quarter_lookup)

    print("cleaned data saved")
    return df

## Rearrange capex into one row per company with one column per quarter_year

# Read the original CSV file
# input_file = "cleaned_financial_analysis_results1.csv"
# df = pd.read_csv(input_file)
# input_file = output_file
# df = input_file
def rearrange_df(df):
    """Pivot capex records into one row per company, one column per quarter.

    A ``Quarter-Year`` column of the form ``"<quarter>_<year>"`` is added to
    *df* (in place), then capex is pivoted on it, keeping the first value
    when a company/quarter pair occurs more than once.

    Args:
        df: DataFrame with ``company``, ``quarter``, ``year`` and ``capex``
            columns.

    Returns:
        A new DataFrame whose first column is ``company`` and whose remaining
        columns are the quarter labels, ordered from latest to earliest
        (by year, then quarter, compared as strings).
    """
    # Convert 'quarter' and 'year' columns into a single 'Quarter-Year' column with an underscore
    df['Quarter-Year'] = df['quarter'] + "_" + df['year'].astype(str)

    # Pivot, then turn the 'company' index back into a regular column
    pivot_df = df.pivot_table(
        index='company', columns='Quarter-Year', values='capex', aggfunc='first'
    ).reset_index()

    def _recency_key(label):
        # Split on the LAST underscore so an underscore inside the quarter
        # part cannot be mistaken for the quarter/year separator.
        quarter, _, year = label.rpartition('_')
        return (year, quarter)

    company_col, *quarter_cols = pivot_df.columns.tolist()
    ordered_columns = [company_col] + sorted(quarter_cols, key=_recency_key, reverse=True)

    # Explicit copy: the column below is assigned on the reordered frame, and
    # doing that on a selection of another frame triggers SettingWithCopyWarning.
    pivot_df = pivot_df[ordered_columns].copy()
    pivot_df['company'] = pivot_df['company'].str.replace('"', ' ', regex=False)

    print("Rearranged data saved")
    return pivot_df

def final_process(df):
    """Replace capex values with the trend relative to the next column.

    The first column is treated as the company name. Every following column
    except the last is overwritten, row by row, with a label comparing its
    value to the value in the column immediately to its right:

    * ``"Increase"``  – this column's value is larger than the next one's
    * ``"Decrease"``  – this column's value is smaller
    * ``"Unchanged"`` – the two values are equal
    * ``"DNA"``       – either value is missing or not numeric

    The last column keeps its original values. *df* is modified in place and
    also returned.
    """
    def _trend(current_value, next_value):
        if pd.isna(current_value) or pd.isna(next_value):
            return "DNA"
        try:
            current_value = float(current_value)
            next_value = float(next_value)
        except (TypeError, ValueError):
            return "DNA"
        # Strings such as "nan" parse to float NaN; they are still "no data".
        if pd.isna(current_value) or pd.isna(next_value):
            return "DNA"
        if next_value < current_value:
            return "Increase"
        if next_value > current_value:
            return "Decrease"
        return "Unchanged"

    # Compute every label from the untouched values before writing anything.
    # Plain tuples are iterated so no positional Series indexing is needed.
    row_labels = [
        [_trend(cur, nxt) for cur, nxt in zip(values[1:], values[2:])]  # values[0] is the company
        for values in df.itertuples(index=False, name=None)
    ]

    # Replace the original values in the DataFrame with the comparison results
    for i, col in enumerate(df.columns[1:-1]):  # Exclude 'company' and the last quarter
        df[col] = [labels[i] for labels in row_labels]

    print("Capex comparison results saved")
    return df

# Pipeline

In [3]:

# Per-company accumulators filled by the pipeline loop:
# trend tables, pivoted capex tables, and raw extracted records.
company_df, raw_company_df, raw_json = [], [], []


In [4]:
# all_companies = ["Alphabet", "Amazon", "Berkshire Hathaway", 
#              "Cardinal Health", "Chevron", "Fannie Mae",
#              "Goldman Sachs Group", "Valero Energy", 
#              "Morgan Stanley","Tesla"]

all_companies = [
    "Alphabet",
    "Amazon",
    "Berkshire Hathaway",
    "Cardinal Health",
    "Centene",
    "Chevron",
    "Comcast",
    "ExxonMobil",
    "Tesla",
    "UnitedHealth Group",
    "Valero Energy",
    "Walgreens Boots Alliance",
    "Walmart",
]

# Run extract -> clean -> pivot -> trend for each selected company, writing a
# "<company>.csv" (pivoted capex) and a "final_<company>.csv" (trend labels).
# The [:1] slice restricts the run to the first company in the list.
for company in all_companies[:1]:
    print(f"{'#'*10}processing--{company}{'#'*10}")
    csv_df, json_data = get_capex_info([company])
    if csv_df is None:
        # get_capex_info returns (None, None) when nothing was extracted;
        # the cleaning steps below cannot run on that.
        print(f"No capex data extracted for {company}; skipping.")
        continue
    raw_json.append(json_data)
    cleaned_df = clean_df(csv_df)
    rearranged_df = rearrange_df(cleaned_df)
    rearranged_df.to_csv(f"{company}.csv")
    # final_process overwrites its input, so keep a copy of the raw pivot first.
    raw_company_df.append(rearranged_df.copy(deep=True))
    final_df = final_process(rearranged_df)
    final_df.to_csv(f"final_{company}.csv")
    company_df.append(final_df.copy(deep=True))
    print(f"{'$'*10}processed--{company}{'$'*10}")
# company = "Alphabet"

##########processing--Alphabet##########
NUMBER OF FILES: 14
*************API - 0 - IN USE***************
*************API - 1 - IN USE***************
*************API - 2 - IN USE***************
Error Occoured for API 0: 429 Quota exceeded for quota metric 'Generate Content API requests per minute' and limit 'GenerateContent request limit per minute for a region' of service 'generativelanguage.googleapis.com' for consumer 'project_number:724959239723'. [reason: "RATE_LIMIT_EXCEEDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "quota_metric"
  value: "generativelanguage.googleapis.com/generate_content_requests"
}
metadata {
  key: "quota_location"
  value: "us-west4"
}
metadata {
  key: "quota_limit"
  value: "GenerateContentRequestsPerMinutePerProjectPerRegion"
}
metadata {
  key: "quota_limit_value"
  value: "0"
}
metadata {
  key: "consumer"
  value: "projects/724959239723"
}
, links {
  description: "Request 