In [1]:
%%capture
!pip install yfinance wbdata datetime sec_api sec_edgar_downloader langchain
!pip install -U langchain-community
!pip install peft
!pip install sentence-transformers
!pip install faiss-gpu
!pip install faiss-cpu
!pip install bitsandbytes

In [2]:
import yfinance as yf
import wbdata
from datetime import datetime
from sec_edgar_downloader import Downloader
from datetime import datetime, timedelta
import os
import json
import requests
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from sec_api import ExtractorApi,QueryApi


# Initialize SEC downloader
dl = Downloader(company_name='Abhay',email_address="Abhaychourasiya945@gmail.com")

def get_stock_data(symbol, period="1mo"):
    """Extract stock data and convert to text format."""
    stock = yf.Ticker(symbol)
    data = stock.history(period=period)

    text_data = []
    for date, row in data.iterrows():
        text = f"On {date.date()}, {symbol} stock opened at {row['Open']:.2f}, "
        text += f"reached a high of {row['High']:.2f}, a low of {row['Low']:.2f}, "
        text += f"and closed at {row['Close']:.2f}. The trading volume was {row['Volume']}."
        text_data.append(text)

    return "\n".join(text_data)

def get_alpha_vantage_news(symbol):
    """Extract news data from Alpha Vantage and convert to text format."""
    url = f"https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers={symbol}&apikey=X4AM5ZR6E2WPLQ7C"
    response = requests.get(url)
    data = response.json()

    if 'feed' not in data:
        return "No news data available."

    text_data = []
    for article in data['feed'][:10]:  # Limit to 10 articles
        text = f"Title: {article['title']}\n"
        text += f"Date: {article['time_published']}\n"
        text += f"Summary: {article['summary']}\n"
        text += f"Sentiment: {article['overall_sentiment_label']} (score: {article['overall_sentiment_score']})\n\n"
        text_data.append(text)

    return "\n".join(text_data)

sec_api_key = 'bc5b314f1c389391c3eb8e724f07af3eb4ec7485fe1b4d0e1853cb8cb1e8e4c4'
def get_sec_data(ticker,type = '10-K'):
  queryApi = QueryApi(sec_api_key)
  query = {
      "query":f"ticker:{ticker} AND formType:\"{type}\"",
      "from": "0",
      "size": "1",
      "sort": [{"filedAt":{"order":"desc"}}]
  }
  filings = queryApi.get_filings(query)
  filing_url = filings["filings"][0]['linkToFilingDetails']

  extractorApi = ExtractorApi(sec_api_key)
  onea_text = extractorApi.get_section(filing_url,"1A","text")
  seven_text = extractorApi.get_section(filing_url,"7","text")

  combined_text = onea_text + "\n\n" + seven_text
  return combined_text
def company_overview(symbol):
    url = f'https://www.alphavantage.co/query?function=OVERVIEW&symbol={symbol}&apikey=X4AM5ZR6E2WPLQ7C'
    r = requests.get(url)
    data = r.json()
    text = ""
    for key,value in data.items():
        subtext = f"The {key} is {value}. "
        text += subtext
    return text
def get_balance_sheet(symbol):
    url = f'https://www.alphavantage.co/query?function=BALANCE_SHEET&symbol={symbol}&apikey=X4AM5ZR6E2WPLQ7C'
    r = requests.get(url)
    data = r.json()
    data = data['annualReports']
    text = ""
    for dict in data:
        for key,value in dict.items():
            subtext = f"The {key} is {value}. "
            text += subtext
    return text

In [3]:
def get_combine_data(symbol):
    overview = company_overview(symbol=symbol)
    balance_sheet = get_balance_sheet(symbol=symbol)
    stock_data = get_stock_data(symbol=symbol)
    news = get_alpha_vantage_news(symbol=symbol)
    sec_data = get_sec_data(ticker=symbol)

    return overview,balance_sheet,stock_data,news,sec_data
def load_embeddings():
    model_path = 'BAAI/bge-large-en-v1.5'
    model_kwargs = {"device":"cuda"}
    encode_kwargs = {"normalize_embeddings":True}
    embeddings = HuggingFaceEmbeddings(model_name = model_path,model_kwargs = model_kwargs,encode_kwargs = encode_kwargs)
    return embeddings
def get_vectorspace(symbol,embeddings):
    data = """### COMPANY OVERVIEW:
    {}

    ### BALANCE SHEET:
    {}

    ### STOCK DATA:
    {}

    ### STOCK NEWS:
    {}\

    ### SEC_DATA:
    {}"""

    overview,balace_sheet,stock_data,stock_news,sec_data = get_combine_data(symbol=symbol)
    data = data.format(overview,balace_sheet,stock_data,stock_news,sec_data)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,chunk_overlap = 500,length_function = len,is_separator_regex=False)
    split_data = text_splitter.create_documents([data])

    db = FAISS.from_documents(split_data,embedding=embeddings)
    retriever = db.as_retriever()
    return retriever
def get_context(symbol,query,embeddings):
    retriever = get_vectorspace(symbol=symbol,embeddings = embeddings)
    retrieved_doc = retriever.invoke(query)
    context = []
    for doc in retrieved_doc:
        context.append(doc.page_content)
    return context

prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>Summarize the following financial report. Include ALL key statistical data and metrics. Focus on:

1. Revenue and profit figures.
2. Year-over-year growth rates.
3. Profit margins.
4. Debt levels and ratios.
5. Market share.
6. Notable trends or changes.
7. predict Future Price Gain or Loss in percentage.
8. Summary should be concise and to the mark with relevant data.
9. Give Overall sentiment according to data provided with Up and down trend indication.
10. You are a good financial analyzer and analyze effectively.

Provide a comprehensive yet concise summary suitable for financial professionals.
User will give Context and Question according to which assistant have to produce Summary and Sentiment.<|eot_id|>

<|start_header_id|>user<|end_header_id|>
### Question:
{}
### Context:
{}

<|eot_id|>
### Response:
<|start_header_id|>assistant<|end_header_id|>
{}
<|eot_id|>
"""
def formating(example):
    contexts = example['context']
    questions = example['question']
    answers = example['answer']
    texts = []
    for context,question,answer in zip(contexts,questions,answers):
        text = prompt.format(question,context,answer)
        texts.append(text)
    return {'text':texts,}

In [20]:
def inference(context,question,model):
    inputs = tokenizer(
        [prompt.format(question,context,'')],
        return_tensors='pt'
    )
    inputs = inputs.to('cuda')
    output = model.generate(**inputs,max_new_tokens = 2048,use_cache = True,pad_token_id = tokenizer.eos_token_id)
    response = tokenizer.batch_decode(output)
    return response
def answer_extract(text):
    text = text[0]
    start_token = "Response:\n<|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>\n"
    end_token = "<|eot_id|>"
    start_idx = text.find(start_token)
    if start_idx == -1:
        return None
    start_idx += len(start_token)
    
    while start_idx < len(text) and text[start_idx] in '\n':
        start_idx += 1
    
    end_idx = text.rfind(end_token)  
    if end_idx == -1:
        return None
    return text[start_idx:end_idx].strip()
def format_output(text):
    # Split the text into lines
    lines = text.split('\\n')
    for line in lines:
        print(line)
    

In [5]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Abhay06102003/Llama-3-FinanceAgent",token = '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')
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct",token = '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',load_in_4bit = True)
model = PeftModel.from_pretrained(model,"Abhay06102003/Llama-3-FinanceAgent")

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

In [6]:
model = model.merge_and_unload()
embeddings = load_embeddings()

  warn_deprecated(
2024-06-29 08:04:15.855188: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-29 08:04:15.855314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-29 08:04:16.161581: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [32]:
symbol = 'AMZN'
question = "Explain the growth rate?"
context = get_context(symbol,question,embeddings)

In [33]:
def question_prompt(question):
    prompt = f"""Revenue and Profit Figures.Year-over-Year Growth rate.Debt level and all ratios. Market share and trend.Future Growth in percentages.Company details and {question}"""
    return prompt

In [34]:
resp = inference(context,question_prompt(question),model)

In [35]:
full_answer= answer_extract(resp)

In [36]:
full_answer

"**Revenue and Profit Figures:**\n\n* Net Sales:\n\t+ 2022: $513,983 million\n\t+ 2023: $574,785 million\n\t+ Year-over-year growth: 12%\n* North America:\n\t+ 2022: $315,880 million\n\t+ 2023: $352,828 million\n\t+ Year-over-year growth: 12%\n* International:\n\t+ 2022: $118,007 million\n\t+ 2023: $131,200 million\n\t+ Year-over-year growth: 11%\n* AWS:\n\t+ 2022: $80,096 million\n\t+ 2023: $90,757 million\n\t+ Year-over-year growth: 13%\n\n**Year-over-Year Growth Rate:**\n\n* North America: 12% (2022-2023)\n* International: 11% (2022-2023)\n* AWS: 13% (2022-2023)\n* Consolidated: 12% (2022-2023)\n\n**Debt Levels and Ratios:**\nNo debt information provided.\n\n**Market Share:**\nNot provided.\n\n**Notable Trends or Changes:**\n* Sales growth driven by increased unit sales, primarily by third-party sellers, advertising sales, and subscription services.\n* Continued focus on price, selection, and convenience for customers, including from shipping offers.\n* Changes in foreign exchange r

In [37]:
format_output(full_answer)

**Revenue and Profit Figures:**

* Net Sales:
	+ 2022: $513,983 million
	+ 2023: $574,785 million
	+ Year-over-year growth: 12%
* North America:
	+ 2022: $315,880 million
	+ 2023: $352,828 million
	+ Year-over-year growth: 12%
* International:
	+ 2022: $118,007 million
	+ 2023: $131,200 million
	+ Year-over-year growth: 11%
* AWS:
	+ 2022: $80,096 million
	+ 2023: $90,757 million
	+ Year-over-year growth: 13%

**Year-over-Year Growth Rate:**

* North America: 12% (2022-2023)
* International: 11% (2022-2023)
* AWS: 13% (2022-2023)
* Consolidated: 12% (2022-2023)

**Debt Levels and Ratios:**
No debt information provided.

**Market Share:**
Not provided.

**Notable Trends or Changes:**
* Sales growth driven by increased unit sales, primarily by third-party sellers, advertising sales, and subscription services.
* Continued focus on price, selection, and convenience for customers, including from shipping offers.
* Changes in foreign exchange rates affected net sales by $71 million in 2023.
