In [None]:
pip install sec_downloader transformers streamlit streamlit_modal

In [None]:
#Import dependencies
from sec_downloader import Downloader
from sec_downloader.types import RequestedFilings
from bs4 import BeautifulSoup
from transformers import pipeline
import os
import re
import pandas as pd

In [None]:
#Helper functions

# Summarisation pipeline used to condense each important filing chunk.
summarizer = pipeline("summarization", model='facebook/bart-large-cnn')

# Sentiment pipeline; top_k=3 asks for up to three label scores per input.
# The helpers below look up the 'positive', 'negative' and 'neutral' labels.
analyser = pipeline('sentiment-analysis', model='ProsusAI/finbert', top_k=3)

# Shared downloader instance used by extract_text and the download cell.
# NOTE(review): the two arguments are presumably (company name, contact e-mail)
# used to identify this client to SEC EDGAR - confirm against sec_downloader docs.
# Both can be overridden through environment variables so the contact address
# does not have to be edited in the notebook; the defaults keep the old values.
dl = Downloader(
    os.environ.get('SEC_DOWNLOADER_COMPANY', 'MSFT'),
    os.environ.get('SEC_DOWNLOADER_EMAIL', "arvind.kr.200202@gmail.com"),
)

def extract_text(url, yr, ticker):
    '''
    Download the filing at `url`, strip its markup with BeautifulSoup and save the text.

    The plain text is written to "<ticker>/<yr>.txt" (relative to the current
    working directory), followed by an "<EOF>" marker on its own final line.

    Parameters:
        url:    address of the filing document, passed to dl.download_filing.
        yr:     label used as the file name stem (the download cell passes the
                middle segment of the accession number).
        ticker: name of the output directory; created if it does not exist.

    Returns:
        None. Exceptions from the download, parsing or file write propagate.
    '''
    soup = BeautifulSoup(dl.download_filing(url=url).decode(), 'html.parser')

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(ticker, exist_ok=True)

    text = soup.get_text()

    # Explicit UTF-8: filing text can contain non-ASCII characters, which would
    # make the write fail under a non-UTF-8 platform default encoding.
    with open(f"{ticker}/{yr}.txt", "w", encoding="utf-8") as file:
        file.write(text)
        file.write("\n<EOF>")  # Write EOF marker

def split_content(max_part_words, overlap_words, content):
    '''
    Split `content` into overlapping chunks of at most `max_part_words` words.

    Words are the matches of the regex \\w+, so punctuation is dropped. Each
    chunk is its words joined by single spaces plus one trailing space. When a
    chunk is full, the next one starts with the last `overlap_words` words of
    the previous chunk followed by the word that did not fit.

    Parameters:
        max_part_words: maximum number of words per chunk.
        overlap_words:  number of words shared between adjacent chunks.
        content:        text to split.

    Returns:
        list of chunk strings; empty when `content` contains no words.
    '''
    words = re.findall(r'\w+', content)
    parts = []
    # The current chunk is kept as a word list so its size is len(current);
    # the accumulated string no longer has to be re-split for every word.
    current = []

    def render(chunk_words):
        # "w1 w2 " for a non-empty chunk, "" for an empty one.
        return "".join(w + " " for w in chunk_words)

    for i, word in enumerate(words):
        if len(current) + 1 > max_part_words:
            # Chunk is full: emit it and seed the next one with the overlap
            # window plus the current word.
            parts.append(render(current))
            start_index = max(0, i - overlap_words)
            current = words[start_index:i + 1]
        else:
            current.append(word)

    # Emit the trailing, possibly shorter, chunk.
    if current:
        parts.append(render(current))

    return parts

def extract_score(given_label, data):
    '''
    Look up the score of `given_label` in a sentiment result.

    `data[0]` is expected to be an iterable of dicts with 'label' and 'score'
    keys. The score of the first entry whose label equals `given_label` is
    returned; None is returned when no entry matches.
    '''
    matching_scores = (
        entry['score'] for entry in data[0] if entry['label'] == given_label
    )
    # next() stops at the first match, or falls back to None.
    return next(matching_scores, None)

def change_keys(dic, century_pivot=None):
    '''
    Return a copy of `dic` whose two-digit year keys are expanded to four digits.

    A key whose integer value is below `century_pivot` is prefixed with '20',
    any other key with '19'.

    Parameters:
        dic:           mapping keyed by two-digit year strings such as '07' or '98'.
        century_pivot: first two-digit year treated as 19xx. Defaults to the
                       current year's last two digits plus one, so filings from
                       the current year are not pushed back a century (the
                       previous fixed pivot of 25 turned '25' into '1925').

    Returns:
        new dict with the same values under four-digit year string keys.

    Raises:
        ValueError: if a key cannot be converted with int().
    '''
    if century_pivot is None:
        import datetime
        century_pivot = datetime.date.today().year % 100 + 1

    mod_dic = {}
    for yy, value in dic.items():
        century = '20' if int(yy) < century_pivot else '19'
        mod_dic[century + yy] = value
    return mod_dic

def extract_summary_and_sentiment(ticker, base_dir='/content/'):
    '''
    Summarise and score every saved filing of `ticker`, one entry per year.

    Each ".txt" file in <base_dir>/<ticker> is split into 400-word chunks with
    50 words of overlap. A chunk whose 'neutral' score is below 0.5 is treated
    as important: it is summarised and its net sentiment (positive score minus
    negative score) is recorded. Neutral chunks are ignored so that they do not
    dilute the yearly score, which is the mean net sentiment over the important
    chunks only (0 when there are none).

    Parameters:
        ticker:   name of the sub-directory holding the filing text files.
        base_dir: directory containing the ticker sub-directory. Defaults to
                  '/content/', the location this notebook used previously.

    Returns:
        (dict_summary, dict_senti): two dicts keyed by four-digit year string
        and ordered by year. The first holds the space-joined chunk summaries,
        the second the yearly sentiment score.
    '''
    max_part_words = 400
    overlap_words = 50
    ticker_dir = os.path.join(base_dir, ticker)

    dict_summary = {}
    dict_senti = {}
    # iterate over all files of the company
    for file in os.listdir(ticker_dir):
        path = os.path.join(ticker_dir, file)
        # Skip stray entries (e.g. checkpoint directories); they would crash
        # open() here or int() in change_keys.
        if not (file.endswith('.txt') and os.path.isfile(path)):
            continue

        # errors='replace' keeps files written under another encoding readable.
        with open(path, 'r', encoding='utf-8', errors='replace') as fi:
            content = fi.read()

        chunk_summaries = []
        net_scores = []
        for part in split_content(max_part_words, overlap_words, content):
            try:
                senti = analyser(part)  # find sentiment score
                if extract_score('neutral', senti) < 0.5:  # important segment
                    summary_text = summarizer(part, max_length=80, min_length=40)[0]['summary_text']
                    net = extract_score('positive', senti) - extract_score('negative', senti)
                    # Record both only after both succeeded, so a chunk
                    # either counts fully or not at all.
                    chunk_summaries.append(summary_text)
                    net_scores.append(net)
            except Exception:
                # Best-effort: a chunk the models cannot process is skipped.
                # NOTE(review): presumably this mostly hides over-long inputs -
                # confirm that dropping those chunks silently is intended.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                continue

        # key is the first two characters of the file name (two-digit year)
        year_key = file[:2]
        # One net score per important chunk, so the mean weights chunks equally
        # and cannot divide by zero.
        dict_senti[year_key] = sum(net_scores) / len(net_scores) if net_scores else 0
        # A space separator keeps adjacent summaries from running together.
        dict_summary[year_key] = " ".join(chunk_summaries)

    # Expand the keys to four digits and order by year, so the result does not
    # depend on the arbitrary order of os.listdir.
    dict_summary = dict(sorted(change_keys(dict_summary).items()))
    dict_senti = dict(sorted(change_keys(dict_senti).items()))
    return dict_summary, dict_senti

def combine_yrs(dic):
    '''
    Concatenate the per-year summaries into a single string.

    Every year with a non-empty summary contributes " <year> : (<summary>)";
    years whose summary is empty are left out. Entries follow the iteration
    order of `dic`.
    '''
    labelled = (
        " " + year + ' : (' + text + ")"
        for year, text in dic.items()
        if text
    )
    return "".join(labelled)


def extract_positives_and_negatives(summary):
    '''
    Summarise the combined summary further and separate positives from negatives.

    `summary` is split into 400-word chunks with 50 words of overlap. Chunks
    whose 'neutral' score is 0.5 or higher are ignored. Each remaining chunk is
    summarised and counted as positive when its 'positive' score is strictly
    greater than its 'negative' score, otherwise as negative.

    Parameters:
        summary: combined multi-year summary text.

    Returns:
        (positives, negatives): two strings, each the chunk summaries of that
        kind joined by single spaces (empty string when there are none).
    '''
    max_part_words = 400
    overlap_words = 50

    positives = []
    negatives = []
    for part in split_content(max_part_words, overlap_words, summary):
        try:
            senti = analyser(part)  # find sentiment score
            if extract_score('neutral', senti) < 0.5:  # important segment
                is_positive = extract_score('positive', senti) > extract_score('negative', senti)
                bucket = positives if is_positive else negatives
                bucket.append(summarizer(part, max_length=80, min_length=40)[0]['summary_text'])
        except Exception:
            # Best-effort: a chunk the models cannot process is skipped.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    # A space separator keeps adjacent summaries from running together.
    return " ".join(positives), " ".join(negatives)



# Download files

In [None]:
# Download up to 29 10-K filings per ticker and save each one as plain text.
for ticker in ['AAPL','MSFT']:
  metadatas = dl.get_filing_metadatas(
      RequestedFilings(ticker_or_cik=ticker, form_type="10-K", limit=29)
  )
  for metadata in metadatas:
    try:
      # The middle segment of the accession number is used as the year label.
      extract_text(metadata.primary_doc_url, metadata.accession_number.split('-')[1], ticker)
    except Exception as exc:
      # Best-effort: one bad filing must not stop the rest, but the failure is
      # reported instead of being swallowed silently.
      print(f"Skipping {ticker} filing {getattr(metadata, 'accession_number', '?')}: {exc!r}")
      continue

# Summarise, combine and get insights

In [None]:
# For each ticker, publish five module-level variables that later cells read:
# <ticker>_yrly_summary, <ticker>_yrly_sentiment, <ticker>_overall_summary,
# <ticker>_positives and <ticker>_negatives.
notebook_ns = globals()
for ticker in ['AAPL','MSFT']:
  yearly_summary, yearly_sentiment = extract_summary_and_sentiment(ticker)
  notebook_ns[f'{ticker}_yrly_summary'] = yearly_summary
  notebook_ns[f'{ticker}_yrly_sentiment'] = yearly_sentiment

  overall_summary = combine_yrs(yearly_summary)
  notebook_ns[f'{ticker}_overall_summary'] = overall_summary

  positive_text, negative_text = extract_positives_and_negatives(overall_summary)
  notebook_ns[f'{ticker}_positives'] = positive_text
  notebook_ns[f'{ticker}_negatives'] = negative_text

In [None]:
#Store text file to access in app.py
# One loop replaces the four copy-pasted blocks. File names and write order
# (AAPL positives, AAPL negatives, MSFT positives, MSFT negatives) are unchanged.
for ticker in ['AAPL', 'MSFT']:
    for kind in ['positives', 'negatives']:
        # Reads the <ticker>_<kind> variable published by the summarise cell.
        with open(f"{ticker}_{kind}.txt", "w") as file:
            file.write(globals()[f"{ticker}_{kind}"])

In [None]:
#Store CSV file to access in app.py
# One loop replaces the copy-pasted per-ticker code. Each CSV holds the yearly
# sentiment sorted by year and, as before, includes the DataFrame index column.
for ticker in ['AAPL', 'MSFT']:
    # Reads the <ticker>_yrly_sentiment dict published by the summarise cell.
    yearly_sentiment = globals()[f'{ticker}_yrly_sentiment']
    df = pd.DataFrame(yearly_sentiment.items(), columns=['Year', 'Value'])
    df = df.sort_values(by='Year')
    df.to_csv(f'{ticker}_yrly_sentiment.csv')

# App

In [None]:
# Fetch ipv4.icanhazip.com and print the response to stdout
# (-q: no progress output, -O -: write the body to stdout instead of a file).
# NOTE(review): presumably this prints the machine's public IPv4 address, which
# localtunnel asks for as the access password - confirm.
!wget -q -O - ipv4.icanhazip.com

35.227.76.194


In [None]:
# Start the Streamlit app (app.py) in the background, then open a localtunnel
# tunnel to local port 8501 via npx.
# NOTE(review): assumes Streamlit serves on port 8501 - confirm.
! streamlit run app.py & npx localtunnel --port 8501