In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import vertexai
from vertexai.language_models import TextGenerationModel
import configparser
from time import sleep

In [4]:
def scrape_condition_data(condition):
  edgar_df = pd.DataFrame(columns=['condition_name', 'form_and_file','cik', 'file_number', 'film_number', 'filing_entity', 'filed_on', 'content'])
  
  option = webdriver.ChromeOptions()
  option.add_argument('--disable-dev-shm-usage')
  driver = webdriver.Chrome(options=option)
  
  try:
    url = f"https://www.sec.gov/edgar/search/#/q={condition}&dateRange=1y"    
    counter = 0
    exception_count=0
    driver.get(url)
    
  except Exception as e:
    print(f"Error: {e}")
  finally:
    return edgar_df

In [5]:
def scraping_edgar_data():
    conditions = ['dermatitis', 'Eczema', 'Psoriasis', 'Rosacea', 'Vitiligo', 'Acne', 'Skin%20Cancer', 'Asthma', 'Pneumonia', 'Tuberculosis']
    
    with ThreadPoolExecutor(max_workers=len(conditions)) as executor:
        results = list(executor.map(scrape_condition_data, conditions))
    
    # Combine results from all threads
    combined_edgar_df = pd.concat([result[0] for result in results], ignore_index=True)
    exception_counts = [result[1] for result in results]
    
    return combined_edgar_df, exception_counts

In [6]:
url = f"https://www.sec.gov/edgar/search/#/q=dermatitis&dateRange=1y&filter_forms=10-K"

In [7]:
option = webdriver.ChromeOptions()
option.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=option)
driver.get(url)
sleep(10)

In [8]:
content= driver.page_source

In [9]:
content



In [10]:
parsed_content = BeautifulSoup(content, 'html.parser')
parsed_content

<html><head lang="en">
<meta charset="utf-8"/>
<meta content="IE=11" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>SEC.gov | EDGAR Full Text Search</title>
<!--CSS files-->
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.0/css/all.min.css" rel="stylesheet"/>
<link href="global/js/jquery-ui/jquery-ui.css" rel="stylesheet" type="text/css"/>
<link href="global/css/bootstrap/bootstrap.min.css" rel="stylesheet" type="text/css"/>
<link href="css/edgar_full_text_search.css" rel="stylesheet"/>
<link href="css/efts_media_queries.css" rel="stylesheet"/>
<link href="css/efts_filters.css" rel="stylesheet" type="text/css"/>
<!-- open source javascript libraries -->
<script async="" id="www-widgetapi-script" src="https://www.youtube.com/s/player/76c7a082/www-widgetapi.vflset/www-widgetapi.js" type="text/javascript"></script><script async="" src="https://www.googletagmanager.com/gtag/js?id=G-CSLL4ZEK4L&amp;l=dataLayer&

In [11]:
hits_div = parsed_content.find("div", attrs={"id":"hits"})

In [12]:
table = hits_div.find('table', class_='table')

In [13]:
rows = table.find('tbody').find_all('tr')
rows

[<tr><td class="filetype"><a class="preview-file" data-adsh="0001787306-24-000031" data-file-name="arqt-20231231.htm" href="#arqt-20231231.htm">10-K (Annual report) </a></td><td class="filed">2024-02-27</td><td class="enddate">2023-12-31</td><td class="entity-name">Arcutis Biotherapeutics, Inc.  (ARQT) </td><td class="cik d-none" nowrap="">CIK 0001787306</td><td class="biz-location located d-none" nowrap="">Westlake Village, CA</td><td class="incorporated d-none" nowrap="">Delaware</td><td class="file-num d-none" nowrap=""><a href="https://www.sec.gov/cgi-bin/browse-edgar/?filenum=001-39186&amp;action=getcompany">001-39186</a></td><td class="film-num d-none" nowrap="">24682024</td></tr>,
 <tr><td class="filetype"><a class="preview-file" data-adsh="0001140361-24-028145" data-file-name="ef20026301_10k.htm" href="#ef20026301_10k.htm">10-K (Annual report) </a></td><td class="filed">2024-05-30</td><td class="enddate">2024-03-31</td><td class="entity-name">Roivant Sciences Ltd.  (ROIV) </td>

In [14]:
row = rows[0]

In [15]:
cols = row.find_all('td')
cols

[<td class="filetype"><a class="preview-file" data-adsh="0001787306-24-000031" data-file-name="arqt-20231231.htm" href="#arqt-20231231.htm">10-K (Annual report) </a></td>,
 <td class="filed">2024-02-27</td>,
 <td class="enddate">2023-12-31</td>,
 <td class="entity-name">Arcutis Biotherapeutics, Inc.  (ARQT) </td>,
 <td class="cik d-none" nowrap="">CIK 0001787306</td>,
 <td class="biz-location located d-none" nowrap="">Westlake Village, CA</td>,
 <td class="incorporated d-none" nowrap="">Delaware</td>,
 <td class="file-num d-none" nowrap=""><a href="https://www.sec.gov/cgi-bin/browse-edgar/?filenum=001-39186&amp;action=getcompany">001-39186</a></td>,
 <td class="film-num d-none" nowrap="">24682024</td>]

In [16]:
row_data = [col.text.strip() for col in cols]
row_data

['10-K (Annual report)',
 '2024-02-27',
 '2023-12-31',
 'Arcutis Biotherapeutics, Inc.  (ARQT)',
 'CIK 0001787306',
 'Westlake Village, CA',
 'Delaware',
 '001-39186',
 '24682024']

In [17]:
link = row.find('a', class_='preview-file', href=True)
link

<a class="preview-file" data-adsh="0001787306-24-000031" data-file-name="arqt-20231231.htm" href="#arqt-20231231.htm">10-K (Annual report) </a>

In [18]:
link_href = link['href']
link_text = link.text

In [19]:
xpath = f"//a[@href='{link_href}' and contains(@class, 'preview-file') and text()='{link_text}']"


In [20]:
temp = driver.find_element(By.XPATH, xpath)
temp.click()

In [21]:
WebDriverWait(driver, 10).until(
  EC.presence_of_element_located((By.ID, 'ipreviewer'))
)

<selenium.webdriver.remote.webelement.WebElement (session="95e5e28fd71bf0343b3322567f1887f8", element="f.8B24BC81C9BEB70F798BF5B08CC961A9.d.DCBB8CFC4B6332A3DF0549C9F5D4B2F7.e.437")>

In [22]:
iframe = driver.find_element(By.ID, 'ipreviewer')
 

In [23]:
iframe_content = driver.page_source
iframe_soup = BeautifulSoup(iframe_content, 'html.parser')

In [24]:
iframe_text = iframe_soup.get_text()
iframe_text

"\n\n\n\nSEC.gov | EDGAR Full Text Search\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSEC.gov\n\n\n\nEDGAR\n\n\n\n\nFAQ\n\n\nOther search tools\n\n\n\nThe new EDGAR advanced search gives you access to the full text of electronic filings since 2001.\n\n\n\n\nDocument word or phrase?\n\n\n\xa0\n\nCompany name, ticker, CIK number or individual's name\n\n\n+ more search options\n\n\n\n\n\n\nFiling category\nBrowse filing types\n\n\nView all\n\n\nView all\xa0\xa0Exclude insider equity awards, transactions, and ownership (Section 16 Reports)All annual, quarterly, and current reportsInsider equity awards, transactions, and ownership (Section 16 Reports)Beneficial ownership reportsExempt offeringsRegistration statements and prospectusesFiling review correspondenceSEC orders and noticesProxy materialsTender offers and going private transactionsTrust indenture filingsEnter the filing types\n\n\n\nFiling typesBrowse filing types\n\n\n\ncancel\n\n\n\n\n\nFiled date range\n

In [25]:
iframe_soup

<html><head lang="en">
<meta charset="utf-8"/>
<meta content="IE=11" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>SEC.gov | EDGAR Full Text Search</title>
<!--CSS files-->
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.0/css/all.min.css" rel="stylesheet"/>
<link href="global/js/jquery-ui/jquery-ui.css" rel="stylesheet" type="text/css"/>
<link href="global/css/bootstrap/bootstrap.min.css" rel="stylesheet" type="text/css"/>
<link href="css/edgar_full_text_search.css" rel="stylesheet"/>
<link href="css/efts_media_queries.css" rel="stylesheet"/>
<link href="css/efts_filters.css" rel="stylesheet" type="text/css"/>
<!-- open source javascript libraries -->
<script async="" id="www-widgetapi-script" src="https://www.youtube.com/s/player/76c7a082/www-widgetapi.vflset/www-widgetapi.js" type="text/javascript"></script><script async="" src="https://www.googletagmanager.com/gtag/js?id=G-CSLL4ZEK4L&amp;l=dataLayer&

In [26]:
for unwanted in iframe_soup(['script', 'style', 'footer', 'nav', 'header', 'form', 'aside']):
    unwanted.extract()

In [27]:
import re

# Join the text from the page
text = iframe_soup.get_text(separator=' ', strip=True)

# Remove garbage content with regex (optional)
# For example, filtering out certain patterns:
cleaned_text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
cleaned_text = re.sub(r'[^\w\s,.!?]', '', cleaned_text)  # Remove non-word characters


In [28]:
cleaned_text

'SEC.gov  EDGAR Full Text Search The new EDGAR advanced search gives you access to the full text of electronic filings since 2001. Form Type 10K Refine search results by Entity 2 PETMED EXPRESS INC PETS CIK 0001040130 1 180 Life Sciences Corp. ATNF, ATNFW CIK 0001690080 1 AMGEN INC AMGN CIK 0000318154 1 ANAPTYSBIO, INC ANAB CIK 0001370053 1 ARTELO BIOSCIENCES, INC. ARTL, ATLEW CIK 0001621221 1 AVITA Medical, Inc. RCEL, AVHHL CIK 0001762303 1 Aadi Bioscience, Inc. AADI CIK 0001422142 1 AbCellera Biologics Inc. ABCL CIK 0001703057 1 AbbVie Inc. ABBV CIK 0001551152 1 Aclaris Therapeutics, Inc. ACRS CIK 0001557746 1 Adaptimmune Therapeutics PLC ADAP CIK 0001621227 1 Akoya Biosciences, Inc. AKYA CIK 0001711933 1 Aldeyra Therapeutics, Inc. ALDX CIK 0001341235 1 Allakos Inc. ALLK CIK 0001564824 1 Apogee Therapeutics, Inc. APGE CIK 0001974640 1 Arcutis Biotherapeutics, Inc. ARQT CIK 0001787306 1 Astria Therapeutics, Inc. ATXS CIK 0001454789 1 Azitra, Inc. AZTR CIK 0001701478 1 Bausch Health Co

In [38]:
element1 = iframe_soup.find(id='iefd5f175481c4427942822a30d50761e_16')
element2 = iframe_soup.find(id='iefd5f175481c4427942822a30d50761e_19')

In [39]:
element1, element2

(None, None)

In [36]:
count = 0
text = ''
for element in element1.find_all_next():
    # Stop if we reach element2
    if element == element2:
        break
    
    # Accumulate text from each element
    text += " " + element.get_text(strip=True)
    count += 1

print("Total elements found:", count)
print("Text content between elements:")
print(text)

AttributeError: 'NoneType' object has no attribute 'find_all_next'

In [48]:
config = configparser.ConfigParser()
config.read('configuration.properties')

['configuration.properties']

In [49]:
# vertex AI config
project = config['VertexAI']['project']
model_id = config['VertexAI']['model_id']
endpoint_id = config['VertexAI']['endpoint_id']
location = config['VertexAI']['location']
api_endpoint = config['VertexAI']['api_endpoint']
model_pretrained = config['VertexAI']['model_pretrained']

In [50]:
import os 
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/anshul/Projects/Genassis/edgar-RAG/Notebooks/GCPkey.json" 

In [51]:
vertexai.init(project=project, location=location)

In [44]:
parameters = {
  "max_output_tokens": 1024,
  "temperature": 0.9,
  "top_p": 1
}

In [45]:
model_llm = TextGenerationModel.from_pretrained(model_pretrained)

In [47]:
model_llm = model_llm.get_tuned_model(f"projects/{project}/locations/{location}/models/{model_id}")

In [54]:
question = "Detailed explanation of the text in sections and keep the nice structure in the explanation"

In [58]:
text = "tell me a pithy joke about a dermatologist"

In [72]:
response = model_llm.predict(
  f"""{text}""",
  **parameters
)
print(f"{response.text}")

 Why don't dermatologists make house calls? Because they don't want to leave their office, where the lighting is best.


In [60]:
print(response)

MultiCandidateTextGenerationResponse(text=' Why did the dermatologist get a headache? Because he was scratching his head trying to figure out what skin condition his patient had.', _prediction_response=Prediction(predictions=[{'content': ' Why did the dermatologist get a headache? Because he was scratching his head trying to figure out what skin condition his patient had.', 'citationMetadata': {'citations': []}, 'safetyAttributes': {'blocked': False, 'safetyRatings': [{'probabilityScore': 0.1, 'severityScore': 0.0, 'severity': 'NEGLIGIBLE', 'category': 'Dangerous Content'}, {'probabilityScore': 0.3, 'severityScore': 0.1, 'severity': 'NEGLIGIBLE', 'category': 'Harassment'}, {'probabilityScore': 0.2, 'severityScore': 0.1, 'severity': 'NEGLIGIBLE', 'category': 'Hate Speech'}, {'probabilityScore': 0.1, 'severityScore': 0.1, 'severity': 'NEGLIGIBLE', 'category': 'Sexually Explicit'}], 'scores': [0.1, 0.2, 0.1, 1.0, 0.3, 0.2, 0.1, 0.1, 0.6, 0.1, 0.2], 'categories': ['Death, Harm & Tragedy', 

In [40]:
# import vertexai
# from vertexai.preview.generative_models import GenerativeModel, Image

# vertexai.init(project=project, location=location)

# generative_multimodal_model = GenerativeModel("gemini-1.5-pro-002")

In [45]:
# response = generative_multimodal_model.generate_content([ f""" summarize the text \ntext:\n {text}"""])

# print(response)

candidates {
  content {
    role: "model"
    parts {
      text: "Arcutis Biotherapeutics is a commercial-stage biopharmaceutical company focused on developing and commercializing treatments for dermatological diseases. Their lead product, ZORYVE (roflumilast), is available in both cream and foam formulations.\n\nZORYVE cream 0.3% is approved for plaque psoriasis in individuals 6 years and older, with efforts underway to expand the indication to children as young as 2.  ZORYVE cream 0.15% has been submitted for approval for atopic dermatitis in individuals 6 and older, with a PDUFA date of July 7, 2024, and a subsequent sNDA planned for children aged 2-5 (0.05% formulation) pending the initial approval. ZORYVE foam 0.3% is approved for seborrheic dermatitis in individuals 9 and older.  Positive Phase 3 trials have been completed for ZORYVE foam in scalp and body psoriasis, with an sNDA planned for the second half of 2024.\n\nBeyond ZORYVE, Arcutis is developing ARQ-255, a topical JAK

In [1]:
from google.cloud import aiplatform


In [61]:
endpoint = aiplatform.Endpoint(endpoint_id)

In [69]:
instance_dict = {
    "prompt": f"{text}",
    "max_output_tokens": 1024,
    "temperature": 0.9,
    "top_p": 1
}

In [71]:
response = endpoint.predict(instances=[instance_dict], parameters={})

response

Prediction(predictions=[{'content': ' Why did the dermatologist get fired? Because he kept telling his patients they were ugly.', 'citationMetadata': {'citations': []}, 'safetyAttributes': {'blocked': False, 'safetyRatings': [{'probabilityScore': 0.1, 'severityScore': 0.1, 'severity': 'NEGLIGIBLE', 'category': 'Dangerous Content'}, {'probabilityScore': 0.8, 'severityScore': 0.6, 'severity': 'MEDIUM', 'category': 'Harassment'}, {'probabilityScore': 0.7, 'severityScore': 0.4, 'severity': 'MEDIUM', 'category': 'Hate Speech'}, {'probabilityScore': 0.1, 'severityScore': 0.1, 'severity': 'NEGLIGIBLE', 'category': 'Sexually Explicit'}], 'scores': [0.1, 0.7, 0.3, 0.3, 0.8, 0.2, 0.3, 0.2, 0.3, 0.5, 0.1, 0.5, 0.1], 'categories': ['Death, Harm & Tragedy', 'Derogatory', 'Finance', 'Health', 'Insult', 'Legal', 'Politics', 'Profanity', 'Public Safety', 'Religion & Belief', 'Sexual', 'Toxic', 'War & Conflict']}}], deployed_model_id='924145020703866880', metadata={'tokenMetadata': {'outputTokenCount':