In [20]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import itertools

In [23]:

def get_page_content(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve {url}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_links(base_url, content):
    soup = BeautifulSoup(content, 'html.parser')
    links = set()  # Using a set to avoid duplicate links
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        full_url = urljoin(base_url, href)  # Join relative URLs with the base URL
        links.add(full_url)
    return links

def extract_information(url):
    content = get_page_content(url)
    if content:
        soup = BeautifulSoup(content, 'html.parser')
        page_text = soup.get_text(separator='\n').strip()  # Extract plain text from the page
        print(f"Extracted content from {url}:\n{page_text[:200]}...")  # Printing first 200 chars for demo
        return page_text
    return None

def process_website(base_url):
    content = get_page_content(base_url)
    if content:
        links = extract_links(base_url, content)
        print(f"Found {len(links)} links on {base_url}")
        small_set = set(itertools.islice(links, 12))
        print(f"Updated {len(small_set)} links on {base_url}")
        
        for link in small_set:
            print(f"Processing {link} ...")
            extract_information(link)




In [24]:
# Example usage
website_url = 'https://finance.yahoo.com/quote/ADS.DE/'  # Replace with your desired website
process_website(website_url)

Found 366 links on https://finance.yahoo.com/quote/ADS.DE/
Updated 12 links on https://finance.yahoo.com/quote/ADS.DE/
Processing https://finance.yahoo.com/sectors/communication-services/ ...
Extracted content from https://finance.yahoo.com/sectors/communication-services/:
Communication Services Stock Performance - Yahoo Finance
 








 
 
 
 
 
 
 
 
 
 
 
 
 
 
News
 
 
Today's news
 
 
 
US
 
 
 
Politics
 
 
 
World
 
 
 
Tech
 
 
Reviews and deals 
 
Audio 
 
Co...
Processing https://finance.yahoo.com/topic/tech/ ...
Extracted content from https://finance.yahoo.com/topic/tech/:
Latest Technology News and Reviews - Yahoo Finance
 
 
 
 
 
 
 
 
 
 
News
 
 
Today's news
 
 
 
US
 
 
 
Politics
 
 
 
World
 
 
 
Tech
 
 
Reviews and deals 
 
Audio 
 
Computing 
 
Gaming 
 
Hea...
Processing https://de.yahoo.com/?p=dnr ...
Extracted content from https://de.yahoo.com/?p=dnr:
Yahoo | Mail, Wetter, Suche, Nachrichten, Finanzen, Sport & mehr










 
 
 
 
Make Yahoo Your Homepage


KeyboardInterrupt: 

In [6]:
# Extract hyperlinks from the webpage
website_url = 'https://www.boerse-frankfurt.de/news/boerse-frankfurt-news'
content = get_page_content(website_url)
if content:
    links = extract_links(website_url, content)
    print(f"Extracted {len(links)} links from {website_url}")
    for link in links:
        print(link)

Extracted 50 links from https://www.boerse-frankfurt.de/news/boerse-frankfurt-news
https://www.deutsche-boerse.com/dbg-en/media/press-releases
https://www.boerse-frankfurt.de/en/first-steps
https://www.boerse-frankfurt.de/sustainability
https://www.boerse-frankfurt.de/en/exchange-for-students
https://www.boerse-frankfurt.de/en/disclaimer-en
https://www.boerse-frankfurt.de/registration
https://www.facebook.com/frankfurtexchange/
https://www.boerse-frankfurt.de/en/kontakt
https://www.boerse-frankfurt.de/en/know-how
https://www.boerse-frankfurt.de/en/en/kontakt
https://www.boerse-frankfurt.de/equities
https://itunes.apple.com/de/app/b%C3%B6rse-frankfurt-app/id378816763?mt=8&l=en
https://www.boerse-frankfurt.de/events
https://www.boerse-frankfurt.de/en/securities
https://www.boerse-frankfurt.de/en/know-how/glossary
https://www.boerse-frankfurt.de/en/etfs
https://youtube.com/boersefrankfurt
https://www.boerse-frankfurt.de/commodities
https://www.boerse-frankfurt.de/funds
https://www.boerse-

In [10]:
import pandas as pd

def extract_table_content(content):
    soup = BeautifulSoup(content, 'html.parser')
    tables = soup.find_all('table')
    
    if not tables:
        print("No tables found in the content.")
        return None
    
    # Assuming we want the first table
    table = tables[0]
    df = pd.read_html(str(table))[0]
    return df

# Extract table content from the webpage
# table_df = extract_table_content(content)
# if table_df is not None:
#     print(table_df)

In [15]:
def extract_table_content(content):
    soup = BeautifulSoup(content, 'html.parser')
    tables = soup.find_all('table')
    
    if not tables:
        print("No tables found in the content.")
        return None
    
    # Extract all tables
    all_tables = []
    for table in tables:
        print(table)
        df = pd.read_html(str(table))[0]
        all_tables.append(df)
    
    return all_tables

# Extract all table contents from the webpage
all_tables = extract_table_content(content)
if all_tables:
    for i, table_df in enumerate(all_tables):
        print(f"Table {i+1}:")
        print(table_df)

<table class="table widget-table"><thead><tr><th class="widget-table-header-cell">Time</th><th class="widget-table-header-cell">Title</th></tr></thead><!-- --></table>
Table 1:
Empty DataFrame
Columns: [Time, Title]
Index: []


  df = pd.read_html(str(table))[0]


In [13]:
website_url = 'https://www.boerse-frankfurt.de/news/boerse-frankfurt-news'  # Replace with your desired website

extract_table_content(get_page_content(website_url))

No tables found in the content.


In [17]:
# Example usage
website_url = 'https://finance.yahoo.com/quote/ADS.DE/'  # Replace with your desired website
process_website(website_url)

Found 366 links on https://finance.yahoo.com/quote/ADS.DE/
{'https://de.yahoo.com/?p=dnr', 'https://sports.yahoo.com/syndication/', 'https://uk.yahoo.com/?p=dnr', 'https://finance.yahoo.com/markets/stocks/trending/', 'https://finance.yahoo.com/quote/BMW.DE/', 'https://finance.yahoo.com/quote/SIE.DE/', 'https://finance.yahoo.com/quote/GC%3DF/', 'https://finance.yahoo.com/sectors/communication-services/', 'https://finance.yahoo.com/topic/tech/', 'https://finance.yahoo.com/quote/ADS.DE/history/', 'https://sports.yahoo.com/nba/players/', 'https://hockey.fantasysports.yahoo.com/', 'https://finance.yahoo.com/quote/DECK/', 'https://finance.yahoo.com/sectors/consumer-defensive/', 'https://sports.yahoo.com/nba/', 'https://finance.yahoo.com/quote/ADS.DE/key-statistics/', 'https://www.yahoo.com/tech/gaming/', 'https://sports.yahoo.com/soccer/serie-a/', 'https://sports.yahoo.com/mlb/news/', 'https://www.yahoo.com/entertainment/', 'https://www.yahoo.com/entertainment/tagged/videos/', 'https://www.y

In [1]:
# Example usage
website_url = 'https://finance.yahoo.com/quote/ADS.DE/news/'  # Replace with your desired website
process_website(website_url)

NameError: name 'process_website' is not defined

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [5]:
sentiment_pipeline.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 