In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm

from functions import *
import os
from datetime import datetime

from pathlib import Path
# Work from the project root so relative paths such as 'data/preprocessed/'
# resolve. The hop is guarded: an unconditional chdir would climb one more
# directory every time this cell is re-run in the same kernel.
# NOTE(review): assumes the notebook's own folder has no 'data/preprocessed' — confirm.
if not Path('data/preprocessed').is_dir():
    os.chdir(Path.cwd().parents[0]) # Set path as if it was in root

In [None]:
# Inputs and outputs live under this folder; files are stamped with the
# current year-month (e.g. fundamentals_2024-05.csv).
root = 'data/preprocessed/'
# Double quotes inside the replacement field: reusing the outer single quote
# there is a SyntaxError on Python versions before 3.12.
fund_path = root + f'fundamentals_{datetime.now().strftime("%Y-%m")}.csv'

# `load` is not defined in this notebook; presumably it comes from the
# star-import of `functions` — TODO confirm what it returns/raises.
fund_df = load(fund_path)
fund_df['funds_date'] = pd.to_datetime(fund_df['funds_date'])

In [None]:
def scrape_etf_profile(isin):
    """Download the justETF profile page for one ISIN and pull out key facts.

    Args:
        isin: Identifier interpolated into the profile URL's ``isin`` query
            parameter.

    Returns:
        dict with ``isin``, ``name``, ``wkn``, ``ticker`` (each ``None`` when
        the element is absent) and ``tags`` (list, possibly empty). Every
        label/value pair found in the overview section is added under the
        lower-cased, underscore-joined label. From the "basics" table the
        keys ``fund_currency``, ``volatility_1y``, ``domicile`` and
        ``provider`` are added when a matching row exists.

    Raises:
        Whatever ``requests.get`` or ``response.raise_for_status()`` raise
        (connection problems, timeout after 10 s, HTTP error status).
    """

    def stripped_text(tag):
        # Missing elements become None instead of raising AttributeError.
        return tag.text.strip() if tag else None

    response = requests.get(
        f"https://www.justetf.com/en/etf-profile.html?isin={isin}",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout=10,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Header block: fund name plus the two identifier spans (WKN, ticker).
    data = {
        'isin': isin,
        'name': stripped_text(soup.find('h1', id='etf-title')),
        'wkn': stripped_text(soup.find('span', id='etf_identifier_1')),
        'ticker': stripped_text(soup.find('span', id='etf_identifier_2')),
    }

    # Label badges (e.g. "Savings plan", "ETF").
    badge_box = soup.find('div', class_='m-xs-sep-m')
    data['tags'] = (
        [badge.get_text(strip=True) for badge in badge_box.find_all('span', class_='label')]
        if badge_box
        else []
    )

    # Overview section: every label/value pair becomes its own key.
    overview = soup.find('div', class_='data-overview')
    overview_items = overview.find_all('div', class_='d-flex-column') if overview else ()
    for entry in overview_items:
        caption = entry.find('div', class_='vallabel')
        figure = entry.find('div', class_='val')
        if not (caption and figure):
            continue
        key = caption.get_text(strip=True).lower().replace(' ', '_')
        data[key] = figure.get_text(separator=' ', strip=True)

    # "Basics" table: keep only selected rows. The first fragment contained
    # in the normalised row label decides the output key.
    wanted_rows = (
        ('fund_currency', 'fund_currency'),
        ('volatility_1_year', 'volatility_1y'),
        ('fund_domicile', 'domicile'),
        ('fund_provider', 'provider'),
    )
    basics = soup.find('div', id='basics')
    table = basics.find('table', class_='etf-data-table') if basics else None
    table_rows = table.find_all('tr') if table else ()
    for row in table_rows:
        cells = row.find_all('td')
        if len(cells) != 2:
            # Only plain label/value rows are of interest.
            continue
        label_cell, value_cell = cells
        label = label_cell.text.strip().lower().replace(' ', '_').replace('/', '_')

        # Prefer the nested value element; otherwise use the cell's own text.
        nested = value_cell.find('div', class_='val') or value_cell.find('span', class_='val2')
        value = (nested if nested else value_cell).text.strip()

        for fragment, out_key in wanted_rows:
            if fragment in label:
                data[out_key] = value
                break

    return data

In [None]:
# Incrementally scrape justETF profiles for every ISIN in `fund_df` that is
# not yet present in this month's cache file, then merge old and new rows.
# (Double quotes inside the replacement field keep the f-string valid on
# Python versions before 3.12.)
scraped_path = root + f'justetf_{datetime.now().strftime("%Y-%m")}.csv'

# Start from the cached results when they can be loaded. Without a usable
# cache (e.g. first run of the month) everything is scraped. The fallback is
# explicit so the cell neither raises NameError on a fresh kernel nor picks
# up a stale `scraped_df` left over from an earlier run.
try:
    cached_df = load(scraped_path)
except Exception as exc:
    print(f'No usable cache at {scraped_path} ({exc!r}); scraping every ISIN.')
    cached_df = None

# One request per distinct, non-null ISIN.
all_isins = fund_df['isin'].dropna().drop_duplicates()
if cached_df is None:
    missing = all_isins
else:
    missing = all_isins[~all_isins.isin(cached_df['isin'])]

scraped_data = []
for isin in tqdm(missing, total=len(missing)):
    try:
        scraped_data.append(scrape_etf_profile(isin))
    except Exception as e:
        # Stop at the first failure; rows collected so far are still merged
        # below, and a later run resumes with the remaining ISINs.
        print(e)
        break
    sleep(2)  # pause between successful requests

df = pd.DataFrame(scraped_data)  # rows scraped in this run
if cached_df is None:
    scraped_df = df
else:
    scraped_df = pd.concat([cached_df, df], ignore_index=True).drop_duplicates(subset='isin')
scraped_df

In [None]:
# Persist the merged scrape results as this month's cache file.
# Double quotes inside the replacement field: reusing the outer single quote
# there is a SyntaxError on Python versions before 3.12.
scraped_path = root + f'justetf_{datetime.now().strftime("%Y-%m")}.csv'
scraped_df.to_csv(scraped_path, index=False)