In [5]:
import os
from io import StringIO

import pandas as pd
import requests
import yfinance as yf

In [14]:
# Date window handed to the downloader ("YYYY-MM-DD" strings).
START_DATE = "2005-01-01"
END_DATE = "2025-12-31"

# Output goes to a "data" folder inside the parent of the current working
# directory, so the resulting paths depend on where the kernel was started.
BASE_DIR = os.path.split(os.getcwd())[0]
OUTPUT_DIR = os.path.join(BASE_DIR, "data")
OUTPUT_FILE_PATH = os.path.join(BASE_DIR, "data", "massive_financial_data.csv")

# Page that is scraped for the ticker table, plus the request headers
# (a browser-like User-Agent) sent along with that request.
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

In [11]:
def get_sp500_tickers():
    """Fetch the ticker symbols listed in the first table of ``WIKI_URL``.

    The page is requested with the module-level ``headers`` and the
    ``Symbol`` column of its first HTML table is read. Dots in symbols are
    replaced with dashes (e.g. ``BRK.B`` -> ``BRK-B``).

    Returns:
        list[str]: Ticker symbols with ``.`` replaced by ``-``.

    Raises:
        RuntimeError: If the page cannot be fetched or parsed. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(WIKI_URL, headers=headers, timeout=30)
        response.raise_for_status()
        # Passing literal HTML to read_html is deprecated; wrap it in a
        # file-like object instead.
        constituents = pd.read_html(StringIO(response.text))[0]
        return [ticker.replace('.', '-') for ticker in constituents['Symbol'].tolist()]
    except Exception as exc:
        # Raise instead of returning an error string: a string return value
        # would only fail later, when it is combined with a list of tickers.
        raise RuntimeError("Error al recopilar los datos") from exc

SP500_TICKERS = get_sp500_tickers()
# Fail fast with a clear message if the lookup did not produce a list:
# concatenating any other value with MARKET_INDICES below would raise an
# unrelated TypeError and hide the real cause.
if not isinstance(SP500_TICKERS, list):
    raise RuntimeError(
        f"No se pudo obtener la lista de tickers del S&P 500: {SP500_TICKERS!r}"
    )

# Index symbols downloaded alongside the individual constituents.
MARKET_INDICES = ["^GSPC", "^VIX"]
ALL_TICKERS = SP500_TICKERS + MARKET_INDICES

  df = pd.read_html(response.text)[0]


In [15]:
def download_massive_data(tickers, start, end, output_path):
    """Download price history for ``tickers`` and save it as a CSV file.

    Args:
        tickers: Ticker symbols forwarded to ``yf.download``.
        start: Start date forwarded to ``yf.download``.
        end: End date forwarded to ``yf.download``.
        output_path: Destination CSV path. Missing parent directories are
            created; a bare filename is written to the current directory.

    Returns:
        None. When no usable data was downloaded a message is printed and
        no file is written.
    """
    print(f"Descargando {len(tickers)} series desde {start}...")

    data = yf.download(
        tickers,
        start=start,
        end=end,
        auto_adjust=True,
        group_by='ticker'
    )

    # Drop columns that contain no observations at all.
    data = data.dropna(axis=1, how='all')

    # Checked after the cleanup so that a download consisting only of empty
    # columns is also reported instead of producing a CSV with no data.
    if data.empty:
        print("No se han descargado datos; no se ha guardado ningún archivo.")
        return

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data.to_csv(output_path)

    print(f"Descarga finalizada. Datos guardados en CSV en: {output_path}")

In [16]:
# Download every configured ticker over the configured date window and
# write the result to the configured CSV path.
download_massive_data(
    tickers=ALL_TICKERS,
    start=START_DATE,
    end=END_DATE,
    output_path=OUTPUT_FILE_PATH,
)

Descargando 505 series desde 2005-01-01...


[*********************100%***********************]  505 of 505 completed


Descarga finalizada. Datos guardados en CSV en: c:\Users\Usuario\Desktop\Code\bachelor-thesis\data\massive_financial_data.csv
