In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from time import sleep
import pandas as pd
from datetime import datetime, timedelta

In [None]:
# Path of the CSV to read; later cells use its 'cod' and 'dates' columns. Fill in before running.
file_input = ""
# Path of the CSV that receives the matched events. Fill in before running.
file_output = ""

In [None]:
def fetch_data(stock_code, year, timeout=30):
    """Download the "Fato Relevante" listing page for one trading code and year.

    Args:
        stock_code: Value interpolated into the ``cod_negociacao`` query parameter.
        year: Value interpolated into the ``ano`` query parameter.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without it a single stalled connection would block the whole run.

    Returns:
        The result of ``find_all("td", class_="esquerda")`` on the parsed page,
        i.e. every ``<td class="esquerda">`` cell in document order.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (raised by ``raise_for_status``).
    """
    url = f"https://www.investsite.com.br/informacoes_periodicas_detalhe.php?cod_negociacao={stock_code}&ano={year}&categoria=Fato%20Relevante&ordem_data=2"
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages: parsing them would look like "no events" and
    # silently drop data for this stock/year.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find_all("td", class_="esquerda")

def _parse_events(tds, reference_date, window_days):
    """Convert one page's flat cell list into event rows that fall inside the date window."""
    ref_day = reference_date.date()
    rows = []
    # Cells are consumed in groups of 6 per event. Stepping only over complete
    # groups means a truncated trailing group is ignored instead of raising.
    for start in range(0, len(tds) - 5, 6):
        cells = tds[start:start + 6]
        event_date_str = cells[0].get_text(strip=True)
        event_day = datetime.strptime(event_date_str, '%d/%m/%Y').date()

        # Compare calendar dates: subtracting a reference that carries a time of
        # day and reading `.days` floors the result and skews the window.
        if abs((event_day - ref_day).days) > window_days:
            continue

        anchor = cells[2].find('a')
        # Keep the event even when the link cell has no usable <a href=...>.
        link = anchor.get('href', '') if anchor is not None else ''
        rows.append([
            event_date_str,
            cells[1].get_text(strip=True),
            link,
            cells[5].get_text(strip=True),
        ])
    return rows


def extract_data(stock_code, reference_date, window_days=14):
    """Collect the listed events dated within ``window_days`` of ``reference_date``.

    Every calendar year touched by the window
    ``[reference_date - window_days, reference_date + window_days]`` is fetched
    via ``fetch_data``: the reference year first, then any neighbouring years.
    Each page is parsed on its own so a malformed page cannot misalign another.

    Args:
        stock_code: Trading code forwarded to ``fetch_data``.
        reference_date: ``datetime``-like object providing ``.year``, ``.date()``
            and ``timedelta`` arithmetic.
        window_days: Half-width of the window in calendar days (default 14).

    Returns:
        A list of ``[event_date_str, second_date_str, link, content]`` lists, in
        page order. ``link`` is ``''`` when the third cell has no anchor with an
        ``href``.

    Raises:
        ValueError: if the first cell of a complete group is not a ``dd/mm/YYYY`` date.
    """
    window = timedelta(days=window_days)
    current_year = reference_date.year
    first_year = (reference_date - window).year
    last_year = (reference_date + window).year

    # Reference year first, then whichever neighbouring years the window reaches.
    years = [current_year] + [
        year for year in range(first_year, last_year + 1) if year != current_year
    ]

    data = []
    for year in years:
        data.extend(_parse_events(fetch_data(stock_code, year), reference_date, window_days))
    return data

In [None]:
# Read the input CSV and parse its 'dates' column using the fixed
# 'YYYY-MM-DD HH:MM:SS' layout, then preview the first rows.
df = pd.read_csv(file_input).assign(
    dates=lambda frame: pd.to_datetime(frame['dates'], format='%Y-%m-%d %H:%M:%S')
)
df.head()

In [None]:
# Stream results straight to disk: one CSV line per (input row, matching event) pair.
# Input rows with no matching event, or whose lookup raises, produce no output line.
with open(file_output, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Output header: the input columns followed by the four scraped fields.
    headers = [*df.columns, 'fr_date1', 'fr_date2', 'fr_link', 'fr_subject']
    writer.writerow(headers)

    for index, row in df.iterrows():
        try:
            stock_code = row['cod']  # trading code of the company
            reference_date = row['dates']
            sleep(2)  # throttle: pause before this row's requests
            extracted_data = extract_data(stock_code, reference_date)

            # Repeat the input row's values in front of every event found for it.
            row_values = list(row)
            for event_fields in extracted_data:
                date1, date2, link, text = event_fields
                writer.writerow(row_values + [date1, date2, link, text])

            if not index % 50:
                print(f"Progress... {index} rows processed.")
        except Exception as e:
            # Best effort: report the failing row, back off, move on to the next one.
            print(f"Error: {e}")
            print(row)
            print()
            sleep(5)
            continue

print(f"Data extracted and saved in {file_output}")
