In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import time
import re
import plotly.express as px
import plotly.io as pio

# Select Plotly's "notebook_connected" renderer as the default used by every
# fig.show() call further down in this notebook.
pio.renderers.default = "notebook_connected"

In [7]:
# --- Scraper configuration -------------------------------------------------
# SQLite file that holds the scraped articles.
DATABASE_NAME = "dataprivacy.db"

# Listing URL for pages 2..N; the placeholder receives the page number.
BASE_URL_TEMPLATE = "https://dataprivacy.com.br/category/noticias/page/{}/"

# HTTP headers sent with every request (browser-like User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

print("Configura√ß√£o pronta!")

Configura√ß√£o pronta!


In [8]:
def create_database(db_path=None):
    """Create the SQLite database and the ``articles`` table if they are missing.

    The table has an auto-incrementing ``id`` plus ``title``, ``date``, ``url``
    and ``source`` text columns. ``url`` is declared UNIQUE, so inserting the
    same URL twice raises ``sqlite3.IntegrityError``.

    Args:
        db_path: Path of the SQLite file to open/create. When omitted, the
            module-level ``DATABASE_NAME`` is used.
    """
    conn = sqlite3.connect(DATABASE_NAME if db_path is None else db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                date TEXT,
                url TEXT UNIQUE,
                source TEXT
            )
        """)
        conn.commit()
    finally:
        # Release the connection even when the DDL fails, so a failed cell
        # does not leave the database file open for the rest of the session.
        conn.close()
    print("‚úÖ Banco e tabela prontos")

# Make sure the schema exists before the cells below insert into or read from it.
create_database()

‚úÖ Banco e tabela prontos


In [9]:
def insert_article(title, date, url, source="Data Privacy Brasil", db_path=None):
    """Insert one article row, skipping URLs that are already stored.

    Args:
        title: Article headline.
        date: Publication date, stored as the text that was scraped.
        url: Article URL; the ``articles.url`` column is UNIQUE.
        source: Label of the site the article came from.
        db_path: Path of the SQLite file to write to. When omitted, the
            module-level ``DATABASE_NAME`` is used.

    Returns:
        True when the row was inserted, False when SQLite rejected it with an
        ``IntegrityError`` (i.e. the URL is already in the table).
    """
    conn = sqlite3.connect(DATABASE_NAME if db_path is None else db_path)
    try:
        conn.execute("""
            INSERT INTO articles (title, date, url, source)
            VALUES (?, ?, ?, ?)
        """, (title, date, url, source))
        conn.commit()
        return True
    except sqlite3.IntegrityError:
        # Duplicate URL: report "not inserted" instead of failing the scrape.
        return False
    finally:
        conn.close()

In [13]:
def _parse_article(article):
    """Extract ``(title, link, date)`` from one listing element.

    Returns None when the element has no headline or no usable link, so the
    caller can simply skip it.
    """
    title_tag = article.select_one("h2.entry-title, h3.entry-title, h2 a, h3 a")
    if not title_tag:
        title_tag = article.find("h2") or article.find("h3")
    if not title_tag:
        return None

    title = title_tag.get_text(strip=True)

    # Prefer the headline's own anchor; only fall back to the first anchor in
    # the element (which may be a thumbnail or category link) when the
    # headline carries none.
    if title_tag.name == 'a':
        link_tag = title_tag
    else:
        link_tag = title_tag.find("a") or article.find("a")

    # .get() avoids a KeyError on anchors without an href attribute.
    link = link_tag.get("href", "") if link_tag else ""
    if not link:
        # Without a URL the row cannot be de-duplicated or opened later.
        return None

    date_tag = article.select_one("time, .date, .post-date, .entry-date, .published")
    date = date_tag.get_text(strip=True) if date_tag else ""
    if not date:
        # Fallback: look for a DD.MM.YYYY / DD/MM/YYYY pattern anywhere in the element.
        match = re.search(r'\d{2}[./]\d{2}[./]\d{4}', article.get_text())
        if match:
            date = match.group(0)

    return title, link, date


inserted_count = 0
page = 1
max_failures = 3  # consecutive request failures tolerated before giving up
failures = 0

# URLs already seen in this run; used to detect a listing that keeps
# returning the same content and would otherwise loop forever.
seen_urls = set()

while True:
    # The first page lives at the category root; later pages use /page/N/.
    # The site may ignore /page/N/ and serve the first page again.
    if page == 1:
        url = "https://dataprivacy.com.br/category/noticias/"
    else:
        url = BASE_URL_TEMPLATE.format(page)

    print(f"Coletando p√°gina {page}: {url}")

    try:
        r = requests.get(url, headers=HEADERS, timeout=15)

        if r.status_code == 404:
            print("  P√°gina n√£o encontrada (404). Fim da raspagem.")
            break

        # Any other HTTP error is treated as a failed request (retried below)
        # instead of parsing the error page as if it were a listing.
        r.raise_for_status()
        failures = 0

        soup = BeautifulSoup(r.text, "html.parser")

        articles = soup.find_all("article")
        if not articles:
            articles = soup.select(".post, .entry, .card")

        print(f"  {len(articles)} not√≠cias encontradas")

        if not articles:
            print("  Nenhum artigo encontrado.")
            break

        # Number of URLs on this page that were not seen earlier in the run.
        page_new_urls = 0

        for article in articles:
            parsed = _parse_article(article)
            if parsed is None:
                continue
            title, link, date = parsed

            # Skip anything already handled in this session.
            if link in seen_urls:
                continue

            seen_urls.add(link)
            page_new_urls += 1

            if insert_article(title, date, link):
                inserted_count += 1

        # A page without a single new URL means pagination ended or is looping.
        if page_new_urls == 0:
            print("‚ö†Ô∏è Todos os artigos desta p√°gina j√° foram lidos. Loop detectado ou fim da pagina√ß√£o real.")
            break

        page += 1
        time.sleep(1)

    except requests.RequestException as e:
        # Network/HTTP problem: retry the same page a limited number of times.
        failures += 1
        print(f"Erro na p√°gina {page}: {e}")
        if failures >= max_failures:
            print(f"  {failures} falhas consecutivas. Encerrando a raspagem.")
            break
        time.sleep(1)
    except Exception as e:
        # Anything else (parsing, database) is not retried.
        print(f"Erro na p√°gina {page}: {e}")
        break

print(f"\nüì• Total de novas not√≠cias inseridas: {inserted_count}")

Coletando p√°gina 1: https://dataprivacy.com.br/category/noticias/
  32 not√≠cias encontradas
Coletando p√°gina 2: https://dataprivacy.com.br/category/noticias/page/2/
  32 not√≠cias encontradas
‚ö†Ô∏è Todos os artigos desta p√°gina j√° foram lidos. Loop detectado ou fim da pagina√ß√£o real.

üì• Total de novas not√≠cias inseridas: 0


In [11]:
def load_articles(db_path=None):
    """Load every row of the ``articles`` table into a DataFrame.

    Args:
        db_path: Path of the SQLite file to read. When omitted, the
            module-level ``DATABASE_NAME`` is used.

    Returns:
        A ``pandas.DataFrame`` with the result of ``SELECT * FROM articles``.
    """
    conn = sqlite3.connect(DATABASE_NAME if db_path is None else db_path)
    try:
        return pd.read_sql("SELECT * FROM articles", conn)
    finally:
        # Close the connection even if the query fails (e.g. missing table).
        conn.close()

# Read back everything currently stored in the articles table, report the row
# count, and preview the first rows.
df_db = load_articles()
print(f"üì¶ Total no banco: {len(df_db)} registros")
display(df_db.head())

üì¶ Total no banco: 32 registros


Unnamed: 0,id,title,date,url,source
0,1,Lan√ßamento | Document√°rio ‚ÄúIA com Direitos: in...,09.06.2025,https://dataprivacy.com.br/lancamento-document...,Data Privacy Brasil
1,2,ANPD decide primeiro recurso no caso INSS: ent...,26.07.2024,https://dataprivacy.com.br/anpd-decide-primeir...,Data Privacy Brasil
2,3,"Seus dados, minha IA",01.07.2024,https://dataprivacy.com.br/mudanca-termos-de-s...,Data Privacy Brasil
3,4,Inscri√ß√µes abertas para o Privacy Day Summit 2024,19.01.2024,https://dataprivacy.com.br/inscricoes-abertas-...,Data Privacy Brasil
4,5,Data Privacy Brasil envia √† ANPD sugest√µes par...,22.10.2021,https://dataprivacy.com.br/data-privacy-brasil...,Data Privacy Brasil


In [12]:
if not df_db.empty:
    # 1. Timeline.
    # Dates are stored as day-first text such as "26.07.2024" (separator "."
    # or "/"). Sorting those strings orders them by day-of-month, not
    # chronologically, so they are parsed to real datetimes first. Values that
    # do not match the pattern (including empty strings) become NaT and are
    # left out of the chart.
    parsed_dates = pd.to_datetime(
        df_db['date'].astype(str).str.strip().str.replace('/', '.', regex=False),
        format='%d.%m.%Y',
        errors='coerce',
    ).dropna()

    date_count = (
        parsed_dates.value_counts()
        .sort_index()
        .rename_axis('Date')
        .reset_index(name='Count')
    )

    fig1 = px.line(date_count, x='Date', y='Count', title='Frequ√™ncia de Not√≠cias por Data')
    fig1.show()

    # 2. Word frequencies in the titles, drawn as a treemap.
    text = ' '.join(df_db['title'].astype(str)).lower()
    # Keep only tokens of four or more word characters.
    words = re.findall(r'\b\w{4,}\b', text)

    stopwords = {'para', 'sobre', 'como', 'pela', 'pelo', 'est√°', 'ser√°', 'entre', 'nesta'}
    words = [w for w in words if w not in stopwords]

    # Top 30 tokens; explicit dtype keeps an empty word list from triggering
    # pandas' empty-Series dtype handling.
    wc = (
        pd.Series(words, dtype="object")
        .value_counts()
        .head(30)
        .rename_axis('Word')
        .reset_index(name='Frequency')
    )

    fig2 = px.treemap(wc, path=['Word'], values='Frequency', title='Palavras mais frequentes nos t√≠tulos')
    fig2.show()
else:
    print("Sem dados para visualizar.")