# Extração de Dados

Nesta seção, as mensagens de usuários de tópicos do fórum Wrong Planet (wrongplanet.net) são extraídas, organizadas e salvas em arquivos CSV, um arquivo por tópico. Utilizamos a biblioteca `requests` para acessar cada página, `BeautifulSoup` para a análise do HTML e `pandas` para montar e gravar as tabelas.

In [None]:
# Importação de bibliotecas
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

In [None]:
def _extract_post_fields(post):
    """Return ``(user_name, gender, message)`` for one ``message-col`` element.

    Any part that cannot be found in the markup is returned as ``None`` so
    that every post always yields exactly one value per column.
    """
    user_name = None
    gender = None

    # The author information is looked up in the 'user-col' div that
    # precedes the message column as a sibling.
    user = post.find_previous_sibling('div', attrs={'class': 'user-col'})
    if user is not None:
        name_link = user.find('a', href=True)
        if name_link is not None:
            user_name = name_link.text.strip()

        gender_text = user.find(string=lambda text: text and 'Gender:' in text)
        if gender_text is not None:
            gender = gender_text.split('Gender:')[1].strip()

    message_content = post.find('div', attrs={'class': 'message-content'})
    message = message_content.text.strip() if message_content is not None else None

    return user_name, gender, message


def _title_to_file_stem(raw_title):
    """Turn the text of a page ``<title>`` into a filesystem-safe file stem.

    Falls back to ``"untitled"`` when nothing usable is left.
    """
    # Spaces become underscores so that the site suffix below, which is
    # spelled with underscores, actually matches and is removed.
    stem = raw_title.strip().replace(" ", "_").replace("|", "").replace(":", "")
    stem = stem.replace("Asperger's_&_Autism_Community_-_Wrong_Planet", "").strip("_")
    # Remove characters that are not allowed in file names on common systems.
    stem = re.sub(r'[\\/*?:"<>|]', "", stem)
    return stem or "untitled"


def extract_and_save_posts(url, output_dir="csv/general autism discussion", timeout=30):
    """Download a forum topic page and save its posts as a CSV file.

    Each ``div.message-col`` element on the page becomes one row with the
    columns 'Nome do Usuário', 'Gênero' and 'Conteúdo da Mensagem'. Missing
    values are stored as empty cells. The CSV is named after the page title
    and written into ``output_dir``, which is created if necessary.

    Args:
        url: Address of the topic page to fetch.
        output_dir: Directory that receives the CSV file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing and saving an error page.
    response.raise_for_status()

    site = BeautifulSoup(response.content, 'html.parser')

    # One tuple per post keeps the three columns aligned even when the
    # user name or gender is missing for some posts.
    rows = [
        _extract_post_fields(post)
        for post in site.find_all('div', attrs={'class': 'message-col'})
    ]
    df = pd.DataFrame(
        rows,
        columns=['Nome do Usuário', 'Gênero', 'Conteúdo da Mensagem'],
    )

    title_tag = site.find('title')
    raw_title = title_tag.text if title_tag is not None else ""
    file_name = f"{_title_to_file_stem(raw_title)}.csv"
    full_name_path = os.path.join(output_dir, file_name)

    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(full_name_path, index=False)

In [3]:
# Topic pages to scrape, built from their numeric topic ids on the forum.
links = [
    f"https://wrongplanet.net/forums/viewtopic.php?t={topic_id}"
    for topic_id in (28967, 306110, 415040, 422455)
]

# Fetch each topic in turn and write its posts to a CSV file.
for link in links:
    extract_and_save_posts(link)
