# SCRAPING WAR HISTORY ONLINE NEWS
### 2023.12.13

In [12]:
from bs4 import BeautifulSoup, NavigableString
from urllib.parse import urljoin
from tqdm import tqdm
import requests
import pandas as pd
import csv

In [None]:
# URL prefix of the news listing pages; the page number is appended to it.
base_url = "https://www.warhistoryonline.com/category/news/page/"

In [18]:
# Extract the text of a tag (e.g. <p>), padding <a> link text with spaces.
def get_text_with_spaces(tag):
    """Return the text of *tag* with spaces placed around ``<a>`` link text.

    Text nodes are concatenated in document order. For each ``<a>`` element
    a single space is inserted before its text (unless the text collected so
    far already ends with a space, or nothing has been collected yet) and a
    single space is appended after it, so that link text is not glued to
    the neighbouring words.

    Args:
        tag: A BeautifulSoup ``Tag`` whose text should be extracted.

    Returns:
        The assembled text as a ``str``.
    """
    parts = []

    def _collect(node):
        # Walk direct children only and recurse manually. Iterating
        # ``.descendants`` would visit an <a> tag *and then* the text nodes
        # inside it, which emitted every link text twice.
        for child in node.children:
            if isinstance(child, NavigableString):
                parts.append(str(child))
            elif child.name == 'a':
                if parts and not parts[-1].endswith(' '):
                    parts.append(' ')
                # get_text() already covers everything nested in the anchor,
                # so the anchor's children are deliberately not visited.
                parts.append(child.get_text())
                parts.append(' ')
            else:
                _collect(child)

    _collect(tag)
    return ''.join(parts)

# Scrape news articles and save them to a CSV file.
def _fetch_soup(session, url, timeout):
    """Download *url* with *session* and return it parsed by the lxml parser."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _extract_article(article_soup):
    """Return ``(date, text)`` extracted from one parsed article page."""
    date_tag = article_soup.select_one('time')
    date = date_tag.get_text(strip=True) if date_tag else 'No Date'

    # Keep every paragraph except those nested inside a <footer>.
    p_texts = [
        get_text_with_spaces(p)
        for p in article_soup.find_all('p')
        if p.find_parent('footer') is None
    ]
    return date, ' '.join(p_texts)


def scrape_news(start_page, end_page, file_name, *, timeout=30):
    """Scrape the articles of a range of listing pages into a CSV file.

    For each listing page ``f"{base_url}{page_num}"`` the article links
    matching ``h3.entry-title a[href$=".html"]`` are followed, and one row
    ``[Title, Date, Text]`` is written per article. The file is opened in
    write mode (existing content is replaced) with ``utf-8-sig`` encoding.

    Errors are reported with ``print`` and do not abort the run: a failing
    listing page is skipped, and a failing article is skipped without
    dropping the remaining articles of the same page.

    Args:
        start_page: First listing page number (inclusive).
        end_page: Last listing page number (inclusive).
        file_name: Path of the CSV file to write.
        timeout: Seconds passed as ``timeout`` to each HTTP request, so a
            stalled server cannot block the run forever.
    """
    with open(file_name, 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Date', 'Text'])

        # One session for the whole run so connections can be reused.
        with requests.Session() as session:
            for page_num in tqdm(range(start_page, end_page + 1), desc="Processing pages"):
                url = f"{base_url}{page_num}"
                try:
                    soup = _fetch_soup(session, url, timeout)
                    html_links = soup.select('h3.entry-title a[href$=".html"]')
                except requests.RequestException as e:
                    print(f"Request error on page {page_num}: {e}")
                    continue
                except Exception as e:
                    print(f"Unexpected error on page {page_num}: {e}")
                    continue

                for link_tag in tqdm(html_links, desc=f"Processing links on {page_num}"):
                    # Resolve against the listing URL in case the href is relative.
                    link = urljoin(url, link_tag['href'])
                    title = link_tag.get_text(strip=True)

                    # Handle failures per article so that one bad link does
                    # not discard the rest of the page.
                    try:
                        article_soup = _fetch_soup(session, link, timeout)
                        date, all_text = _extract_article(article_soup)
                        writer.writerow([title, date, all_text])
                    except requests.RequestException as e:
                        print(f"Request error on article {link}: {e}")
                    except Exception as e:
                        print(f"Unexpected error on article {link}: {e}")

In [None]:
# Run the scraper: listing pages 12 through 15, written to war_news_text.csv
scrape_news(start_page=12, end_page=15, file_name='war_news_text.csv')