# Crawling Ukraine & Russia war news from Al Jazeera
### 2023.12.19

In [1]:
from selenium import webdriver # webdriver를 이용해 해당 브라우저를 열기 위해
from selenium.webdriver import ActionChains # 일련의 작업들을(ex.아이디 입력, 비밀번호 입력, 로그인 버튼 클릭...) 연속적으로 실행할 수 있게 하기 위해
from selenium.webdriver.common.keys import Keys # 키보드 입력을 할 수 있게 하기 위해
from selenium.webdriver.common.by import By # html요소 탐색을 할 수 있게 하기 위해
from selenium.webdriver.support.ui import WebDriverWait # 브라우저의 응답을 기다릴 수 있게 하기 위해
from selenium.webdriver.support import expected_conditions as EC # html요소의 상태를 체크할 수 있게 하기 위해
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from tqdm import tqdm
import time
import csv


In [10]:
# Tag page that lists Al Jazeera's Ukraine-Russia articles.
url = 'https://www.aljazeera.com/tag/ukraine-russia-crisis/'
res = requests.get(url) # check that a plain HTTP request to the page works
res

<Response [200]>

In [4]:
def open_webpage(url):
    """Launch a Chrome session, load ``url`` and return the driver.

    Args:
        url: Address of the page to open.

    Returns:
        The ``webdriver.Chrome`` instance with the page loaded. The caller is
        responsible for calling ``quit()`` on it.

    Raises:
        Exception: Whatever ``driver.get`` raised; the browser is closed
            before the exception is re-raised.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except Exception:
        # Navigation failed: close the browser we just started so it is not
        # left running with no reference to it.
        driver.quit()
        raise
    # Fixed pause after navigation, presumably to let dynamic content render.
    time.sleep(5)
    print("URL open")
    return driver


def extract_article_data(link, collected_links, timeout=10):
    """Download one article page and extract its title, date and text.

    Args:
        link: URL of the article to fetch.
        collected_links: Set of URLs that have already been attempted. ``link``
            is added to it before the download, so every URL is tried at most
            once, whether or not the download succeeds.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``'Title'``, ``'Date'`` and ``'Text'``, or ``None``
        when the link was already attempted, the request failed, or the
        response status was not 200.
    """
    if link in collected_links:
        return None
    collected_links.add(link)

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as e:
        # Skip this article instead of letting a single network failure
        # propagate and end the caller's crawl loop.
        print(f"Request failed for {link}: {e}")
        return None
    if response.status_code != 200:
        return None

    article_soup = BeautifulSoup(response.text, 'html.parser')

    # Title: the <h1> inside <header class="article-header">, if both exist.
    header = article_soup.find('header', class_='article-header')
    heading = header.find('h1') if header is not None else None
    title = heading.get_text(strip=True) if heading is not None else 'No Title'

    # Date: the aria-hidden <span> inside <div class="article-dates">, if both exist.
    date_container = article_soup.find('div', class_='article-dates')
    date_span = (
        date_container.find('span', {'aria-hidden': 'true'})
        if date_container is not None
        else None
    )
    date = date_span.get_text(strip=True) if date_span is not None else 'No Date'

    # Body text: every <p> on the page, joined with single spaces.
    paragraphs = [element.get_text() for element in article_soup.find_all('p')]
    return {'Title': title, 'Date': date, 'Text': ' '.join(paragraphs)}


def crawl_articles(driver):
    """Collect articles from the listing page loaded in ``driver``.

    Each pass parses the current page source, fetches every article link not
    attempted before, scrolls to the bottom and clicks the 'show more' button
    to load further entries. The loop ends when that button does not appear
    within 10 seconds, or when any other error occurs. In both cases the
    articles gathered so far are written to ``collected_data.csv``.

    Args:
        driver: Selenium WebDriver with the listing page already open.

    Returns:
        List of article dicts as returned by ``extract_article_data``.
    """
    collected_links = set()
    articles_data = []

    with tqdm(desc='Processing') as progress_bar:
        while True:
            try:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                links = [
                    urljoin('https://www.aljazeera.com', a['href'])
                    for h3 in soup.find_all('h3', class_='gc__title')
                    for a in h3.find_all('a', href=True)
                ]

                # The whole page is re-parsed on every pass; collected_links
                # makes extract_article_data skip links already attempted.
                for link in links:
                    article_data = extract_article_data(link, collected_links)
                    if article_data:
                        articles_data.append(article_data)
                        progress_bar.update(1)

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(5)

                more_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'show-more-button'))
                )
                # Click through JavaScript rather than a native element click.
                driver.execute_script("arguments[0].click();", more_button)
                progress_bar.update(1)  # the counter also advances on each 'show more' click
            except TimeoutException:
                # The button never appeared: treat this as the end of the
                # listing rather than reporting it as an error.
                print("No 'show more' button appeared within 10 seconds; stopping crawl.")
                save_to_csv(articles_data, 'collected_data.csv')
                break
            except Exception as e:
                # Best-effort boundary: on any other failure, keep what has
                # been collected so far instead of losing it.
                print(f"Error occurred: {e}")
                save_to_csv(articles_data, 'collected_data.csv')
                break

    return articles_data

    
def save_to_csv(articles_data, filename):
    """Write the articles to ``filename`` as a CSV file and return them.

    The file is overwritten, encoded as UTF-8 with a BOM, and starts with a
    ``Title, Date, Text`` header row followed by one row per article.

    Args:
        articles_data: Iterable of dicts with 'Title', 'Date' and 'Text' keys.
        filename: Path of the CSV file to create.

    Returns:
        The ``articles_data`` object that was passed in.
    """
    columns = ('Title', 'Date', 'Text')

    with open(filename, mode='w', newline='', encoding='utf-8-sig') as out:
        sheet = csv.writer(out)
        # Header first, then one row per article in column order.
        sheet.writerow(columns)
        sheet.writerows([entry[column] for column in columns] for entry in articles_data)

    return articles_data

In [5]:
# ACTION
url = 'https://www.aljazeera.com/tag/ukraine-russia-crisis/'
driver = open_webpage(url)
try:
    # Keep the list returned by the crawl in articles_data.
    articles_data = crawl_articles(driver)
finally:
    # Close the browser even if the crawl is interrupted (e.g. KeyboardInterrupt),
    # which crawl_articles' own `except Exception` does not catch.
    driver.quit()

# Choose the output file name and save the articles as CSV.
filename = 'Aljazeera_war_news.csv'
save_to_csv(articles_data, filename)

URL open


Processing: 378it [11:56,  1.90s/it]


Error occurred: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=120.0.6099.111)
Stacktrace:
	GetHandleVerifier [0x00007FF7A8082142+3514994]
	(No symbol) [0x00007FF7A7CA0CE2]
	(No symbol) [0x00007FF7A7B476AA]
	(No symbol) [0x00007FF7A7B20AFD]
	(No symbol) [0x00007FF7A7BBCB1B]
	(No symbol) [0x00007FF7A7BD218F]
	(No symbol) [0x00007FF7A7BB5D93]
	(No symbol) [0x00007FF7A7B84BDC]
	(No symbol) [0x00007FF7A7B85C64]
	GetHandleVerifier [0x00007FF7A80AE16B+3695259]
	GetHandleVerifier [0x00007FF7A8106737+4057191]
	GetHandleVerifier [0x00007FF7A80FE4E3+4023827]
	GetHandleVerifier [0x00007FF7A7DD04F9+689705]
	(No symbol) [0x00007FF7A7CAC048]
	(No symbol) [0x00007FF7A7CA8044]
	(No symbol) [0x00007FF7A7CA81C9]
	(No symbol) [0x00007FF7A7C988C4]
	BaseThreadInitThunk [0x00007FFCBE93257D+29]
	RtlUserThreadStart [0x00007FFCBFBCAA58+40]



[{'Title': 'Putin’s confidence upstages Zelenskyy as Ukraine faces uncertain 2024',
  'Date': '21 Dec 2023',
  'Text': 'Russia’s Putin unveils a record war chest as Ukraine’s Zelenskyy struggles to keep his alliance together. Russia’s Vladimir Putin has committed to spending a post-Cold War record $157bn fighting Ukraine and securing Russia next year – a 70 percent increase on this year’s defence budget. But Ukraine’s Volodymyr Zelenskyy has failed to secure $61.4bn from the US and $76.6bn from the European Union, among his staunchest allies, riven by internal disagreements over spending. During lengthy press conferences over the past several days about the wartime issues facing them both, the confidence exuded by Russia’s president clearly outshone the mere faith expressed by Ukraine’s. A Ukrainian summer counteroffensive that petered out without significant territorial gains has divided allied generals, as Russian forces have in recent weeks crept forward on the eastern front, puttin

In [6]:
# Show the current working directory; the CSV files above are opened with
# relative file names.
import os
os.getcwd()

'c:\\Users\\david\\Desktop\\대학원\\war_news_project'