# WEB CRAWLING

## Step 1: Create browser

In [1]:
import os
import shutil
import time

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Chrome start-up flags: run without a visible window ('--headless=new') and
# pass the two switches commonly needed inside containers / restricted hosts.
chrome_option = Options()
for flag in ('--headless=new', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_option.add_argument(flag)

# The path returned by ChromeDriverManager().install() is handed to Service,
# and the resulting browser session is shared by all following cells.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_option, service=service)

## Step 2: Create empty folder

In [3]:
# Crawl settings.
n_pages = 2     # number of listing pages (each holding a list of articles) to visit
article_id = 0  # running counter used to number the saved article files

# Folder that receives one text file per article; created if it does not exist.
root_dir = 'vn_news_corpus'
os.makedirs(root_dir, exist_ok=True)

## Step 3: Get the list of articles + Step 4: Access article contents

In [4]:
def _first_text(scope, xpath):
    """Return the stripped text of the first element matching ``xpath``, or None.

    ``scope`` is the driver or an element to search from. The lookup uses
    ``find_elements``, which yields an empty list when nothing matches, so a
    missing node is reported as None instead of raising.
    """
    matches = scope.find_elements(By.XPATH, xpath)
    return matches[0].text.strip() if matches else None


# Crawl loop: for every listing page, collect the article URLs, open each
# article, extract title / abstract / body / author and save them as one text
# file in root_dir. Articles without the expected main-content container,
# video articles, and articles without a title or abstract are skipped.
for page_idx in range(n_pages):
    # Open listing page number page_idx.
    main_url = f'https://vietnamnet.vn/thoi-su-page{page_idx}'
    driver.get(main_url)

    # Anchor tags of the articles listed on this page.
    news_lst_xpath = '//div[@class ="topStory-15nd"]/div/div[1]/a'
    news_tags = driver.find_elements(By.XPATH, news_lst_xpath)

    # Read every href up front as plain strings: the driver navigates away
    # below, after which the anchor elements can no longer be used.
    # Anchors without an href yield None and are dropped.
    news_page_urls = [
        href
        for href in (news_tag.get_attribute('href') for news_tag in news_tags)
        if href
    ]

    ## Step 4: Access to article contents

    for news_page_url in news_page_urls:
        # Open the article itself and give the page a moment to render.
        driver.get(news_page_url)
        time.sleep(1)

        # Main content container; pages without it are not regular articles.
        main_content_xpath = '//div[@class="content-detail content-detail-type-1 content-mobile-change"]'
        main_content_tags = driver.find_elements(By.XPATH, main_content_xpath)
        if not main_content_tags:
            continue
        main_content_tag = main_content_tags[0]

        # Skip video articles.
        video_content_xpath = '//div[@class="video-detail"]'
        if driver.find_elements(By.XPATH, video_content_xpath):
            continue

        # Title and abstract; an article missing either one is skipped rather
        # than aborting the whole crawl.
        title = _first_text(main_content_tag, './/h1')
        abstract = _first_text(main_content_tag, './/h2')
        if title is None or abstract is None:
            continue

        # NOTE(review): the two XPaths below start with '//' and are therefore
        # evaluated against the whole document even though they are called on
        # main_content_tag. Kept as in the original; prefix them with '.' if
        # the lookup should be limited to the main content - confirm against
        # the page markup first.

        # Author name; falls back to an empty string when absent.
        author_xpath = '//span[@class="name"]'
        author = _first_text(main_content_tag, author_xpath) or ''

        # Body paragraphs, joined into one space-separated string.
        paragraphs_xpath = '//div[@class="maincontent main-content"]/p'
        paragraphs_tags = main_content_tag.find_elements(By.XPATH, paragraphs_xpath)
        paragraphs = ' '.join(tag.text.strip() for tag in paragraphs_tags)

        final_content = '\n\n'.join([title, abstract, paragraphs, author])

        # Write the article to its own numbered text file.
        article_filename = f'article_{article_id:05d}.txt'
        article_savepath = os.path.join(root_dir, article_filename)
        with open(article_savepath, 'w', encoding='utf8') as fp:
            fp.write(final_content)
        print(article_savepath)

        # Advance the counter only after the file was written, so a failed
        # write does not leave a gap in the numbering.
        article_id += 1

        # No driver.back() here: the next iteration navigates with
        # driver.get(), so going back would only reload the listing page.

# NOTE(review): no visible cell calls driver.quit(); close the browser once
# crawling is finished so the headless Chrome process does not linger.

vn_news_corpus\article_00000.txt
vn_news_corpus\article_00001.txt
vn_news_corpus\article_00002.txt
vn_news_corpus\article_00003.txt
vn_news_corpus\article_00004.txt
vn_news_corpus\article_00005.txt
vn_news_corpus\article_00006.txt
vn_news_corpus\article_00007.txt
vn_news_corpus\article_00008.txt
vn_news_corpus\article_00009.txt
vn_news_corpus\article_00010.txt
vn_news_corpus\article_00011.txt
vn_news_corpus\article_00012.txt
vn_news_corpus\article_00013.txt
vn_news_corpus\article_00014.txt
vn_news_corpus\article_00015.txt
vn_news_corpus\article_00016.txt
vn_news_corpus\article_00017.txt
vn_news_corpus\article_00018.txt
vn_news_corpus\article_00019.txt
vn_news_corpus\article_00020.txt
vn_news_corpus\article_00021.txt
vn_news_corpus\article_00022.txt
vn_news_corpus\article_00023.txt
vn_news_corpus\article_00024.txt
vn_news_corpus\article_00025.txt
vn_news_corpus\article_00026.txt
vn_news_corpus\article_00027.txt
vn_news_corpus\article_00028.txt
vn_news_corpus\article_00029.txt


## Step 5: Compress the collected articles into a zip archive

In [5]:
# Pack everything inside root_dir into "<root_dir>.zip". Reusing root_dir
# (instead of repeating the folder name) keeps the archive in sync with the
# folder the crawl actually wrote to. The returned archive path is the cell's
# displayed output.
shutil.make_archive(root_dir, "zip", root_dir)

'd:\\CODEPYTHON\\Course_AI\\MODULE1\\Data_manupulation\\Data_Crawling\\vn_news_corpus.zip'