In [None]:
from ScraperHelper.Scrape import *
import pandas as pd

In [None]:
# Build the scraping helper; logging_path is set to "scrape.log".
helper = driverHelper(logging_path="scrape.log")

# Browser options: incognito argument and the "eager" page-load strategy.
helper.addOptions(page_load_strategy="eager", arguments=["--incognito"])

# Create the driver, passing "nordvpn" as the VPN provider.
helper.forceCreateDriver(vpn_provider="nordvpn")

In [None]:
# Read the stop-word file and keep every distinct whitespace-separated token
# once, in sorted order.
# The encoding is passed explicitly because the platform default (for example
# cp1252 on Windows) cannot decode Vietnamese text reliably.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open("data/vietnamese-stopwords.txt", "r", encoding="utf-8") as f:
    stopwords = sorted(set(f.read().split()))

In [None]:
# Positions of the stop words that occur as a substring of the word that
# immediately follows them in the sorted list.
repeat_idx = [
    position
    for position, (current, following) in enumerate(zip(stopwords, stopwords[1:]))
    if current in following
]

# 1. Scrape search results

In [None]:
# Create the driver again before the search crawl, using the same VPN provider
# argument as the setup cell.
# NOTE(review): presumably this gives a fresh browser/VPN session -- confirm.
helper.forceCreateDriver(vpn_provider="nordvpn")

In [None]:
# Search URL template: the first slot receives the keyword, the second the
# result page number.
search_url = "https://vtv.vn/tim-kiem.htm?keywords={}&page={}"

# Keep every stop word whose position was not recorded in repeat_idx.
# Membership is tested against a set so each check is O(1) instead of a scan of
# the whole repeat_idx list for every word.
_skipped_positions = set(repeat_idx)
words_in_title = [
    word
    for idx, word in enumerate(stopwords)
    if idx not in _skipped_positions
]

In [None]:
# Accumulators for the search crawl. They live in their own cell so that
# re-running the crawl cell does not reset them.
#   all_articles  -- one dict per collected search result
#   scraped_words -- keywords whose result pages were fully processed
#   scraped_link  -- set that the crawl consults before appending an article
all_articles, scraped_words, scraped_link = [], set(), set()

In [None]:
# Crawl the search result pages of every keyword and accumulate the article
# metadata (link, title, datetime, description) in all_articles. Progress is
# checkpointed to a feather file every third page and after each keyword.
for word in words_in_title:
    if word in scraped_words:
        # Keyword already completed earlier in this session; skipping it lets
        # the cell be re-run after an interruption.
        continue

    logging.info(f'Scraping search word {word}')
    page = 1
    recent_article_link = ""
    while True:
        logging.info(f'Scraping page {page}')
        helper.forceGet(
            url=search_url.format(word, page),
            error_message_in_page=["403 Forbidden"],
        )

        logging.info("Getting articles info")
        scraped_articles = []
        # Each <li> under #SearchSolr1 represents one article of the result page.
        for ele in helper.driver.find_elements(By.XPATH, """//*[@id="SearchSolr1"]/li[*]"""):
            # Look the anchor up once: it carries both the link and the title.
            anchor = helper.forceFindElement(By.TAG_NAME, "a", element_as_finder=ele)
            scraped_articles.append(
                {
                    "link": anchor.get_attribute("href"),
                    "title": anchor.get_attribute("title").strip(),
                    "datetime": helper.forceFindElement(
                        By.CLASS_NAME, "time", element_as_finder=ele
                    ).get_attribute("innerHTML").strip(),
                    "description": helper.forceFindElement(
                        By.CLASS_NAME, "sapo", element_as_finder=ele
                    ).get_attribute("innerHTML").strip(),
                }
            )

        if len(scraped_articles) == 0:
            logging.warning(f"There aren't any article when searching for keyword {word}")
            break

        # A page whose first article equals the previous page's first article
        # is treated as the end of the results for this keyword.
        if recent_article_link == scraped_articles[0]["link"]:
            logging.info(f"Done scraping article from search word {word}")
            break
        recent_article_link = scraped_articles[0]["link"]

        for article in scraped_articles:
            if article["link"] not in scraped_link:
                all_articles.append(article)
                # Remember the link so the same article is not stored again,
                # whether it reappears for this keyword or a later one.
                scraped_link.add(article["link"])

        # Periodic checkpoint so a crash does not lose everything collected so far.
        if page % 3 == 0:
            pd.DataFrame(all_articles).to_feather("data/vtv_articles_2.feather")
        page += 1

    scraped_words.add(word)
    pd.DataFrame(all_articles).to_feather("data/vtv_articles_2.feather")

# 2. Scrape each article

In [None]:
# Load the saved search results and build a list of (row index, link) pairs.
_search_results = pd.read_feather("data/vtv_articles_2.feather")
_link_by_index = _search_results["link"].to_dict()
index_link = list(_link_by_index.items())

In [None]:
# Accumulators for the article crawl: an empty list for per-article results and
# an empty set of indices.
# NOTE(review): scraped_index is presumably meant to track the article indices
# that are already done -- confirm against the crawl cell.
all_articles_contents, scraped_index = [], set()

In [None]:
# XPath locators for each known article layout. The layouts are tried in the
# order they are listed here (dict insertion order).
# Shared path prefixes are factored out; the resulting strings are unchanged.
_ENTRY_BODY = '//*[@id="entry-body"]'
_BASIC_ROOT = '//*[@id="admWrapsite"]/div[3]/div[2]/div[2]/div[3]/div[1]/div[1]'
_SPORTS_ROOT = '//*[@id="admWrapsite"]/div[3]/div[2]/div/div[3]/div[1]/div[1]'
_HEALTH_ROOT = '//*[@id="Main"]/div/div/div/div[2]/div[2]/div[1]/div[1]'

# Notes carried over from the original inline hints:
#   contents -> follow with .find_elements(By.TAG_NAME, "p")
#   tags     -> follow with .find_elements(By.TAG_NAME, "a")
#   author   -> for "basic": .get_attribute("innerHTML").split("<span")[0]
element_options = dict(
    basic=dict(
        contents=_ENTRY_BODY,
        tags=_BASIC_ROOT + "/div[9]",
        author=_BASIC_ROOT + "/p",
    ),
    sports=dict(
        contents=_ENTRY_BODY,
        tags=_SPORTS_ROOT + "/div[8]",
        author=_SPORTS_ROOT + "/p/b",
    ),
    health=dict(
        contents=_HEALTH_ROOT + "/div[3]",
        tags=_HEALTH_ROOT + "/div[5]",
        author=_HEALTH_ROOT + "/div[1]/div[1]/span",
    ),
)


In [None]:
# Visit every article link, detect which known layout the page uses and extract
# its tags, author and body text. One record per article is stored in
# all_articles_contents; handled indices are remembered in scraped_index so the
# cell can be re-run without visiting the same article twice.
for article_idx, article_link in index_link:
    if article_idx in scraped_index:
        continue

    logging.info(f"ARTICLE_{article_idx}: {article_link}")
    helper.forceGet(
        article_link,
        try_refresh_before_retry=True,
    )
    # Placeholder values that remain in the record if no known layout matches.
    tmp_info = {
        "contents": "<Error>",
        "tags": "<Error>",
        "author": "<Error>",
    }
    # Up to three passes over the known layouts.
    for _attempt in range(3):
        found_page_type = False
        for page_type, elements in element_options.items():
            try:
                # The tags container is the probe: if it cannot be located,
                # the page is considered not to be of this layout.
                tmp_info["tags"] = [
                    tag.text for tag in
                    helper.driver.find_element(By.XPATH, elements["tags"])
                                .find_elements(By.TAG_NAME, "a")
                ]
                found_page_type = True
            except helper.element_exception:
                logging.warning(f"This page is not of type {page_type}")
                continue

            print(page_type)
            author_element = helper.forceFindElement(By.XPATH, elements["author"])
            if page_type == "basic":
                # Keep only the markup that precedes the first "<span".
                tmp_info["author"] = (
                    author_element.get_attribute("innerHTML").split("<span")[0]
                )
            else:
                tmp_info["author"] = author_element.text

            tmp_info["contents"] = [
                content_element.text for content_element in
                helper.driver.find_elements(By.XPATH, elements["contents"])
            ]
            break
        if found_page_type:
            break

    # Store the result; previously tmp_info was overwritten on the next
    # iteration and the scraped data was lost.
    all_articles_contents.append(
        {"index": article_idx, "link": article_link, **tmp_info}
    )
    scraped_index.add(article_idx)



    