In [None]:
from ScraperHelper.Scrape import *
import pandas as pd
import polars as pl
import subprocess

In [None]:
# Shared scraping helper used by every later cell; logging_path is set to scrape.log.
helper = driverHelper(logging_path="scrape.log")

# Browser options: incognito window and the "eager" page-load strategy.
helper.addOptions(arguments=["--incognito"], page_load_strategy="eager")

# Create the driver, naming NordVPN as the VPN provider.
helper.forceCreateDriver(vpn_provider="nordvpn")

In [None]:
# Load the Vietnamese stop-word list that later serves as the search keywords.
# The file contains accented Vietnamese text, so it is decoded explicitly as UTF-8
# rather than with the platform's default encoding (which is not UTF-8 everywhere).
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("data/vietnamese-stopwords.txt", "r", encoding="utf-8") as f:
    # split() breaks on any whitespace, set() removes duplicates,
    # sorted() gives a stable order that the next cell relies on.
    stopwords = sorted(set(f.read().split()))

In [None]:
# Positions of stop-words that are contained (as a substring) in the word that
# immediately follows them in the sorted list.
repeat_idx = [
    position
    for position, (current, following) in enumerate(zip(stopwords, stopwords[1:]))
    if current in following
]

# 1. Scrape search results

In [None]:
# NOTE(review): this repeats the forceCreateDriver call from the setup cell —
# presumably to start the search stage with a fresh driver; confirm it is still
# needed when the notebook is run top to bottom.
helper.forceCreateDriver( vpn_provider="nordvpn" )

In [None]:
# Search URL template: first placeholder is the keyword, second is the page number.
search_url = "https://vtv.vn/tim-kiem.htm?keywords={}&page={}"

# Keywords to search for: every stop-word whose position was not flagged in repeat_idx.
words_in_title = [
    candidate
    for position, candidate in enumerate(stopwords)
    if position not in repeat_idx
]

In [None]:
# Accumulators for the search stage:
#   all_articles  - one dict per article found in the search results
#   scraped_words - keywords whose result pages were fully processed
#   scraped_link  - article links, consulted by the scraping loop before appending
all_articles, scraped_words, scraped_link = [], set(), set()

In [None]:
# Walk every keyword through the paginated VTV search and collect one record
# (link, title, datetime, description) per listed article.
for word in words_in_title:
    # Keywords already completed (e.g. before an interrupted run) are skipped,
    # so this cell can simply be re-run to resume.
    if word in scraped_words:
        continue

    logging.info(f'Scraping search word {word}')
    page = 1
    recent_article_link = ""
    while True:
        logging.info(f'Scraping page {page}')
        helper.forceGet(
            url=search_url.format(word, page),
            error_message_in_page=["403 Forbidden"]
        )

        logging.info("Getting articles info")
        scraped_articles = []
        # Each <li> under #SearchSolr1 represents one article of the result page.
        for ele in helper.driver.find_elements(By.XPATH, """//*[@id="SearchSolr1"]/li[*]"""):
            # The same <a> carries both the link and the title, so look it up once.
            anchor = helper.forceFindElement(By.TAG_NAME, "a", element_as_finder=ele)
            scraped_articles.append({
                "link": anchor.get_attribute("href"),
                "title": anchor.get_attribute("title").strip(),
                "datetime": helper.forceFindElement(By.CLASS_NAME, "time", element_as_finder=ele)
                            .get_attribute("innerHTML").strip(),
                "description": helper.forceFindElement(By.CLASS_NAME, "sapo", element_as_finder=ele)
                               .get_attribute("innerHTML").strip(),
            })

        if len(scraped_articles) == 0:
            logging.warning(f"There aren't any article when searching for keyword {word}")
            break

        # Stop when this page starts with the same article as the previous page,
        # which the loop treats as "no further pages for this keyword".
        if recent_article_link == scraped_articles[0]["link"]:
            logging.info(f"Done scraping article from search word {word}")
            break
        else:
            recent_article_link = scraped_articles[0]["link"]

        for article in scraped_articles:
            if article["link"] not in scraped_link:
                # Remember the link so the same article found again (other page or
                # other keyword) is not appended a second time.
                scraped_link.add(article["link"])
                all_articles.append(article)

        # Checkpoint every third page so an interruption loses little work.
        if page % 3 == 0:
            pd.DataFrame(all_articles).to_feather("data/vtv_articles_2.feather")
        page += 1

    scraped_words.add(word)
    pd.DataFrame(all_articles).to_feather("data/vtv_articles_2.feather")

# 2. Scrape each article

In [None]:
# (row index, article link) pairs that drive the article-scraping loop.
# NOTE(review): the search stage in this notebook writes data/vtv_articles_2.feather,
# while this cell reads data/vtv_articles.feather — confirm that is the intended input.
article_links = pd.read_feather("data/vtv_articles.feather")["link"]
index_link = [
    (row_idx, url)
    for row_idx, url in article_links.to_dict().items()
]

In [None]:
# Accumulators for the article stage:
#   all_articles_contents - one dict per visited article
#   scraped_index         - row indices already processed (lets the loop resume)
all_articles_contents, scraped_index = [], set()

In [None]:
# Candidate locators, tried in the listed order by the get_value class:
#   "contents" - XPath expressions selecting the <p> elements of an article body
#   "tags"     - class names of the element that wraps the tag links
element = dict(
    contents=[
        '//*[@id="entry-body"]/p[*]',
        '//*[@id="Main"]/div/div/div/div[2]/div[2]/div[1]/div[1]/div[3]/p[*]',
        '//*[@id="form1"]/div[2]/div[3]/div/div/div[2]/div/p[*]',
        '//*[@id="divNewsContent"]/div/div/p[*]',
    ],
    tags=["news_keyword", "new-tags", "tags", "tag"],
)


In [None]:
# Ad-hoc check: text of every paragraph matched by the "entry-body" XPath on the
# page currently open in the driver.
[
    paragraph.text
    for paragraph in helper.driver.find_elements(By.XPATH, '//*[@id="entry-body"]/p[*]')
]

In [None]:
class get_value():
    """Extract author, tags and body text from the page loaded in a WebDriver.

    Every lookup goes through ``self.driver`` (the driver passed to the
    constructor).  The locator lists come from the module-level ``element``
    dict, and the exception types treated as "element not found" come from
    the module-level ``helper.element_exception``.
    """

    def __init__(self, driver) -> None:
        self.driver = driver

    def get_author_from_news_info(self):
        """Return the text before the first ``</b>`` of the "news-info" element, ``<b>`` removed."""
        return (
            self.driver.find_element(By.CLASS_NAME, "news-info")
            .get_attribute("innerHTML")
            .replace("<b>", "")
            .split("</b>")[0]
            .strip()
        )

    def get_author_from_author(self):
        """Return the part of the "author" element's innerHTML that precedes the first ``<span``."""
        return (
            self.driver.find_element(By.CLASS_NAME, "author")
            .get_attribute("innerHTML")
            .split("<span")[0]
            .strip()
        )

    def get_author_from_xpath(self):
        """Return the text of a fixed ``<b>`` node with the "Bài viết: " prefix removed."""
        # Uses self.driver like the other getters, not the global helper's driver.
        return (
            self.driver
            .find_element(By.XPATH, """//*[@id="form1"]/div[2]/div[3]/div/div/div[2]/div/p[27]/b""")
            .text
            .replace("Bài viết: ", "")
        )

    def get_author_from_vtv8(self):
        """Return the stripped text of a fixed ``<b>`` node under ``#admWrapsite``."""
        # Uses self.driver like the other getters, not the global helper's driver.
        return (
            self.driver
            .find_element(By.XPATH, """//*[@id="admWrapsite"]/div/div[3]/div[3]/div[4]/div/div[1]/p/b""")
            .text
            .strip()
        )

    def get_tags(self):
        """Return the link texts inside the first tag container that can be located.

        Raises:
            NoSuchElementException: none of the class names in ``element["tags"]`` matched.
        """
        for path in element["tags"]:
            try:
                container = self.driver.find_element(By.CLASS_NAME, path)
                return [tag.text for tag in container.find_elements(By.TAG_NAME, "a")]
            except helper.element_exception:
                continue
        raise NoSuchElementException("Cant find tags")

    def get_contents(self):
        """Return the paragraph texts matched by every XPath in ``element["contents"]``.

        Matches of all XPaths are concatenated in list order.

        Raises:
            NoSuchElementException: no XPath produced any paragraph.
        """
        contents = []
        last_err = None
        for path in element["contents"]:
            try:
                # find_element raises when nothing matches, which records the reason
                # in last_err; find_elements alone would just return an empty list.
                self.driver.find_element(By.XPATH, path)
                contents.extend(
                    content_element.text
                    for content_element in self.driver.find_elements(By.XPATH, path)
                )
            except helper.element_exception as err:
                last_err = err
                continue
        if len(contents) == 0:
            raise NoSuchElementException(f"Cant find contents {last_err}")
        return contents

    def format(self, values: list):
        """Join ``values`` (converted with ``str``) using the ``-|||-`` separator."""
        return "-|||-".join(map(str, values))

    def auto_get_value(self, value_type, attempts=5, delay=1):
        """Fetch one kind of value, retrying while the page may still be rendering.

        Args:
            value_type: "author", "tags" or "contents".
            attempts: how many rounds to try before giving up (default 5).
            delay: seconds to wait between rounds (default 1); there is no
                wait after the final round.

        Returns:
            The author string, or the tags / contents joined by ``format``.
            On failure (or for an unknown ``value_type``) a string starting
            with ``<Error>`` that embeds the last error seen.
        """
        last_err = None

        if value_type == "author":
            author_getters = [
                self.get_author_from_author,
                self.get_author_from_news_info,
                self.get_author_from_xpath,
                self.get_author_from_vtv8,
            ]
            for attempt in range(attempts):
                for func in author_getters:
                    try:
                        return func()
                    except helper.element_exception as err:
                        last_err = err
                if attempt < attempts - 1:
                    time.sleep(delay)

        elif value_type in ("tags", "contents"):
            getter = self.get_tags if value_type == "tags" else self.get_contents
            for attempt in range(attempts):
                try:
                    return self.format(getter())
                except NoSuchElementException as err:
                    last_err = err
                if attempt < attempts - 1:
                    time.sleep(delay)

        return f"<Error>Cant Find Element: {last_err}"

In [None]:
def connectVPN():
    """Run ``nordvpn c`` and log what it printed.

    Lines of standard output longer than four characters (after stripping)
    are logged at INFO level as one message.  If the command exits with a
    non-zero status, the exit code and standard error are logged at WARNING
    level so a failed connection attempt does not go unnoticed.
    """
    result = subprocess.run(["nordvpn", "c"], capture_output=True)

    # errors="replace" keeps a stray undecodable byte from aborting the scrape.
    output = result.stdout.decode("utf8", errors="replace")
    logging.info(
        "\n".join(
            line for line in output.split("\n")
            if len(line.strip()) > 4
        )
    )

    if result.returncode != 0:
        logging.warning(
            "nordvpn c exited with code %s: %s",
            result.returncode,
            result.stderr.decode("utf8", errors="replace").strip(),
        )

In [None]:
# Create the driver for the article stage with normalCreateDriver(); unlike the
# forceCreateDriver calls earlier in the notebook, no vpn_provider is passed here.
# NOTE(review): presumably this opens a driver without VPN handling — confirm
# against ScraperHelper.
helper.normalCreateDriver()

In [None]:
# Visit every article link and record its contents, tags and author.
# Failures are stored as strings starting with "<Error>" instead of stopping the run.
for article_idx, article_link in index_link:

    # Already processed (e.g. before an interruption): re-running the cell resumes.
    if article_idx in scraped_index:
        continue

    custom_error_message = None
    # Every 100th index: reconnect the VPN, reopen the driver and checkpoint to disk.
    if article_idx % 100 == 0 and article_idx != 0:
        connectVPN()
        helper.reopenDriver(reconnect_vpn=True)
        pl.DataFrame(all_articles_contents
                     ).write_ipc("data/articles_content.arrow")

    logging.info(f"ARTICLE_{article_idx}")

    helper.forceGet(
        article_link,
        try_refresh_before_retry=True,
        error_message_in_page=custom_error_message
    )

    get_value_handler = get_value(helper.driver)

    tmp_info = {
        "contents": "<Error>",
        "tags": "<Error>",
        "author": "<Error>"
    }
    for k in tmp_info:
        if k == "contents" and article_link.startswith("https://vtv.vn/video-dac-sac"):
            # Video pages: take the text of the whole <body> as the contents.
            # Guarded like every other lookup so one broken page cannot abort the run.
            try:
                tmp_info[k] = helper.driver.find_element(By.XPATH, "./html/body").text
            except Exception as err:
                tmp_info[k] = f"<Error>Cant get page body: {err}"
        else:
            tmp_info[k] = get_value_handler.auto_get_value(
                value_type=k
            )

        if tmp_info[k].startswith("<Error>"):
            logging.error(f"Cant scrape {k}")

    tmp_info["link"] = article_link
    whole_website = '<Contents Not Blank>'

    # When the contents could not be extracted, keep the full page text for later inspection.
    if tmp_info["contents"].startswith("<Error>"):
        logging.warning("Cant scrape contents")
        try:
            whole_website = helper.driver.find_element(
                By.XPATH, "./html/body").text
            logging.info(whole_website)
        except Exception as e:
            whole_website = "<Cant get page body either>"
            logging.error("Cant get page body either:")
            logging.error(e)

    tmp_info["whole_website"] = whole_website
    all_articles_contents.append(tmp_info)
    scraped_index.add(article_idx)

# Final save: the in-loop checkpoint only fires at multiples of 100, so without this
# the articles scraped after the last checkpoint would exist only in memory.
# Skipped when nothing was collected, to avoid overwriting an existing file with an empty one.
if all_articles_contents:
    pl.DataFrame(all_articles_contents).write_ipc("data/articles_content.arrow")
    


In [None]:
# Build a polars DataFrame from the records collected by the article loop
# (nothing is written to disk here).
test_out = pl.DataFrame(all_articles_contents)