In [None]:
import base64
import os
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, Comment

In [None]:
# Wayback Machine (web.archive.org) snapshots of the site's category listing pages.
# Each URL is fetched once to collect the links of the posts listed on it.
categories = [
    "https://web.archive.org/web/20241212103517/https://nhipsongthoidai.vn/category/marketing/",
    "https://web.archive.org/web/20241107135336/https://nhipsongthoidai.vn/category/marketing/page/2/",
    "https://web.archive.org/web/20241016051900/https://nhipsongthoidai.vn/category/marketing/page/3/",
    "https://web.archive.org/web/20240913224204/https://nhipsongthoidai.vn/category/tong-hop/",
    "https://web.archive.org/web/20241107130529/https://nhipsongthoidai.vn/category/tong-hop/page/2/",
    "https://web.archive.org/web/20241107134744/https://nhipsongthoidai.vn/category/tong-hop/page/3/",
    "https://web.archive.org/web/20241107142016/https://nhipsongthoidai.vn/category/tong-hop/page/4/",
    "https://web.archive.org/web/20241107123023/https://nhipsongthoidai.vn/category/tong-hop/page/5/",
    "https://web.archive.org/web/20230926224332/https://nhipsongthoidai.vn/category/ban-can-biet/",
    "https://web.archive.org/web/20230201120859/https://nhipsongthoidai.vn/category/ban-can-biet/page/2/",
    "https://web.archive.org/web/20230201123159/https://nhipsongthoidai.vn/category/ban-can-biet/page/3/",
    "https://web.archive.org/web/20230201125530/https://nhipsongthoidai.vn/category/ban-can-biet/page/4/",
    "https://web.archive.org/web/20230201131656/https://nhipsongthoidai.vn/category/ban-can-biet/page/5/",
    "https://web.archive.org/web/20230321020647/https://nhipsongthoidai.vn/category/khai-thac-su-dung/"
]

In [None]:
# Gather the post links from every category listing page.
# Each request gets a timeout so a stalled snapshot cannot hang the notebook, and a
# failed page is reported and skipped instead of being parsed into zero links.
post_links = []
for category_url in categories:
    try:
        listing_response = requests.get(category_url, timeout=60)
        listing_response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not load category page {category_url}: {exc}")
        continue

    listing_soup = BeautifulSoup(listing_response.text, 'html.parser')
    post_links.extend(
        anchor['href']
        for anchor in listing_soup.select('div.post-item a.plain')
        if anchor.has_attr('href')  # an anchor without a target would raise KeyError
    )

In [None]:
# Inspect the collected post links.
post_links

In [None]:
# Download and parse the first collected post; the cells below explore its structure.
url = post_links[0]

# A timeout keeps the notebook from blocking forever on an unresponsive server.
response = requests.get(url, timeout=60)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Display the parsed page to explore its markup.
soup

In [None]:
# Look up the Open Graph title <meta> tag (None when the page has none).
og_title_tag = soup.find("meta", attrs={"property": "og:title"})

In [None]:
# Show the tag that was found.
og_title_tag

In [None]:
# Take the title from the tag's content attribute; stays None when the tag is absent.
og_title = None
if og_title_tag:
    og_title = og_title_tag["content"]

In [None]:
# Show the extracted title.
og_title

In [None]:
# Look up the Open Graph image <meta> tag and display it.
featured_img = soup.find("meta", attrs={"property": "og:image"})
featured_img

In [None]:
# Show the image URL stored in the tag's content attribute.
# NOTE(review): this fails if the og:image tag was not found (featured_img is then None).
featured_img['content']

In [None]:
# Select the element holding the article body and display it.
content_div = soup.select_one("div.entry-content.single-page")
content_div

In [None]:
# Strip the "code-block" and "blog-share" containers out of the article body.
unwanted_classes = ["code-block", "blog-share"]
for div in content_div.find_all("div", class_=unwanted_classes):
    div.decompose()

In [None]:
# Check the body after removing the code-block / blog-share containers.
content_div

In [None]:
# Drop every paragraph that contains an embedded <script>.
for p in content_div.find_all("p"):
    if p.find("script") is not None:
        p.decompose()

In [None]:
# Check the body after removing paragraphs that contained scripts.
content_div

In [None]:
# Remove HTML comment nodes from the article body.
comment_nodes = content_div.find_all(string=lambda node: isinstance(node, Comment))
for comment in comment_nodes:
    comment.extract()

In [None]:
# Check the body after removing HTML comments.
content_div

In [None]:
# Serialize the children of the body element (without the wrapping <div> itself).
content_inner_html = content_div.decode_contents()

In [None]:
# Show the serialized inner HTML.
content_inner_html

In [None]:
def scrape_post_data(url, timeout=60):
    """Download one post page and extract the fields needed to re-create the post.

    Args:
        url: Address of the post page to fetch.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        A dict with the keys:
            ``title``        - text of ``h1.entry-title``, else the og:title content.
            ``featured_img`` - og:image content, or ``None`` when absent.
            ``content_html`` - cleaned inner HTML of the article body.
            ``url``          - og:url content, or the requested ``url`` when absent.

    Raises:
        requests.RequestException: on a network failure or a non-2xx response.
        ValueError: when the page has no title or no article body element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Prefer the visible heading; fall back to the Open Graph title.
    content_title = soup.select_one("h1.entry-title")
    og_title_tag = soup.find("meta", property="og:title")
    if content_title:
        title = content_title.get_text(strip=True)
    elif og_title_tag and og_title_tag.get("content"):
        title = og_title_tag["content"]
    else:
        raise ValueError(f"No title found at {url}")

    # .get() tolerates a meta tag that exists but carries no content attribute.
    og_url_tag = soup.find("meta", property="og:url")
    source_url = (og_url_tag.get("content") if og_url_tag else None) or url

    og_image_tag = soup.find("meta", property="og:image")
    featured_img = og_image_tag.get("content") if og_image_tag else None

    content_div = soup.select_one("div.entry-content.single-page")
    if content_div is None:
        raise ValueError(f"No article body found at {url}")

    # Remove the "code-block" and "blog-share" containers.
    for div in content_div.find_all("div", class_=["code-block", "blog-share"]):
        div.decompose()

    # Remove paragraphs that contain an embedded <script>.
    for p in content_div.find_all("p"):
        if p.find("script"):
            p.decompose()

    # Remove HTML comments.
    for comment in content_div.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    content_html = content_div.decode_contents()
    # Flatten to a single line and turn non-breaking spaces into plain spaces.
    clean_html = content_html.replace("\n", "")
    clean_html = clean_html.replace("\xa0", " ")

    return {
        "title": title,
        "featured_img": featured_img,
        "content_html": clean_html,
        "url": source_url
    }

In [None]:
def safe_scrape(url, delay=10):
    """Scrape one post after a pause, turning any failure into an error record.

    Args:
        url: Address of the post page to scrape.
        delay: Seconds to sleep before the request (throttles consecutive calls).

    Returns:
        The dict produced by ``scrape_post_data`` on success, otherwise
        ``{"error": <message>, "url": url}``.
    """
    try:
        time.sleep(delay)
        return scrape_post_data(url)
    except Exception as e:
        # Best-effort by design: one bad page must not abort the whole batch.
        return {"error": str(e), "url": url}

In [None]:
# Scrape every collected post link; failures come back as error records.
test = [safe_scrape(link) for link in post_links]

In [None]:
# Inspect the scrape results.
test

In [None]:
# WordPress REST API endpoints, all derived from the site root.
WP_URL = "https://nhipsongthoidai.vn"
WP_URL_POST = f"{WP_URL}/wp-json/wp/v2/posts"
WP_URL_MEDIA = f"{WP_URL}/wp-json/wp/v2/media"

# Credentials come from the environment so a real secret never has to be saved in the
# notebook. The fallbacks are the values that were previously hard-coded here.
USERNAME = os.environ.get("WP_USERNAME", "nstt")
PASSWORD = os.environ.get("WP_PASSWORD", "")

In [None]:
# Build the HTTP Basic Authorization header: base64 of "username:password".
credentials = USERNAME + ":" + PASSWORD
token = base64.b64encode(credentials.encode())
headers = {"Authorization": "Basic " + token.decode("utf-8")}

In [None]:
def upload_image(image_url, for_content=False):
    """Download an image and upload it to the WordPress media endpoint.

    Args:
        image_url: Address of the image to fetch. ``None`` or an empty string means
            "no image": the function returns ``None`` without any network call.
        for_content: When true, return the uploaded file's ``source_url``;
            otherwise return the media ``id``.

    Returns:
        The media ``source_url`` or ``id`` on success, ``None`` on any failure.
    """
    if not image_url:
        return None

    print(f"Uploading image: {image_url}")
    try:
        download = requests.get(image_url, timeout=120)
        # Without this check an HTML error page would be uploaded as if it were the image.
        download.raise_for_status()

        # Use only the URL path so a query string never ends up in the filename.
        filename = urlparse(image_url).path.split("/")[-1] or "image"
        files = {'file': (filename, download.content)}
        response = requests.post(WP_URL_MEDIA, headers=headers, files=files, timeout=120)
        if response.status_code == 201:
            json_data = response.json()
            return json_data['source_url'] if for_content else json_data['id']
        # Report a rejected upload instead of failing silently.
        print(f"Error uploading image {image_url}: {response.status_code}, {response.text}")
    except Exception as e:
        print(f"Error uploading image {image_url}: {e}")
    # Back off after a failed attempt before the caller continues.
    time.sleep(10)
    return None


In [None]:
def process_content_html(content_html, delay=10):
    """Re-host the images of an article body and un-wrap archived links.

    Every ``<img>`` is uploaded through ``upload_image`` and pointed at the returned
    URL. An image that cannot be uploaded is removed, together with its enclosing
    ``<figure>`` when it has one. Links that start with
    ``https://web.archive.org/web/`` are rewritten to the URL embedded after the
    snapshot timestamp.

    Args:
        content_html: HTML fragment of the article body.
        delay: Seconds to sleep after each upload attempt.

    Returns:
        The rewritten HTML as a string.
    """
    soup = BeautifulSoup(content_html, "html.parser")

    for img_tag in soup.find_all("img"):
        # Removing a <figure> also destroys any other <img> inside it; those tags are
        # still in this list and must not be touched again.
        if getattr(img_tag, "decomposed", False):
            continue

        # Lazy-loaded images keep the real address in data-src.
        img_url = img_tag.get("data-src") or img_tag.get("src")
        if not img_url:
            continue  # nothing to upload, so no need to pause either

        wp_url = upload_image(img_url, for_content=True)
        if wp_url:
            img_tag["src"] = wp_url
            # These attributes still reference the old location.
            for attr in ["data-src", "data-srcset", "srcset"]:
                img_tag.attrs.pop(attr, None)
        else:
            figure_tag = img_tag.find_parent("figure")
            if figure_tag:
                figure_tag.decompose()
            else:
                img_tag.decompose()
        time.sleep(delay)

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.startswith("https://web.archive.org/web/"):
            # ["https:", "", "web.archive.org", "web", <timestamp>, <original URL>]
            parts = href.split("/", 5)
            if len(parts) >= 6:
                real_url = parts[5]
                if not real_url.startswith("http"):
                    real_url = "https://" + real_url
                a_tag["href"] = real_url

    return str(soup)

In [None]:
def extract_slug_from_url(url):
    """Return the last path segment of a post URL, for use as its slug.

    When the URL wraps another absolute URL (as Wayback Machine addresses do), the
    innermost ``http://`` or ``https://`` URL is used. Query strings and fragments
    are ignored.

    Args:
        url: Post URL, optionally wrapped in an archive URL.

    Returns:
        The final non-empty path segment, or ``""`` when the URL has no path.
    """
    # Locate the last embedded absolute URL, whichever scheme it uses. Looking only
    # for "https://" would mangle plain-http addresses ("http://host/" -> slug "host").
    scheme_pos = max(url.rfind("https://"), url.rfind("http://"))
    real_url = url[scheme_pos:] if scheme_pos >= 0 else "https://" + url

    path = urlparse(real_url).path
    return path.strip("/").split("/")[-1]

In [None]:
def create_post(title, content, image_url, category_id, source_url=None):
    """Publish a post through the WordPress REST API.

    Args:
        title: Post title.
        content: Post body HTML.
        image_url: Address of the featured image, or a falsy value for none.
        category_id: ID of the category to assign.
        source_url: Original post URL; its last path segment becomes the slug.

    Returns:
        ``True`` when the API answers 201, ``False`` on any other status or when
        the request itself fails.
    """
    # Skip the upload round-trip entirely when the post has no featured image.
    image_id = upload_image(image_url, for_content=False) if image_url else None

    slug = extract_slug_from_url(source_url) if source_url else None

    post_data = {
        "title": title,
        "content": content,
        "status": "publish",
        "categories": [category_id]
    }
    if image_id:
        post_data["featured_media"] = image_id
    if slug:
        post_data["slug"] = slug

    try:
        response = requests.post(WP_URL_POST, headers=headers, json=post_data, timeout=120)
    except requests.RequestException as e:
        # A network failure on one post should be reported, not abort the whole run.
        print(f"Failed to create post '{title}': {e}\n")
        return False

    if response.status_code == 201:
        print(f"Post '{title}' created successfully with slug '{slug}'!\n")
        return True
    else:
        print(f"Failed to create post '{title}': {response.status_code}, {response.text}\n")
        return False

In [None]:
# Publish every successfully scraped post, pausing between posts.
for item in test:
    # Check for the key itself: an exception with an empty message is stored as
    # {"error": ""}, which is falsy and would otherwise be handled as a scraped post.
    if "error" in item:
        print(f"Skipped {item['url']} due to error: {item['error']}")
        continue

    content = process_content_html(item["content_html"])
    create_post(
        title=item["title"],
        content=content,
        image_url=item["featured_img"],
        category_id=2,
        source_url=item["url"]
    )
    time.sleep(10)