Import Libraries

In [15]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

Global Variables

In [None]:
# Source site: the category listing to crawl and the origin that site-relative
# links and image paths are resolved against.
category_url = "https://phoiviet.com/news/83/hen-suyen.html"
base_url = "https://phoiviet.com"

# Target WordPress REST API endpoints (posts and media library).
wp_url_posts = "https://phatdat.h2tdev.io.vn/wp-json/wp/v2/posts"
wp_url_media = "https://phatdat.h2tdev.io.vn/wp-json/wp/v2/media"

# Credentials are read from the environment so that secrets are not stored in
# the notebook. The fallbacks equal the previously hard-coded values, so the
# notebook behaves exactly as before when the variables are unset.
username = os.environ.get("WP_USERNAME", "admin")
password = os.environ.get("WP_PASSWORD", "")
auth = HTTPBasicAuth(username, password)

# Shared state filled in by the later cells.
links = []      # absolute URLs of the posts found on the category pages
all_posts = []  # one dict per crawled post: title, content, feature_image_url
media_ids = []  # WordPress media ID (or None) per post, parallel to all_posts

Get Total Pages

In [17]:
# Fetch the first category page. The timeout keeps a stalled connection from
# hanging the notebook indefinitely (requests waits forever by default).
response = requests.get(category_url, timeout=30)
if response.status_code != 200:
    # Surface the problem instead of silently falling back to a single page.
    print(f"Warning: category page returned HTTP {response.status_code}")
soup = BeautifulSoup(response.content, 'html.parser')

# Page numbers are read from the `title` attribute of the pagination links.
# str.isdecimal() only accepts characters int() can parse, whereas
# str.isdigit() also accepts e.g. superscript digits, which make int() raise.
pages = soup.select('.pagination a[title]')
page_numbers = [int(a['title']) for a in pages if a['title'].isdecimal()]
# Without any numeric pagination link, assume the category has one page.
max_page = max(page_numbers) if page_numbers else 1

print(f"Number of pages found: {max_page}")

Number of pages found: 2


Get All Post Links

In [18]:
# Walk every listing page and collect the absolute URL of each post once.
for page in range(1, max_page + 1):
    # The timeout keeps a stalled connection from hanging the notebook.
    res = requests.get(f"{category_url}?page={page}", timeout=30)
    if res.status_code != 200:
        # Report the skipped page so a short result list can be explained.
        print(f"Skipping page {page}: HTTP {res.status_code}")
        continue

    soup = BeautifulSoup(res.content, 'html.parser')
    for a in soup.select('.blog-head h3 a'):
        href = a.get('href')
        # Only site-relative hrefs under /new/ are kept.
        if href and href.startswith('/new/'):
            full_link = base_url + href
            # The membership test keeps `links` duplicate-free, which also
            # makes re-running this cell harmless.
            if full_link not in links:
                links.append(full_link)

print(f"Total posts found: {len(links)}")

Total posts found: 18


Crawl Post

In [19]:
# Start from an empty list so that re-running this cell does not append the
# same posts a second time (which would later publish duplicates).
all_posts.clear()

for idx, source_url in enumerate(links, start=1):
    print(f"\nFetching post {idx}: {source_url}")
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(source_url, timeout=30)
    if response.status_code != 200:
        # Do not parse an error page as if it were an article.
        print(f"Skipping post {idx}: HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # The post title is taken from the page's <title> element.
    title_element = soup.select_one('title')
    title = title_element.get_text(strip=True) if title_element else f"No title {idx}"

    # NOTE(review): this positional selector depends on the exact page layout
    # (13th child of .main-wrapper); confirm it still matches if the site changes.
    content_element = soup.select_one('body > div.main-wrapper > div:nth-child(13) > div > div')
    content = ""
    if content_element:
        # Make site-relative image paths absolute so they still resolve once
        # the HTML is published on another domain.
        # NOTE(review): src values that are relative without a leading "/" are
        # left untouched — confirm the source site never emits them.
        for img in content_element.find_all('img'):
            src = img.get('src')
            if src and src.startswith('/'):
                img['src'] = urljoin(base_url, src)
        content = content_element.decode_contents()
    else:
        print(f"Warning: post {idx}: content container not found")

    # Featured image: the first <img class="max-image"> on the page, if any.
    feature_image_url = ""
    feature_img_tag = soup.select_one('img.max-image')
    if feature_img_tag:
        img_src = feature_img_tag.get('src')
        if img_src:
            feature_image_url = urljoin(base_url, img_src)

    all_posts.append({
        "title": title,
        "content": content,
        "feature_image_url": feature_image_url
    })

    print(f"Finished fetching post {idx}: {title}")
    print(f"Featured image: {feature_image_url[:60]}...")


Fetching post 1: https://phoiviet.com/new/143/hen-suyen.html
Finished fetching post 1: Hen suyễn
Featured image: https://phoiviet.com/tmp/cache/images/_thumbs/870x405//uploa...

Fetching post 2: https://phoiviet.com/new/655/lua-chon-dung-thuoc-de-dieu-tri-hen-suyen-ca-the.html
Finished fetching post 2: Lựa chọn đúng thuốc để điều trị hen suyễn cá thể
Featured image: https://phoiviet.com/tmp/cache/images/_thumbs/870x405//uploa...

Fetching post 3: https://phoiviet.com/new/565/hen-tre-em.html
Finished fetching post 3: Hen suyễn trẻ em
Featured image: https://phoiviet.com/tmp/cache/images/_thumbs/870x405//uploa...

Fetching post 4: https://phoiviet.com/new/643/lam-sao-de-nhan-biet-hen-suyen-tre-em.html
Finished fetching post 4: Làm sao để nhận biết hen suyễn trẻ em?
Featured image: https://phoiviet.com/tmp/cache/images/_thumbs/870x405//uploa...

Fetching post 5: https://phoiviet.com/new/505/viem-mui-di-ung.html
Finished fetching post 5: Viêm mũi dị ứng
Featured image: https://phoiviet.co

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Finished fetching post 18: Lựa chọn máy phun khí dung phù hợp
Featured image: https://phoiviet.com/tmp/cache/images/_thumbs/870x405//uploa...


Upload Featured Images

In [20]:
# Start from an empty list so that re-running this cell keeps media_ids the
# same length as all_posts (it holds one entry per post, in the same order).
media_ids.clear()

for idx, post in enumerate(all_posts, start=1):
    feature_image_url = post.get("feature_image_url")
    media_id = None

    if feature_image_url:
        try:
            # Download the image; fail fast on a stalled connection or an HTTP
            # error so that an error page is never uploaded as an image.
            image_response = requests.get(feature_image_url, timeout=30)
            image_response.raise_for_status()
            image_data = image_response.content
            image_name = feature_image_url.split("/")[-1]

            # Forward the source server's media type when it is an image type;
            # otherwise keep the previous default of image/jpeg.
            content_type = image_response.headers.get("Content-Type", "").split(";")[0].strip()
            if not content_type.startswith("image/"):
                content_type = "image/jpeg"

            headers_media = {
                "Content-Disposition": f"attachment; filename={image_name}",
                "Content-Type": content_type
            }
            media_response = requests.post(
                wp_url_media, auth=auth, headers=headers_media, data=image_data, timeout=60
            )
            # A 201 status is treated as a successful upload.
            if media_response.status_code == 201:
                media_id = media_response.json().get("id")
                print(f"Post {idx}: Featured image uploaded successfully! Media ID {media_id}")
            else:
                print(f"Post {idx}: Failed to upload featured image", media_response.text)
        except Exception as e:
            # Best effort: a failed image must not stop the remaining uploads;
            # the post is simply recorded without a featured image.
            print(f"Post {idx}: Error uploading featured image:", e)

    # Always append (possibly None) to stay index-aligned with all_posts.
    media_ids.append(media_id)

Post 1: Featured image uploaded successfully! Media ID 207
Post 2: Featured image uploaded successfully! Media ID 208
Post 3: Featured image uploaded successfully! Media ID 209
Post 4: Featured image uploaded successfully! Media ID 210
Post 5: Featured image uploaded successfully! Media ID 211
Post 6: Featured image uploaded successfully! Media ID 212
Post 7: Featured image uploaded successfully! Media ID 213
Post 8: Featured image uploaded successfully! Media ID 214
Post 9: Featured image uploaded successfully! Media ID 215
Post 10: Featured image uploaded successfully! Media ID 216
Post 11: Featured image uploaded successfully! Media ID 217
Post 12: Featured image uploaded successfully! Media ID 218
Post 13: Featured image uploaded successfully! Media ID 219
Post 14: Featured image uploaded successfully! Media ID 220
Post 15: Featured image uploaded successfully! Media ID 221
Post 16: Featured image uploaded successfully! Media ID 222
Post 17: Featured image uploaded successfully! Me

Add Post

In [21]:
# NOTE: every run of this cell POSTs all posts again; it does not check whether
# a post already exists on the WordPress site.
for idx, post in enumerate(all_posts, start=1):
    title = post.get("title")
    content = post.get("content")
    # media_ids is parallel to all_posts; tolerate it being shorter (e.g. the
    # upload cell was not run) instead of raising IndexError.
    media_id = media_ids[idx - 1] if idx <= len(media_ids) else None

    if not title or not content:
        print(f"Skipping post {idx}: missing title or content")
        continue

    post_data = {
        "title": title,
        "content": content,
        "status": "publish"
    }
    # Only attach a featured image when its upload produced a media ID.
    if media_id:
        post_data["featured_media"] = media_id

    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        post_response = requests.post(wp_url_posts, auth=auth, json=post_data, timeout=60)
        # A 201 status is treated as successful creation.
        if post_response.status_code == 201:
            print(f"Post {idx} created successfully!")
            created_post = post_response.json()  # parse the body once
            print("Post ID:", created_post.get("id"))
            print("Link:", created_post.get("link"))
        else:
            print(f"Post {idx}: Failed to create post")
            print("Status Code:", post_response.status_code)
            print("Response:", post_response.text[:200])
    except Exception as e:
        # Best effort: report the failure and continue with the next post.
        print(f"Post {idx}: Error creating post:", e)

Post 1 created successfully!
Post ID: 225
Link: https://phatdat.h2tdev.io.vn/hen-suyen/
Post 2 created successfully!
Post ID: 226
Link: https://phatdat.h2tdev.io.vn/lua-chon-dung-thuoc-de-dieu-tri-hen-suyen-ca-the/
Post 3 created successfully!
Post ID: 227
Link: https://phatdat.h2tdev.io.vn/hen-suyen-tre-em/
Post 4 created successfully!
Post ID: 228
Link: https://phatdat.h2tdev.io.vn/lam-sao-de-nhan-biet-hen-suyen-tre-em/
Post 5 created successfully!
Post ID: 229
Link: https://phatdat.h2tdev.io.vn/viem-mui-di-ung/
Post 6 created successfully!
Post ID: 230
Link: https://phatdat.h2tdev.io.vn/do-feno-de-lam-gi/
Post 7 created successfully!
Post ID: 231
Link: https://phatdat.h2tdev.io.vn/dieu-tri-bang-thuoc-sinh-hoc-giai-phap-moi-trong-dieu-tri-hen-nang/
Post 8 created successfully!
Post ID: 232
Link: https://phatdat.h2tdev.io.vn/lam-the-nao-de-chan-doan-hen-chinh-xac/
Post 9 created successfully!
Post ID: 233
Link: https://phatdat.h2tdev.io.vn/ca-the-hoa-dieu-tri-hen-suyen/
Post 10 create