In [2]:
# Scraper input: paste the page URLs below, one per line.
# Only lines that start with "http" are kept when wp_urls is built from this
# text, so the name lines and blank lines are ignored and serve only as labels.
# All URLs must be in the same language; declare it in wp_urls_lang ('en' or 'he').
wp_urls_lang = 'en'
wp_urls_text = '''
Eldad Hadani

https://www.bh.org.il/eldad-hadani/

Father Mahari

https://www.bh.org.il/father-mahari/

Joseph Halévy

https://www.bh.org.il/joseph-halevy/

Zimne Barhana

https://www.bh.org.il/46299-2/
'''

In [3]:
# Trim every pasted line and keep only the ones that look like URLs;
# label lines and blank lines are dropped.
wp_urls = list(
    filter(
        lambda candidate: candidate.startswith('http'),
        map(str.strip, wp_urls_text.split("\n")),
    )
)
wp_urls

['https://www.bh.org.il/eldad-hadani/',
 'https://www.bh.org.il/father-mahari/',
 'https://www.bh.org.il/joseph-halevy/',
 'https://www.bh.org.il/46299-2/']

In [6]:
import requests
from pyquery import PyQuery as pq
from dataflows import Flow, printer, dump_to_path

def get_items():
    """Scrape every page listed in ``wp_urls`` and yield one row per page.

    Reads the module-level ``wp_urls`` (list of page URLs) and
    ``wp_urls_lang`` ('en' or anything else, which is treated as Hebrew).

    Yields:
        dict with the keys ``url``, ``title``, ``description`` and
        ``image_url`` (empty string when the page has no usable image).

    Raises:
        requests.HTTPError: the page responded with an error status.
        AssertionError: the page language, title or description does not
            pass the sanity checks below.
    """
    # Local import so this cell does not depend on imports made by other cells.
    from urllib.parse import urljoin

    # Resolve the expected language once, up front. Writing the conditional
    # inline in the assert (`assert a == 'en-US' if c else 'he-IL'`) parses as
    # `(a == 'en-US') if c else 'he-IL'`, which is always truthy in the
    # non-English case and therefore never checks anything.
    expected_lang = 'en-US' if wp_urls_lang == 'en' else 'he-IL'

    for wp_url in wp_urls:
        # A timeout keeps a stalled server from hanging the notebook, and
        # raise_for_status stops us from scraping an error page.
        res = requests.get(wp_url, timeout=30)
        res.raise_for_status()
        doc = pq(res.content)

        page_lang = doc.attr('lang')
        assert page_lang == expected_lang, 'invalid lang: {}'.format(page_lang)

        # Prefer the page heading; fall back to the side-menu parent entry.
        title = doc('h2.title').text()
        if not title:
            title = doc('.side-menu-wrapper .parent').text()
        assert title and len(title) > 5, 'invalid title: "{}"'.format(title)

        # Keep only the text up to the 4th period as a short description.
        description = '.'.join(doc('div.content-column p').text().split('.')[:4])
        assert len(description) > 30, 'invalid description: {}'.format(description)

        # First image in the content column, if any. urljoin turns a
        # protocol-relative ("//host/path") or relative src into an absolute
        # URL and leaves an already-absolute src untouched.
        imgs = doc('div.content-column img')
        src = pq(imgs[0]).attr('src') if imgs else None
        image_url = urljoin(wp_url, src) if src else ''

        yield {'url': wp_url, 'title': title, 'description': description, 'image_url': image_url}

# Scrape every page, save the rows under .data/, and render them as an HTML table.
scrape_steps = (
    get_items(),
    dump_to_path('.data/ethiopia-personalities'),
    printer(tablefmt='html', num_rows=9999),
)
Flow(*scrape_steps).process()

#,url (string),title (string),description (string),image_url (string)
1,https://www.bh.org.il/eldad-hadani/,Eldad Hadani,Hadani was a Jewish merchant and traveler in the 9th century. There is question as to his being an h ...,
2,https://www.bh.org.il/father-mahari/,Father Mahari,"Mahari Sothal , or Mahari Sothal (after his father), better known as Father Mahari , is the name of ...",
3,https://www.bh.org.il/joseph-halevy/,Joseph Halévy,"Professor Joseph Halévy; 1827 – 1917 A Frenchman of Ottoman descent, he served as an educator and te ...",https://www.bh.org.il/wp-content/uploads/Joseph_Halevy-1.jpg
4,https://www.bh.org.il/46299-2/,Zimne Barhana,He was one of the pioneers of the struggle to bring the “Beta Israel” community to the State of Isra ...,https://www.bh.org.il/wp-content/uploads/זימנה_ברהני_באתיופיה_1992_אוסף_הרב_מנחם_ולדמן-1.jpg


(<datapackage.package.Package at 0x7f9dcb584e50>,
 {'count_of_rows': 4,
  'bytes': 4333,
  'hash': '4a2056aff508a52276c0e6c4ef708301',
  'dataset_name': None})

In [8]:
import json
from urllib.parse import quote
from dataflows import Flow, load

# JSON fragment for one item scraped from a Hebrew page: title, description and
# page URL go into the "He" / item_url_he slots, while the English slots stay empty.
# The <<name>> markers are placeholders that print_manual_items_json replaces with
# JSON-encoded values; <<image_urls>> sits inside [] so it becomes a list.
# The fragment ends with "}, " - presumably so that printed items can be pasted
# one after another into a JSON array (TODO confirm with the consumer).
ITEM_JSON_HE_TEMPLATE = '''    {
      "UnitType": 8,
      "UnitTypeDesc": "Personality",
      "Header": {
        "He": <<title>>,
        "En": ""
      },
      "UnitText1": {
        "He": <<description>>,
        "En": ""
      },
      "Slug": {
        "He": "",
        "En": ""
      },
      "video_url": null,
      "main_image_url": <<image_url>>,
      "preview_image_url": <<image_url>>,
      "image_urls": [<<image_urls>>],
      "item_url_he": <<url>>,
      "item_url_en": ""
    }, '''

# JSON fragment for one item scraped from an English page: title, description and
# page URL go into the "En" / item_url_en slots, while the Hebrew slots stay empty.
# The <<name>> markers are placeholders that print_manual_items_json replaces with
# JSON-encoded values; <<image_urls>> sits inside [] so it becomes a list.
# The fragment ends with "}, " - presumably so that printed items can be pasted
# one after another into a JSON array (TODO confirm with the consumer).
ITEM_JSON_EN_TEMPLATE = '''    {
      "UnitType": 8,
      "UnitTypeDesc": "Personality",
      "Header": {
        "He": "",
        "En": <<title>>
      },
      "UnitText1": {
        "He": "",
        "En": <<description>>
      },
      "Slug": {
        "He": "",
        "En": ""
      },
      "video_url": null,
      "main_image_url": <<image_url>>,
      "preview_image_url": <<image_url>>,
      "image_urls": [<<image_urls>>],
      "item_url_he": "",
      "item_url_en": <<url>>
    }, '''

def print_manual_items_json(row):
    """Print one scraped row as a JSON fragment for manual copy-paste.

    The template is chosen by the module-level ``wp_urls_lang``: the English
    template for 'en', the Hebrew one otherwise.

    Args:
        row: mapping with the keys ``title``, ``description``, ``image_url``
            and ``url``. A falsy ``image_url`` (empty string or None) yields
            ``""`` for the single-image fields and an empty ``image_urls`` list.

    Returns:
        None. The fragment is written to stdout and the row is not modified.
    """
    # Local import so this cell does not depend on imports made by other cells.
    import re

    item_json_template = ITEM_JSON_EN_TEMPLATE if wp_urls_lang == 'en' else ITEM_JSON_HE_TEMPLATE

    if row['image_url']:
        # Percent-encode non-ASCII path characters but keep ':' and '/' intact.
        # Encoding everything and then undoing only 'https%3A' leaves 'http:'
        # URLs and 'host:port' authorities mangled.
        image_url_json = json.dumps(quote(row['image_url'], safe='/:'))
        image_urls_json = image_url_json
    else:
        image_url_json = '""'
        image_urls_json = ''

    substitutions = {
        'title': json.dumps(row['title'], ensure_ascii=False),
        'description': json.dumps(row['description'], ensure_ascii=False),
        'image_url': image_url_json,
        'image_urls': image_urls_json,
        'url': json.dumps(row['url']),
    }

    # Substitute in a single pass so that a placeholder-looking substring inside
    # an inserted value (e.g. a description containing "<<url>>") is never
    # substituted a second time. A callable replacement also avoids backslash
    # processing of the JSON-escaped values. Unknown markers are left as-is.
    item_json = re.sub(
        r'<<(\w+)>>',
        lambda match: substitutions.get(match.group(1), match.group(0)),
        item_json_template,
    )
    print(item_json)

# Re-read the saved datapackage and print a JSON fragment for every row.
saved_package = load('.data/ethiopia-personalities/datapackage.json')
Flow(saved_package, print_manual_items_json).process()

    {
      "UnitType": 8,
      "UnitTypeDesc": "Personality",
      "Header": {
        "He": "",
        "En": "Eldad Hadani"
      },
      "UnitText1": {
        "He": "",
        "En": "Hadani was a Jewish merchant and traveler in the 9th century. There is question as to his being an historical personality, that he wrote in Hebrew (with many Arabic influences) his book on the Laws of Kosher animal slaughter, and that he wrote an epistle which is titled in his name. His place of origin is unknown, but in his writings he wrote that he is a citizen of an “Independent Jewish State” in East Africa. He probably meant modern-day Ethiopia"
      },
      "Slug": {
        "He": "",
        "En": ""
      },
      "video_url": null,
      "main_image_url": "",
      "preview_image_url": "",
      "image_urls": [],
      "item_url_he": "",
      "item_url_en": "https://www.bh.org.il/eldad-hadani/"
    }, 
    {
      "UnitType": 8,
      "UnitTypeDesc": "Personality",
      "Header": {
   

(<datapackage.package.Package at 0x7f9dcb1c1cd0>, {})