In [15]:
import os
import re
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [16]:
def scrape_page(book_id, page, timeout=30):
    '''Scrape the paragraphs of one page of a book on https://shamela.ws.

    Args:
        book_id: Shamela id of the book (used in the URL path).
        page: Page number within the book (used in the URL path).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block its worker
            thread forever.

    Returns:
        A list with the text of every element matching ``.nass p`` on the
        page. The list is empty when the request fails, when the server
        answers with an error status, or when nothing matches the selector.
    '''
    url = f'https://shamela.ws/book/{book_id}/{page}'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Best effort: a page that cannot be fetched is reported and skipped
        # so one network hiccup does not abort the whole book.
        print(f"Request failed for {url}: {error}")
        return []

    # An error response (e.g. a page number past the end of the book) carries
    # no book text, so it is treated the same as an empty page.
    if not response.ok:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    return [paragraph.get_text() for paragraph in soup.select('.nass p')]

In [17]:
# Scrape results, filled in by a later cell: book title -> list of sentences.
ar_dataset = dict()

# Books to download, keyed by a human-readable title; the value is the id
# that appears in the book's shamela.ws URL.
books = {
    "التاج في اخلاق الملوك": 22623,
    "كليلة ودمنة": 26537,
    "البخلاء": 10501,
    "الرسائل للجاحظ": 10428,
    "تهافت الفلاسفة": 11055,
    "أصناف المغرورين": 9198,
    "بن حزم": 1038,
    "تشريح القانون": 802,
    "الحيوان": 23775,
}
# Entries below are switched off; move one into the dict above to scrape it.
# "فضائح الباطنية" : 6554,
# "تبصرة البلدان" : 6956,
# "الأنواء": 9093,
# "السياسة": 7227,
# "الرازي رسائل": 12834,
# "العلاج بالأعشاب": 10705,
# "القانون في الطب": 10706,
# "نشوار المحاضرة": 10275,
# "تاريخ الطبري": 9783,
# "فتوح الشام": 12045,

# One worker thread per enabled book.
num_threads = len(books)

In [18]:
def process_book(book_id, max_pages = 500):
    '''Download a whole book from shamela.ws as a flat list of paragraphs.

    The book's landing page is fetched once to read its title, then pages
    ``0 .. max_pages - 1`` are scraped one by one with ``scrape_page``.
    The last paragraph of every page is held back and glued (with a single
    space) onto the first paragraph of the next non-empty page, presumably
    because a paragraph can continue across a page break.

    Args:
        book_id: Shamela id of the book.
        max_pages: Number of page URLs to try. Overridden to 1500 for the
            book whose title is 'كتاب الحيوان'.

    Returns:
        A ``(title, sentences)`` tuple: the stripped text of the first
        ``<h1>`` on the landing page (or ``str(book_id)`` when there is no
        ``<h1>``) and the list of merged paragraphs.
    '''
    res = requests.get(f"https://shamela.ws/book/{book_id}", timeout=30)
    soup = BeautifulSoup(res.content, 'html.parser')

    # A landing page without a heading should not abort the scrape; fall back
    # to the id so the result still gets a unique key.
    heading = soup.select_one('h1')
    title = heading.get_text().strip() if heading is not None else str(book_id)

    if title == 'كتاب الحيوان':
        max_pages = 1500

    book_sentences = []
    carried_over = ""  # trailing paragraph of the previous non-empty page
    print(f"Started {book_id}")

    # NOTE(review): numbering starts at page 0 — confirm that shamela.ws page
    # numbers are not 1-based, otherwise the first request is wasted.
    for page in range(max_pages):
        sentences = scrape_page(book_id, page)
        if not sentences:
            continue
        # Only prepend when something was actually carried over; otherwise the
        # very first paragraph of the book would start with a stray space.
        if carried_over:
            sentences[0] = f"{carried_over} {sentences[0]}"
        carried_over = sentences.pop()
        book_sentences.extend(sentences)

    # Flush the final held-back paragraph, but never append an empty string
    # when no page yielded any text.
    if carried_over:
        book_sentences.append(carried_over)
    print(f"Finished {book_id}")

    return title, book_sentences

In [19]:
# Scrape every configured book concurrently and collect the results in
# submission order.
with ThreadPoolExecutor(max_workers=num_threads) as pool:
    pending = [pool.submit(process_book, book_id) for book_id in books.values()]
    # Each job resolves to a (title, sentences) pair, which dict.update
    # accepts directly as a key/value item.
    ar_dataset.update(job.result() for job in pending)

Started 26537
Started 22623
Started 10501
Started 802
Started 9198
Started 10428
Started 11055Started 1038

Started 23775
Finished 9198
Finished 22623
Finished 26537
Finished 10501
Finished 802
Finished 10428
Finished 11055
Finished 1038
Finished 23775


In [20]:
import json

# Persist the scraped corpus (book title -> list of sentences) to disk.
# The encoding is pinned to UTF-8 because ensure_ascii=False writes the Arabic
# characters verbatim, and the platform's default text encoding (notably on
# Windows) may be unable to encode them or may produce a file other tools
# cannot read back.
with open("divided.json", "w", encoding="utf-8") as outfile:
    json.dump(ar_dataset, outfile, ensure_ascii=False)

In [4]:
# Show one random sentence from the first book stored in ar_dataset.
first_title = list(ar_dataset)[0]
pd.DataFrame(ar_dataset[first_title]).sample()

NameError: name 'ar_dataset' is not defined

In [17]:
# `ar_sentences` is not defined by any cell in this notebook, so this cell
# fails on a fresh kernel. It is rebuilt here by flattening every book's
# sentence list in ar_dataset into one list.
# NOTE(review): presumably this matches what the missing cell did — confirm
# that no extra filtering was applied there.
ar_sentences = [sentence for book_sentences in ar_dataset.values() for sentence in book_sentences]
df = pd.DataFrame(ar_sentences, columns=['arabic'])

In [19]:
# Patterns are compiled once and reused for every cell of the frame.
guillemet_span = re.compile(r"«.*?»")
whitespace_run = re.compile(r"\s+")


def _strip_quotes_and_squeeze(text):
    """Remove every «...» span, then collapse each whitespace run to one space."""
    without_quotes = guillemet_span.sub('', text)
    return whitespace_run.sub(' ', without_quotes)


df = df.map(_strip_quotes_and_squeeze)

In [20]:
# Eyeball two random cleaned sentences as a raw array.
df.sample(n=2).to_numpy()

array([['أفادت بك الأيّام فرط تجارب ... كأنّك في فرق الزمان مشيب'],
       ['وكانت لمعتزلة تؤمن بالعقل، كما كان علم الكلام والجدل موضوع كل مجلس، وكل منتدى، فنزع الجاحظ نزعة إعتزالية ... وإذا كان اساتذة عصره قد طبّعوا الجاحظ بميزات فكرية وأدبية ولغوية وعلمية فريدة، فإن المعتزلة تركت آثارها العقلية عميقة في كتاباته، وطرائق تفكيره وتآليفه. وبات له نمط واضح ثابت، يستدلّ عليه من كتبه، ومن موضوعاته.']],
      dtype=object)

In [23]:
# Export the cleaned sentences; the row index is written too, which is why the
# reload step has to drop an 'Unnamed: 0' column.
df.to_excel(excel_writer="shamela_books.xlsx")

## Sharding

In [22]:
import pandas as pd
import numpy as np

# Fixed seed for the shuffle. The translated shard files are later paired with
# the in-memory shards purely by row position, so a re-run must reproduce the
# exact same row order or the Arabic/English pairs silently go out of sync.
# (Shard files translated from an earlier, unseeded shuffle will not match.)
SHUFFLE_SEED = 42

df = (
    pd.read_excel("shamela_books.xlsx")
    # The export wrote the row index as an extra, unnamed first column.
    .drop(columns=['Unnamed: 0'])
    # frac=1 returns every row, in random order.
    .sample(frac=1, random_state=SHUFFLE_SEED)
)

In [24]:
# to_excel does not create missing parent directories.
os.makedirs("shards", exist_ok=True)

# Split row *positions* into 5 near-equal groups and slice the frame with
# them. Passing the DataFrame itself to np.array_split produces the same
# groups but goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning.
row_groups = np.array_split(np.arange(len(df)), 5)

# Keep the re-indexed shards in `shards` so the files on disk and the
# in-memory frames carry the same 0..n-1 index.
shards = [df.iloc[rows].reset_index(drop=True) for rows in row_groups]

for i, shard in enumerate(shards, 1):
    shard.to_excel(f"shards/shard_{i}.xlsx")

  return bound(*args, **kwds)


## Translate

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import os

# Drive a Chrome browser through Google Translate's document upload page:
# upload one spreadsheet, wait, then click the "Download translation" button
# so the translated file lands in ./translations.
#
# NOTE(review): this cell uploads 'shamela_books.xlsx', while the later cells
# read one translated file per shard from translations/ — presumably the cell
# was re-run by hand with file_path pointing at each shards/shard_N.xlsx;
# confirm.

# Configure Chrome options
options = webdriver.ChromeOptions()
download_dir = os.path.join(os.getcwd(), 'translations')  # Use os.path.join for compatibility
prefs = {
    "download.default_directory": download_dir,  # Set default download directory
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,  # Ensure safe browsing is enabled to avoid download issues
}
options.add_experimental_option("prefs", prefs)

# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

try:
    # Open Google Translate (query string: sl=ar, tl=en, op=docs)
    driver.get('https://translate.google.com/?sl=ar&tl=en&op=docs')

    # Wait (up to 10 s) for the file input element to be present and upload the file
    file_input = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//input[@type="file"]'))
    )
    
    # send_keys on a file input takes the path of the file to upload.
    file_path = os.path.join(os.getcwd(), 'shamela_books.xlsx')
    file_input.send_keys(file_path)

    sleep(1)  # Allow time for the file to upload

    # NOTE(review): the click on the "Translate" button is disabled below, so
    # nothing in this cell starts the translation — presumably it is clicked
    # manually during the following sleep; confirm.
    # # Find and click the translate button
    # translate_button = WebDriverWait(driver, 10).until(
    #     EC.element_to_be_clickable((By.XPATH, '//button[div[text()="Translate"] or @aria-label="Translate"]'))
    # )
    # translate_button.click()

    # Fixed pause; together with the 10 s wait below the download button has
    # at most ~25 s to become clickable before a timeout is raised.
    sleep(15)  # Wait for the translation to complete

    # Find and click the download button
    download_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//button[div[text()="Download translation"] or @aria-label="Download translation"]'))
    )
    download_button.click()

    # Fixed pause; the browser is closed right after, finished or not.
    sleep(5)  # Wait for the download to complete

finally:
    # Quit the driver after completion (runs on errors too, so the browser
    # is closed either way)
    driver.quit()


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=129.0.6668.60)
Stacktrace:
	GetHandleVerifier [0x00007FF7B00FB125+29573]
	(No symbol) [0x00007FF7B006FF50]
	(No symbol) [0x00007FF7AFF2B6EA]
	(No symbol) [0x00007FF7AFEFFCD5]
	(No symbol) [0x00007FF7AFFAEF67]
	(No symbol) [0x00007FF7AFFC7FC1]
	(No symbol) [0x00007FF7AFFA70A3]
	(No symbol) [0x00007FF7AFF712DF]
	(No symbol) [0x00007FF7AFF72441]
	GetHandleVerifier [0x00007FF7B042C76D+3377613]
	GetHandleVerifier [0x00007FF7B0477B67+3685831]
	GetHandleVerifier [0x00007FF7B046CF8B+3641835]
	GetHandleVerifier [0x00007FF7B01BB2A6+816390]
	(No symbol) [0x00007FF7B007B25F]
	(No symbol) [0x00007FF7B0077084]
	(No symbol) [0x00007FF7B0077220]
	(No symbol) [0x00007FF7B006607F]
	BaseThreadInitThunk [0x00007FF8D2437374+20]
	RtlUserThreadStart [0x00007FF8D3C5CC91+33]


## Combine

In [26]:
import os

In [29]:
def _natural_key(file_name):
    """Sort key that compares digit runs numerically, so 'shard_10' follows 'shard_2'."""
    return [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", file_name)]


# os.listdir returns names in arbitrary order, but the translated shards are
# later matched to the original shards by list position. Sort the names so the
# order is deterministic, and ignore anything that is not a spreadsheet
# (e.g. unfinished browser downloads).
# NOTE(review): assumes each downloaded file keeps its shard number in the
# file name — confirm against the contents of translations/.
translated_files = sorted(
    (name for name in os.listdir("translations") if name.endswith(".xlsx")),
    key=_natural_key,
)
translated_shards = [pd.read_excel(f"translations/{file}") for file in translated_files]

In [33]:
# Inspect the first translated shard; the translated text is read from its
# "Arabic" column in the merge step.
translated_shards[0]

Unnamed: 0.1,Unnamed: 0,Arabic
0,0,[Our answer may be in a way that makes the so...
1,1,Some Persian scholars said: Jam continued to ...
2,2,The servant said to him: If nothing would bene...
3,3,120) They call the two months of summer in wh...
4,4,"O Moses, you call upon God and do not ask Him..."
...,...,...
5055,5055,Saeed bin Al-Musayyab did not see any harm in...
5056,5056,"Unfortunately, the book has not reached us in ..."
5057,5057,[This is what we wanted to mention in “Divine...
5058,5058,I said: I still have one need.


In [38]:
# The Arabic shards and their translations are paired purely by position, so
# any mismatch in count or length would silently yield wrong or missing
# sentence pairs. Fail loudly instead.
if len(translated_shards) != len(shards):
    raise ValueError(
        f"Expected {len(shards)} translated shards, found {len(translated_shards)}"
    )

for i, translated in enumerate(translated_shards):
    shard = shards[i].reset_index(drop=True)
    if len(translated) != len(shard):
        raise ValueError(
            f"Shard {i} has {len(shard)} rows but its translation has {len(translated)}"
        )
    # The translated file's text column is named "Arabic". Copy by position
    # (plain array) so the result cannot be affected by index alignment.
    shard['English'] = translated["Arabic"].to_numpy()
    shards[i] = shard

In [45]:
# Stack all shards back into one frame with a fresh 0..n-1 index.
df = pd.concat(shards, ignore_index=True)

In [55]:
# Spot-check one random row of the combined frame as a raw array.
df.sample(n=1).to_numpy()

array([['وتفكر في إخلالك بعلمك، فانك لا تعمل بما علمت منه فعلمك عليك حجة حينئذ، لقد كان أسلم لك لو لم تكن عالماً. واعلم ان الجاهل حينئذ أعقل منك وأحسن حالاً وأعذر، فليسقط عجبك بالكلية.',
        ' Think about your failure to use your knowledge, for you do not act upon what you have learned, so your knowledge is an argument against you at that time. It would have been safer for you if you were not knowledgeable. Know that the ignorant person at that time is more intelligent than you, better off, and more excused, so let your arrogance fall completely.']],
      dtype=object)

In [56]:
# Write the combined dataset one directory above the notebook.
df.to_csv(path_or_buf="./../shamela_dataset.csv")