In [2]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request


# Credit to this Stack Overflow answer: https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text

# Parent tags whose text is never shown to the reader. Despite the name,
# membership in this list means the text is *excluded* by tag_visible().
allowed_sections = [
    'style',
    'script',
    'head',
    'title',
    'meta',
    '[document]',
]


def tag_visible(element):
    """Tell whether a text node counts as visible page text.

    A node is hidden when its parent's tag name is listed in
    ``allowed_sections`` or when the node itself is an HTML comment;
    everything else is reported as visible.
    """
    hidden = element.parent.name in allowed_sections or isinstance(element, Comment)
    return not hidden


def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Args:
        body: HTML markup to parse.

    Returns:
        The text nodes accepted by ``tag_visible``, each stripped of
        surrounding whitespace and joined with single spaces. Whitespace-only
        nodes contribute an empty piece, so runs of spaces can appear.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True) and matches the find_all calls used elsewhere here.
    texts = soup.find_all(string=True)
    return u" ".join(t.strip() for t in texts if tag_visible(t))

In [3]:
def text_with_newlines(elem):
    """Flatten the descendants of ``elem`` into text, keeping line breaks.

    String descendants are copied through unchanged, a descendant whose
    ``name`` is ``br`` or ``p`` contributes one newline, and every other
    descendant is ignored.
    """
    pieces = []
    for node in elem.descendants:
        if isinstance(node, str):
            pieces.append(node)
            continue
        if node.name in ('br', 'p'):
            pieces.append('\n')
    return ''.join(pieces)


def parse_ghazal(url):
    """Download a ghazal page and return its text without the English translations.

    Args:
        url: Address of the ghazal page to fetch.

    Returns:
        The text of the page's ``div`` with class ``pMC``, with ``<br>`` and
        ``<p>`` tags rendered as newlines by ``text_with_newlines``.

    Raises:
        ValueError: If the page contains no ``div`` with class ``pMC``.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    poem_container = soup.find("div", {"class": "pMC"})
    if poem_container is None:
        # Without this check the failure surfaced as an AttributeError on None.
        raise ValueError(f"no div with class 'pMC' found at {url}")

    # Remove the English translations present on the page: mixing languages
    # would add noise and make it harder for the model to learn. These
    # Urdu-to-English pairs could later be a useful resource for preparing
    # machine-translation data.
    for translation in poem_container.find_all("div", {'class': 't'}):
        translation.decompose()

    return text_with_newlines(poem_container)

In [4]:
# Testing: parse a poet's home page to collect the links it lists
# (first tried on Ghalib's page)

def parse_webpage_at_given_scroll(html):
    """Collect the unique link targets listed on a content page.

    Args:
        html: Markup of the listing page.

    Returns:
        The distinct ``href`` values of the anchors inside the ``div`` with
        class ``contentListBody``, in first-seen order. An empty list is
        returned when the page has no such container.
    """
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.find("div", {"class": "contentListBody"})
    # A page without the listing container used to crash with AttributeError
    # on None; treat it as "no links" so the summary below is still printed.
    anchors = listing.find_all('a', href=True) if listing is not None else []

    titles = []
    seen = set()  # constant-time duplicate check; `titles` keeps the order
    for anchor in anchors:
        href = anchor['href']
        if href in seen:
            continue
        # Echo only every fifth new URL to keep the cell output short.
        if len(titles) % 5 == 0:
            print("Found the URL:", href)
        seen.add(href)
        titles.append(href)

    print('=============================')
    print('number of titles', len(titles))
    print('=============================')

    return titles

In [6]:
# language argument can be 'ur' (Urdu) or 'hi' (Hindi); 'en' fetches the page without a ?lang= parameter
import os
def read_and_write_web(author, language='ur', urls=None):
    """Download each ghazal in a URL list and save it under ``author/language/``.

    Args:
        author: Directory name under which the poems are stored.
        language: ``'en'`` fetches each URL unchanged; any other value (for
            example ``'ur'`` or ``'hi'``) is appended as ``?lang=<language>``.
        urls: Ghazal page URLs to download. When omitted, the notebook-level
            ``titles`` list is used, which is what this function always read.

    URLs whose output file already exists are skipped, and URLs that do not
    contain the ghazal prefix are reported and skipped.
    """
    if urls is None:
        urls = titles  # backward-compatible fallback to the notebook global

    ghazal_prefix = 'https://www.rekhta.org/ghazals/'
    author_lan = author + '/' + language
    os.makedirs(author_lan, exist_ok=True)

    for url in urls:
        if ghazal_prefix not in url:
            # Previously url.split(...)[1] raised IndexError and aborted the run.
            print(f"Skipping non-ghazal URL: {url}")
            continue
        name_poem = url.split(ghazal_prefix)[1]
        path_poem = author_lan + '/' + name_poem
        if os.path.exists(path_poem):
            continue

        url_for_lang = url if language == 'en' else url + '?lang=' + language
        # Download before creating the file: a failed fetch must not leave an
        # empty file behind, because existing files are skipped on later runs.
        ghazal = parse_ghazal(url_for_lang)
        # Explicit UTF-8 so Urdu/Hindi text does not depend on the platform's
        # default encoding; the context manager always closes the handle.
        with open(path_poem, "w", encoding="utf-8") as f:
            f.write(ghazal)

In [9]:
def get_all_poets(poets_url='https://www.rekhta.org/poets'):
    """Scrape poet names from the poets index page.

    Only the HTML returned by the single request made here is inspected.

    Args:
        poets_url: Address of the index page to fetch.

    Returns:
        The non-empty ``title`` attributes of the ``<a class="poetName">``
        links, or an empty list when the page could not be fetched.
    """
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(poets_url) as response:
            html = response.read()
    except Exception as e:
        # Best-effort: report the failure and hand back "no poets".
        print(f"Error fetching URL: {e}")
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # The poet links carry the class 'poetName' and the name in their 'title'
    # attribute. Links without a title would yield None, so they are skipped.
    poets = [
        link['title']
        for link in soup.find_all('a', class_='poetName')
        if link.get('title')
    ]

    print(f"Found {len(poets)} poets on the initial page load.")
    return poets

# Example usage: fetch the poets index once and print the names that were found.
poet_list = get_all_poets()
print(poet_list)

Found 0 poets on the initial page load.
[]


In [8]:
# NOTE(review): `authors` is reassigned to a hand-curated list in the scraping
# cell further down, so this call only exercises get_all_poets().
authors = get_all_poets()


Found 0 poets


In [None]:
## Scrape ghazals starting from each author's home page.
url_base = 'https://www.rekhta.org/poets/'

## TODO (later): iterate through the list of all poets on the index page
## instead of this hand-curated list.
authors = ['mirza-ghalib', 'allama-iqbal', 'faiz-ahmad-faiz', 'sahir-ludhianvi', 'meer-taqi-meer',
           'dagh-dehlvi', 'kaifi-azmi', 'gulzar', 'bahadur-shah-zafar', 'parveen-shakir',
           'jaan-nisar-akhtar', 'javed-akhtar', 'jigar-moradabadi', 'jaun-eliya',
           'ahmad-faraz', 'meer-anees', 'mohsin-naqvi', 'firaq-gorakhpuri', 'fahmida-riaz', 'wali-mohammad-wali',
           'waseem-barelvi', 'akbar-allahabadi', 'altaf-hussain-hali', 'ameer-khusrau', 'naji-shakir', 'naseer-turabi',
           'nazm-tabatabai', 'nida-fazli', 'noon-meem-rashid', 'habib-jalib']

for author in authors:
    url_home_page = url_base + author + '/ghazals'
    # Close the HTTP response instead of leaking one per author.
    with urllib.request.urlopen(url_home_page) as response:
        html = response.read()
    # read_and_write_web() reads the notebook-level `titles`, so this name
    # must be assigned before the calls below.
    titles = parse_webpage_at_given_scroll(html)
    # One pass per language; each is stored under its own author/<language> directory.
    for language in ('en', 'ur', 'hi'):
        read_and_write_web(author, language)