# Kurmanji web crawler (Kurawler)

## Şalom (salom.com.tr) — Judeo-Spanish news

In [1]:
import os
from bs4 import BeautifulSoup
import re
from urllib.request import Request, urlopen
import random
import requests

In [2]:
# Browser-style identification string used for page downloads.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

# Header mapping that download_url hands to urllib's Request.
headers = {
    'User-Agent': user_agent,
}

def url_exists(url, timeout=30):
    """Return True when a GET request for *url* is answered with HTTP 200.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            server would block the whole crawl indefinitely.

    Returns:
        True if the response status code is 200, otherwise False.

    Raises:
        requests.RequestException: on connection failures or timeouts; the
            error is left to the caller, exactly as before.
    """
    # Send the same User-Agent that download_url uses so the probe and the
    # real download are treated alike by the server. stream=True defers the
    # body download, since only the status code is needed here; the context
    # manager releases the connection afterwards.
    with requests.get(url, headers=headers, timeout=timeout, stream=True) as response:
        return response.status_code == 200

def download_url(url, path, overwrite=False, timeout=30):
    """Return the raw bytes of *url*, using *path* as an on-disk cache.

    Args:
        url: Page to fetch.
        path: File where the page bytes are stored and later re-read.
        overwrite: When True, fetch again even if *path* already exists.
        timeout: Seconds to wait for the server during the download.

    Returns:
        The page content as bytes, or the string 'END' when url_exists
        reports that the URL is not available (callers compare against
        this sentinel to stop paging).

    Raises:
        urllib.error.URLError: if the download itself fails.
    """
    # Cache hit: serve the stored copy without touching the network.
    if os.path.exists(path) and not overwrite:
        with open(path, 'rb') as f:
            return f.read()

    if not url_exists(url):
        print("Url doesn't exist:", url)
        return 'END'

    print("Downloading", url)
    request = Request(url, None, headers)
    # The context manager closes the HTTP response even if read() raises,
    # so sockets do not accumulate over a long crawl.
    with urlopen(request, timeout=timeout) as response:
        page_content = response.read()

    with open(path, 'wb') as f:
        f.write(page_content)
    return page_content

def parse_articles(page_content, base_url):
    """Extract (url, title) pairs from an article-listing page.

    Args:
        page_content: HTML of the listing page (bytes or str).
        base_url: Prefix that is concatenated in front of every article
            link found on the page.

    Returns:
        A list of (base_url + href, title) tuples, one per
        <h2 class="artic-title"> heading that carries a link. The title is
        the text of the first element following the heading tag.
    """
    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning, so results could
    # differ between machines.
    soup = BeautifulSoup(page_content, "html.parser")

    article_info = []
    for heading in soup.find_all("h2", {"class": "artic-title"}):
        anchor = heading.a
        # A heading without a link cannot yield an article URL; skip it
        # instead of crashing on None['href'].
        if anchor is None or not anchor.get('href'):
            continue

        title = heading.find_next().text
        article_info.append((base_url + anchor['href'], title))

    return article_info

In [3]:
#Download directory
downdir = "urls"
os.makedirs(downdir, exist_ok=True)

#Salom (salom.com.tr) structure: topic key -> URL slug of the listing
topics = {0:'haberler-17-judeoespanyol'}

# (url, title) pairs gathered from every topic; later cells iterate this list.
articles = []
total_article_count = 0

baseurl = 'https://www.salom.com.tr/'

#Download and process all
for topic_key, topic_slug in topics.items():
    topic_base_url = baseurl + topic_slug + "?page="
    topic_downdir = os.path.join(downdir, topic_slug)
    os.makedirs(topic_downdir, exist_ok=True)

    # Per-topic tally; the global list is extended rather than rebound so
    # that articles of earlier topics are not discarded.
    topic_articles = []

    page_no = 1

    while True:
        article_list_url = topic_base_url + str(page_no)

        article_list_html_path = os.path.join(topic_downdir, topic_slug + "_" + str(page_no) + ".html")

        print(article_list_url)
        try:
            article_list_html_content = download_url(article_list_url, article_list_html_path)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still interrupts the crawl.
            print("Error downloading", article_list_url, exc)
            break

        # download_url signals a missing page with the 'END' sentinel.
        if article_list_html_content == 'END':
            break

        page_articles = parse_articles(article_list_html_content, baseurl)

        # A page without articles marks the end of the listing.
        if not page_articles:
            break

        topic_articles.extend(page_articles)
        articles.extend(page_articles)

        total_article_count += len(page_articles)

        #get next page
        page_no += 1

    # page_no counts every page requested, including the one that ended the loop.
    print("# pages", page_no)
    print("# articles", len(topic_articles))

https://www.salom.com.tr/haberler-17-judeoespanyol?page=1
https://www.salom.com.tr/haberler-17-judeoespanyol?page=2
https://www.salom.com.tr/haberler-17-judeoespanyol?page=3
https://www.salom.com.tr/haberler-17-judeoespanyol?page=4
https://www.salom.com.tr/haberler-17-judeoespanyol?page=5
https://www.salom.com.tr/haberler-17-judeoespanyol?page=6
https://www.salom.com.tr/haberler-17-judeoespanyol?page=7
https://www.salom.com.tr/haberler-17-judeoespanyol?page=8
https://www.salom.com.tr/haberler-17-judeoespanyol?page=9
https://www.salom.com.tr/haberler-17-judeoespanyol?page=10
https://www.salom.com.tr/haberler-17-judeoespanyol?page=11
https://www.salom.com.tr/haberler-17-judeoespanyol?page=12
https://www.salom.com.tr/haberler-17-judeoespanyol?page=13
https://www.salom.com.tr/haberler-17-judeoespanyol?page=14
https://www.salom.com.tr/haberler-17-judeoespanyol?page=15
https://www.salom.com.tr/haberler-17-judeoespanyol?page=16
# pages 16
# articles 300


In [None]:
# Notebook display cell: show the (url, title) tuples collected by the listing crawl.
articles

In [10]:
def clean_title(text):
    """Return *text* with newlines and tabs removed and outer whitespace trimmed."""
    # Mapping a code point to None makes translate() drop that character.
    without_breaks = text.translate({ord("\n"): None, ord("\t"): None})
    return without_breaks.strip()
    
def clean_content_text(text):
    """Replace non-breaking spaces and tabs in *text* with plain spaces, then trim both ends."""
    # One translation table handles both substitutions in a single pass.
    to_plain_space = str.maketrans({u'\xa0': u' ', u'\t': u' '})
    return text.translate(to_plain_space).strip()

def parse_article_text(page_content, title=None):
    """Collect the text segments of one article page.

    Args:
        page_content: HTML of the article page (bytes or str).
        title: Article title placed first in the result. Optional, so the
            function can also be used on a page whose title is not known;
            when omitted the result starts directly with the paragraphs.

    Returns:
        A list of strings: the title (if given) followed by the stripped
        text of every <p> inside the div with class
        "col-md-12 mbot15 hicerikdty". If that div is missing, an error is
        printed and only the title (if any) is returned.
    """
    # Explicit parser keeps results independent of which optional parsers
    # are installed and silences Beautiful Soup's parser-guessing warning.
    soup = BeautifulSoup(page_content, "html.parser")

    text_segments = [] if title is None else [title]

    result = soup.find("div", {"class":"col-md-12 mbot15 hicerikdty"})
    if result is None:
        print("ERROR: No text found")
        return text_segments

    text_segments.extend(pelem.text.strip() for pelem in result.find_all("p"))
    return text_segments

In [12]:
#Download directory
pagedowndir = "articles/page"
os.makedirs(pagedowndir, exist_ok=True)

textdir = "articles/text"
os.makedirs(textdir, exist_ok=True)

# Title and paragraph strings of every processed article, in crawl order.
all_text_segments = []

for article_no, (article_link, article_title) in enumerate(articles, start=1):
    # Last path component of the link, without its ".html" extension.
    article_id = article_link.split("/")[-1]
    if article_id.endswith(".html"):
        article_id = article_id[:-5]

    file_stem = "{:02d}".format(article_no) + "-" + str(article_id)
    html_path = os.path.join(pagedowndir, file_stem + ".html")
    txt_path = os.path.join(textdir, file_stem + ".txt")

    print(article_link)
    print(article_id)
    print(html_path)

    try:
        article_html_content = download_url(article_link, html_path)
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print("Can't process...", article_link, exc)
        continue

    # download_url returns the 'END' sentinel for unavailable URLs; there is
    # no HTML to parse in that case.
    if article_html_content == 'END':
        continue

    article_text_segments = parse_article_text(article_html_content, article_title)

    all_text_segments.extend(article_text_segments)

    # Explicit UTF-8 so the output does not depend on the platform's
    # default encoding.
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(article_text_segments))

https://www.salom.com.tr/haber-120892-pas_en_el_payis_pas_en_el_mundo.html
haber-120892-pas_en_el_payis_pas_en_el_mundo
articles/page/01-haber-120892-pas_en_el_payis_pas_en_el_mundo.html
https://www.salom.com.tr/haber-120891-un__viaje__a__hamburg.html
haber-120891-un__viaje__a__hamburg
articles/page/02-haber-120891-un__viaje__a__hamburg.html
Downloading https://www.salom.com.tr/haber-120891-un__viaje__a__hamburg.html
https://www.salom.com.tr/haber-120890-bo.html
haber-120890-bo
articles/page/03-haber-120890-bo.html
Downloading https://www.salom.com.tr/haber-120890-bo.html
https://www.salom.com.tr/haber-120889-prediksiones_por_el_mijor_filmo_en_los_premios_oscar_2022.html
haber-120889-prediksiones_por_el_mijor_filmo_en_los_premios_oscar_2022
articles/page/04-haber-120889-prediksiones_por_el_mijor_filmo_en_los_premios_oscar_2022.html
Downloading https://www.salom.com.tr/haber-120889-prediksiones_por_el_mijor_filmo_en_los_premios_oscar_2022.html
https://www.salom.com.tr/haber-120888-el_pu

Downloading https://www.salom.com.tr/haber-120609-auguros_de_hanuka_de_la_ambasada_de_espanya.html
https://www.salom.com.tr/haber-120608-mizmor_shir_hanukat_abayit_ledavid.html
haber-120608-mizmor_shir_hanukat_abayit_ledavid
articles/page/34-haber-120608-mizmor_shir_hanukat_abayit_ledavid.html
Downloading https://www.salom.com.tr/haber-120608-mizmor_shir_hanukat_abayit_ledavid.html
https://www.salom.com.tr/haber-120607-lo_ke_mos_ambeza_la_natura.html
haber-120607-lo_ke_mos_ambeza_la_natura
articles/page/35-haber-120607-lo_ke_mos_ambeza_la_natura.html
Downloading https://www.salom.com.tr/haber-120607-lo_ke_mos_ambeza_la_natura.html
https://www.salom.com.tr/haber-120606-west_side_story_i_hanuka.html
haber-120606-west_side_story_i_hanuka
articles/page/36-haber-120606-west_side_story_i_hanuka.html
Downloading https://www.salom.com.tr/haber-120606-west_side_story_i_hanuka.html
https://www.salom.com.tr/haber-120605-fiesta_de_hanuka_en_balat.html
haber-120605-fiesta_de_hanuka_en_balat
article

Downloading https://www.salom.com.tr/haber-120248-toledot.html
https://www.salom.com.tr/haber-120247-una_data_muy_importante_en_la_istorya_del_antisemitizmo_un_dokumento_kontra_el_antisemitizmo.html
haber-120247-una_data_muy_importante_en_la_istorya_del_antisemitizmo_un_dokumento_kontra_el_antisemitizmo
articles/page/66-haber-120247-una_data_muy_importante_en_la_istorya_del_antisemitizmo_un_dokumento_kontra_el_antisemitizmo.html
Downloading https://www.salom.com.tr/haber-120247-una_data_muy_importante_en_la_istorya_del_antisemitizmo_un_dokumento_kontra_el_antisemitizmo.html
https://www.salom.com.tr/haber-120246-la__salud.html
haber-120246-la__salud
articles/page/67-haber-120246-la__salud.html
Downloading https://www.salom.com.tr/haber-120246-la__salud.html
https://www.salom.com.tr/haber-120245-mueva_teknika_para_kurar_una_grande_depresion.html
haber-120245-mueva_teknika_para_kurar_una_grande_depresion
articles/page/68-haber-120245-mueva_teknika_para_kurar_una_grande_depresion.html
Down

Downloading https://www.salom.com.tr/haber-119874-un_muevo_sentro_de_bushkedades_i_una_bibloteka_en_el_muzeo_djudio.html
https://www.salom.com.tr/haber-119873-sukot_en_ankara.html
haber-119873-sukot_en_ankara
articles/page/96-haber-119873-sukot_en_ankara.html
Downloading https://www.salom.com.tr/haber-119873-sukot_en_ankara.html
https://www.salom.com.tr/haber-119872-bereshit.html
haber-119872-bereshit
articles/page/97-haber-119872-bereshit.html
Downloading https://www.salom.com.tr/haber-119872-bereshit.html
https://www.salom.com.tr/haber-119871-cem_mansur_i_los_konsertos_de_crr.html
haber-119871-cem_mansur_i_los_konsertos_de_crr
articles/page/98-haber-119871-cem_mansur_i_los_konsertos_de_crr.html
Downloading https://www.salom.com.tr/haber-119871-cem_mansur_i_los_konsertos_de_crr.html
https://www.salom.com.tr/haber-119870-una_boda_eksepsyonala.html
haber-119870-una_boda_eksepsyonala
articles/page/99-haber-119870-una_boda_eksepsyonala.html
Downloading https://www.salom.com.tr/haber-11987

Downloading https://www.salom.com.tr/haber-119608-el_umoristo_shalom_aleichem.html
https://www.salom.com.tr/haber-119548-ke_dia_de_alhad.html
haber-119548-ke_dia_de_alhad
articles/page/129-haber-119548-ke_dia_de_alhad.html
Downloading https://www.salom.com.tr/haber-119548-ke_dia_de_alhad.html
https://www.salom.com.tr/haber-119547-ki_tavo.html
haber-119547-ki_tavo
articles/page/130-haber-119547-ki_tavo.html
Downloading https://www.salom.com.tr/haber-119547-ki_tavo.html
https://www.salom.com.tr/haber-119546-no_los_djudios_no_dirijan_no_reynan_i_no_governan_el_mundo.html
haber-119546-no_los_djudios_no_dirijan_no_reynan_i_no_governan_el_mundo
articles/page/131-haber-119546-no_los_djudios_no_dirijan_no_reynan_i_no_governan_el_mundo.html
Downloading https://www.salom.com.tr/haber-119546-no_los_djudios_no_dirijan_no_reynan_i_no_governan_el_mundo.html
https://www.salom.com.tr/haber-119545-el_anyo_mas_negro_del_mundo_536.html
haber-119545-el_anyo_mas_negro_del_mundo_536
articles/page/132-haber-

Downloading https://www.salom.com.tr/haber-119265-un__enverano__sin__agua__i__sin__mar.html
https://www.salom.com.tr/haber-119264-ekev.html
haber-119264-ekev
articles/page/160-haber-119264-ekev.html
Downloading https://www.salom.com.tr/haber-119264-ekev.html
https://www.salom.com.tr/haber-119262-saver__es__bueno___j_e_s_s_i_c_a___m_e_i_r.html
haber-119262-saver__es__bueno___j_e_s_s_i_c_a___m_e_i_r
articles/page/161-haber-119262-saver__es__bueno___j_e_s_s_i_c_a___m_e_i_r.html
Downloading https://www.salom.com.tr/haber-119262-saver__es__bueno___j_e_s_s_i_c_a___m_e_i_r.html
https://www.salom.com.tr/haber-119261-un_filantropisto_famozo_sir_moses_montefiore.html
haber-119261-un_filantropisto_famozo_sir_moses_montefiore
articles/page/162-haber-119261-un_filantropisto_famozo_sir_moses_montefiore.html
Downloading https://www.salom.com.tr/haber-119261-un_filantropisto_famozo_sir_moses_montefiore.html
https://www.salom.com.tr/haber-119260-biontech.html
haber-119260-biontech
articles/page/163-hab

Downloading https://www.salom.com.tr/haber-118964-rey_de_shabat.html
https://www.salom.com.tr/haber-118963-i_la_moda_kontiene_elementos_de_arte__tuvo_enteres_en_el_arte_tambien_i_pint_tabls.html
haber-118963-i_la_moda_kontiene_elementos_de_arte__tuvo_enteres_en_el_arte_tambien_i_pint_tabls
articles/page/192-haber-118963-i_la_moda_kontiene_elementos_de_arte__tuvo_enteres_en_el_arte_tambien_i_pint_tabls.html
Downloading https://www.salom.com.tr/haber-118963-i_la_moda_kontiene_elementos_de_arte__tuvo_enteres_en_el_arte_tambien_i_pint_tabls.html
https://www.salom.com.tr/haber-118962-superstisyones.html
haber-118962-superstisyones
articles/page/193-haber-118962-superstisyones.html
Downloading https://www.salom.com.tr/haber-118962-superstisyones.html
https://www.salom.com.tr/haber-118961-komo_pudyeron_los_sefaradis_konservar_esta_lingua_mas_de_kinyentos_anyos.html
haber-118961-komo_pudyeron_los_sefaradis_konservar_esta_lingua_mas_de_kinyentos_anyos
articles/page/194-haber-118961-komo_pudyero

https://www.salom.com.tr/haber-118574-naso.html
haber-118574-naso
articles/page/222-haber-118574-naso.html
Downloading https://www.salom.com.tr/haber-118574-naso.html
https://www.salom.com.tr/haber-118573-einstein_i_el_djudaizmo.html
haber-118573-einstein_i_el_djudaizmo
articles/page/223-haber-118573-einstein_i_el_djudaizmo.html
Downloading https://www.salom.com.tr/haber-118573-einstein_i_el_djudaizmo.html
https://www.salom.com.tr/haber-118572-el__senso_i_la__emportansa__del__dia_de__las__madres.html
haber-118572-el__senso_i_la__emportansa__del__dia_de__las__madres
articles/page/224-haber-118572-el__senso_i_la__emportansa__del__dia_de__las__madres.html
Downloading https://www.salom.com.tr/haber-118572-el__senso_i_la__emportansa__del__dia_de__las__madres.html
https://www.salom.com.tr/haber-118457-komidas_djudias_de_edirne.html
haber-118457-komidas_djudias_de_edirne
articles/page/225-haber-118457-komidas_djudias_de_edirne.html
Downloading https://www.salom.com.tr/haber-118457-komidas_dju

Downloading https://www.salom.com.tr/haber-118204-trenta_sigundos_sovre_la_istoria_del_djudeoespanyol.html
https://www.salom.com.tr/haber-118203-el_problem_del_get_en_israel.html
haber-118203-el_problem_del_get_en_israel
articles/page/253-haber-118203-el_problem_del_get_en_israel.html
Downloading https://www.salom.com.tr/haber-118203-el_problem_del_get_en_israel.html
https://www.salom.com.tr/haber-118142-danube__tuna_boyunca_claudio_magris_estoria_de_un_rio__bir_nehrin_hikyesi_a_lo_largo_del_danubio.html
haber-118142-danube__tuna_boyunca_claudio_magris_estoria_de_un_rio__bir_nehrin_hikyesi_a_lo_largo_del_danubio
articles/page/254-haber-118142-danube__tuna_boyunca_claudio_magris_estoria_de_un_rio__bir_nehrin_hikyesi_a_lo_largo_del_danubio.html
Downloading https://www.salom.com.tr/haber-118142-danube__tuna_boyunca_claudio_magris_estoria_de_un_rio__bir_nehrin_hikyesi_a_lo_largo_del_danubio.html
https://www.salom.com.tr/haber-118141-un_otro_artikolo_sovre_la_corona.html
haber-118141-un_otr

Downloading https://www.salom.com.tr/haber-117894-pesah_aserkandose_una_istorya_verdadera_sovre_una_agada.html
https://www.salom.com.tr/haber-117893-madam_fIlIba.html
haber-117893-madam_fIlIba
articles/page/280-haber-117893-madam_fIlIba.html
Downloading https://www.salom.com.tr/haber-117893-madam_fIlIba.html
https://www.salom.com.tr/haber-117892-un_torniko_en_los_rekuerdos__2.html
haber-117892-un_torniko_en_los_rekuerdos__2
articles/page/281-haber-117892-un_torniko_en_los_rekuerdos__2.html
Downloading https://www.salom.com.tr/haber-117892-un_torniko_en_los_rekuerdos__2.html
https://www.salom.com.tr/haber-117838-la_ventana_de_vuestra_ermana.html
haber-117838-la_ventana_de_vuestra_ermana
articles/page/282-haber-117838-la_ventana_de_vuestra_ermana.html
Downloading https://www.salom.com.tr/haber-117838-la_ventana_de_vuestra_ermana.html
https://www.salom.com.tr/haber-117837-vayakelpekude.html
haber-117837-vayakelpekude
articles/page/283-haber-117837-vayakelpekude.html
Downloading https://ww

In [None]:
#Write out a text corpus with all segments
text_corpus_path = "Salom-ladino-2022-01.txt"
# Explicit UTF-8 so the corpus does not depend on the platform's default encoding.
with open(text_corpus_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(all_text_segments))

# NOTE(review): ad-hoc single-page check against anfkurdi.com, while the rest
# of this notebook targets salom.com.tr. parse_article_text looks for a
# salom-specific div class, so this presumably prints "ERROR: No text found"
# — confirm whether this cell is still needed.
article_html_content = download_url("https://anfkurdi.com/kurdistan/ji-kcdk-e-ye-li-diji-qeyuman-banga-berxwedane-68448",
                                    "article.html", overwrite=True)

# No listing title is available for this hand-picked URL; pass an empty one
# so the call matches parse_article_text's (page_content, title) signature.
article_text = parse_article_text(article_html_content, "")
